From e820e59cbd430cd308a4734747b381996c573996 Mon Sep 17 00:00:00 2001
From: Edward Irby <e.irby@pm.me>
Date: Tue, 10 Mar 2026 21:22:21 -0700
Subject: [PATCH 1/7] feat: replace pipeline with agent-build trial tooling

Full replacement of src/ and bin/ with trial runner, schemas,
and CLI utilities ported from agent-build. Strips BP-specific
types (no RISK_TAG, SelectionBid, TOOL_STATUS). Inlines
simplified TrajectoryStepSchema with 3 step types.

BREAKING: New CLI entry point, new exports structure.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .agents/skills/compare-trials/SKILL.md        |   96 ++
 .../compare-trials/references/bootstrap.ts    |   58 +
 .../compare-trials/references/compare.ts      |  195 +++
 .agents/skills/trial-adapters/SKILL.md        |  258 ++++
 .agents/skills/trial-runner/SKILL.md          |  182 +++
 bin/cli.ts                                    |  160 ---
 bin/tests/cli.spec.ts                         |  529 --------
 bun.lock                                      |    5 -
 package.json                                  |   23 +-
 src/cli.ts                                    |   22 +
 src/cli.utils.ts                              |   99 ++
 src/commands.ts                               |   33 -
 src/commands/balance.ts                       |  245 ----
 src/commands/calibrate.ts                     |  304 -----
 src/commands/capture.ts                       |  391 ------
 src/commands/execution.ts                     |  245 ----
 src/commands/summarize.ts                     |  226 ----
 src/commands/tests/balance-helpers.spec.ts    |  279 -----
 src/commands/tests/calibrate-helpers.spec.ts  |  226 ----
 src/commands/tests/capture-cli.spec.ts        |  274 -----
 src/commands/tests/capture-helpers.spec.ts    |  634 ----------
 src/commands/tests/summarize-helpers.spec.ts  |  339 ------
 .../tests/trials-calculations.spec.ts         |  209 ----
 src/commands/tests/trials-cli.spec.ts         |  215 ----
 src/commands/trials.ts                        |  377 ------
 src/commands/validate-refs.ts                 |  171 ---
 src/core.ts                                   |   46 -
 src/core/core.ts                              |   51 -
 src/core/loading.ts                           |  207 ----
 src/core/output.ts                            |  120 --
 src/core/streaming.ts                         |  172 ---
 src/core/tests/core.spec.ts                   |  310 -----
 src/core/tests/streaming.spec.ts              |  399 ------
 src/core/tests/worker-pool.spec.ts            |  377 ------
 src/core/trajectory.ts                        |  172 ---
 src/core/worker-pool.ts                       |  220 ----
 src/graders.ts                                |   39 -
 src/graders/bootstrap.ts                      |  135 ---
 src/graders/compare-statistical.ts            |  115 --
 src/graders/compare-weighted.ts               |  112 --
 src/graders/tests/bootstrap.spec.ts           |  169 ---
 src/graders/tests/compare-graders.spec.ts     |  293 -----
 .../tests/trials-compare-graders.spec.ts      |  358 ------
 src/graders/trials-compare-statistical.ts     |  183 ---
 src/graders/trials-compare-weighted.ts        |  128 --
 src/harness.ts                                |   46 -
 src/headless.ts                               |   72 --
 src/headless/headless-cli.ts                  |  428 -------
 src/headless/headless-history-builder.ts      |  141 ---
 src/headless/headless-output-parser.ts        |  388 ------
 src/headless/headless-session-manager.ts      |  590 ---------
 src/headless/headless.schemas.ts              |  321 -----
 src/headless/headless.types.ts                |   19 -
 .../tests/fixtures/claude-headless.json       |   40 -
 .../tests/fixtures/gemini-headless.json       |   37 -
 src/headless/tests/headless.spec.ts           |  873 --------------
 src/integration_tests/claude.spec.ts          |  157 ---
 src/integration_tests/gemini.spec.ts          |  139 ---
 src/pipeline.ts                               |   34 -
 src/pipeline/compare-format-detection.ts      |  100 --
 src/pipeline/compare-trials.ts                |  800 ------------
 src/pipeline/compare-utils.ts                 |   85 --
 src/pipeline/compare.ts                       |  818 -------------
 src/pipeline/extract.ts                       |  241 ----
 src/pipeline/format.ts                        |  291 -----
 src/pipeline/grade.ts                         |  175 ---
 src/pipeline/pipeline.ts                      |   42 -
 src/pipeline/pipeline.types.ts                |  325 -----
 src/pipeline/run.ts                           |  414 -------
 .../tests/compare-format-detection.spec.ts    |  142 ---
 .../tests/compare-statistical.spec.ts         |  289 -----
 src/pipeline/tests/compare-trials.spec.ts     |  592 ---------
 src/pipeline/tests/compare-utils.spec.ts      |  128 --
 src/pipeline/tests/pipeline.spec.ts           |  356 ------
 src/schemas.ts                                |  134 --
 src/schemas/constants.ts                      |   94 --
 src/schemas/grader-loader.ts                  |  203 ----
 src/schemas/schemas-cli.ts                    |  227 ----
 src/schemas/schemas.ts                        | 1073 -----------------
 src/schemas/tests/constants.spec.ts           |  121 --
 .../tests/fixtures/grader-bad-module.ts       |    5 -
 .../tests/fixtures/grader-exec-fail.py        |    9 -
 .../tests/fixtures/grader-exec-invalid.py     |    6 -
 src/schemas/tests/fixtures/grader-exec.py     |   29 -
 src/schemas/tests/fixtures/grader-git.ts      |  116 --
 src/schemas/tests/fixtures/grader-module.ts   |   14 -
 src/schemas/tests/grader-git.spec.ts          |  222 ----
 src/schemas/tests/grader-loader.spec.ts       |  153 ---
 src/schemas/tests/schemas-cli.spec.ts         |  142 ---
 src/schemas/tests/schemas.spec.ts             |  606 ----------
 src/tests/trial.spec.ts                       |  723 +++++++++++
 src/trial.constants.ts                        |   11 +
 src/trial.schemas.ts                          |  288 +++++
 src/trial.ts                                  |  319 +++++
 src/trial.utils.ts                            |  444 +++++++
 tsconfig.json                                 |    2 +-
 96 files changed, 2703 insertions(+), 19722 deletions(-)
 create mode 100644 .agents/skills/compare-trials/SKILL.md
 create mode 100644 .agents/skills/compare-trials/references/bootstrap.ts
 create mode 100644 .agents/skills/compare-trials/references/compare.ts
 create mode 100644 .agents/skills/trial-adapters/SKILL.md
 create mode 100644 .agents/skills/trial-runner/SKILL.md
 delete mode 100644 bin/cli.ts
 delete mode 100644 bin/tests/cli.spec.ts
 create mode 100644 src/cli.ts
 create mode 100644 src/cli.utils.ts
 delete mode 100644 src/commands.ts
 delete mode 100644 src/commands/balance.ts
 delete mode 100644 src/commands/calibrate.ts
 delete mode 100644 src/commands/capture.ts
 delete mode 100644 src/commands/execution.ts
 delete mode 100644 src/commands/summarize.ts
 delete mode 100644 src/commands/tests/balance-helpers.spec.ts
 delete mode 100644 src/commands/tests/calibrate-helpers.spec.ts
 delete mode 100644 src/commands/tests/capture-cli.spec.ts
 delete mode 100644 src/commands/tests/capture-helpers.spec.ts
 delete mode 100644 src/commands/tests/summarize-helpers.spec.ts
 delete mode 100644 src/commands/tests/trials-calculations.spec.ts
 delete mode 100644 src/commands/tests/trials-cli.spec.ts
 delete mode 100644 src/commands/trials.ts
 delete mode 100644 src/commands/validate-refs.ts
 delete mode 100644 src/core.ts
 delete mode 100644 src/core/core.ts
 delete mode 100644 src/core/loading.ts
 delete mode 100644 src/core/output.ts
 delete mode 100644 src/core/streaming.ts
 delete mode 100644 src/core/tests/core.spec.ts
 delete mode 100644 src/core/tests/streaming.spec.ts
 delete mode 100644 src/core/tests/worker-pool.spec.ts
 delete mode 100644 src/core/trajectory.ts
 delete mode 100644 src/core/worker-pool.ts
 delete mode 100644 src/graders.ts
 delete mode 100644 src/graders/bootstrap.ts
 delete mode 100644 src/graders/compare-statistical.ts
 delete mode 100644 src/graders/compare-weighted.ts
 delete mode 100644 src/graders/tests/bootstrap.spec.ts
 delete mode 100644 src/graders/tests/compare-graders.spec.ts
 delete mode 100644 src/graders/tests/trials-compare-graders.spec.ts
 delete mode 100644 src/graders/trials-compare-statistical.ts
 delete mode 100644 src/graders/trials-compare-weighted.ts
 delete mode 100644 src/harness.ts
 delete mode 100644 src/headless.ts
 delete mode 100644 src/headless/headless-cli.ts
 delete mode 100644 src/headless/headless-history-builder.ts
 delete mode 100644 src/headless/headless-output-parser.ts
 delete mode 100644 src/headless/headless-session-manager.ts
 delete mode 100644 src/headless/headless.schemas.ts
 delete mode 100644 src/headless/headless.types.ts
 delete mode 100644 src/headless/tests/fixtures/claude-headless.json
 delete mode 100644 src/headless/tests/fixtures/gemini-headless.json
 delete mode 100644 src/headless/tests/headless.spec.ts
 delete mode 100644 src/integration_tests/claude.spec.ts
 delete mode 100644 src/integration_tests/gemini.spec.ts
 delete mode 100644 src/pipeline.ts
 delete mode 100644 src/pipeline/compare-format-detection.ts
 delete mode 100644 src/pipeline/compare-trials.ts
 delete mode 100644 src/pipeline/compare-utils.ts
 delete mode 100644 src/pipeline/compare.ts
 delete mode 100644 src/pipeline/extract.ts
 delete mode 100644 src/pipeline/format.ts
 delete mode 100644 src/pipeline/grade.ts
 delete mode 100644 src/pipeline/pipeline.ts
 delete mode 100644 src/pipeline/pipeline.types.ts
 delete mode 100644 src/pipeline/run.ts
 delete mode 100644 src/pipeline/tests/compare-format-detection.spec.ts
 delete mode 100644 src/pipeline/tests/compare-statistical.spec.ts
 delete mode 100644 src/pipeline/tests/compare-trials.spec.ts
 delete mode 100644 src/pipeline/tests/compare-utils.spec.ts
 delete mode 100644 src/pipeline/tests/pipeline.spec.ts
 delete mode 100644 src/schemas.ts
 delete mode 100644 src/schemas/constants.ts
 delete mode 100644 src/schemas/grader-loader.ts
 delete mode 100644 src/schemas/schemas-cli.ts
 delete mode 100644 src/schemas/schemas.ts
 delete mode 100644 src/schemas/tests/constants.spec.ts
 delete mode 100644 src/schemas/tests/fixtures/grader-bad-module.ts
 delete mode 100755 src/schemas/tests/fixtures/grader-exec-fail.py
 delete mode 100755 src/schemas/tests/fixtures/grader-exec-invalid.py
 delete mode 100755 src/schemas/tests/fixtures/grader-exec.py
 delete mode 100644 src/schemas/tests/fixtures/grader-git.ts
 delete mode 100644 src/schemas/tests/fixtures/grader-module.ts
 delete mode 100644 src/schemas/tests/grader-git.spec.ts
 delete mode 100644 src/schemas/tests/grader-loader.spec.ts
 delete mode 100644 src/schemas/tests/schemas-cli.spec.ts
 delete mode 100644 src/schemas/tests/schemas.spec.ts
 create mode 100644 src/tests/trial.spec.ts
 create mode 100644 src/trial.constants.ts
 create mode 100644 src/trial.schemas.ts
 create mode 100644 src/trial.ts
 create mode 100644 src/trial.utils.ts

diff --git a/.agents/skills/compare-trials/SKILL.md b/.agents/skills/compare-trials/SKILL.md
new file mode 100644
index 0000000..9ac63be
--- /dev/null
+++ b/.agents/skills/compare-trials/SKILL.md
@@ -0,0 +1,96 @@
+---
+name: compare-trials
+description: Compare trial results from the trial runner. Teaches agents to write comparison and analysis scripts against TrialResult JSONL files for pass@k reliability analysis, bootstrap confidence intervals, and flakiness detection.
+license: ISC
+---
+
+# Compare Trials
+
+## Purpose
+
+This skill teaches agents how to analyze and compare `TrialResult` JSONL output from the `trial` runner. Instead of a built-in comparison command, agents write scripts directly — the analysis is domain-specific and benefits from code-level flexibility.
+
+**Use this when:**
+- Comparing trial results from multiple adapter runs
+- Computing statistical metrics (bootstrap confidence intervals, effect sizes)
+- Analyzing flakiness (pass@k vs pass^k gap)
+- Generating comparison reports
+
+## TrialResult Schema
+
+Each line in a trial JSONL file matches this shape:
+
+```typescript
+type TrialResult = {
+  id: string                           // Prompt identifier
+  input: string | string[]             // Original prompt
+  hint?: string                        // Grader context
+  k: number                            // Trials per prompt
+  passRate?: number                    // passes / k
+  passAtK?: number                     // 1 - (1 - passRate)^k
+  passExpK?: number                    // passRate^k
+  trials: TrialEntry[]                 // Individual trial data
+  metadata?: Record<string, unknown>   // Custom metadata
+}
+
+type TrialEntry = {
+  trialNum: number
+  output: string
+  trajectory?: TrajectoryStep[]
+  duration: number                     // Wall-clock ms
+  timing?: { total?: number; inputTokens?: number; outputTokens?: number }
+  exitCode?: number | null
+  timedOut?: boolean
+  pass?: boolean
+  score?: number
+  reasoning?: string
+  outcome?: Record<string, unknown>
+}
+```
+
+## Key Metrics
+
+| Metric | Formula | Meaning |
+|--------|---------|---------|
+| `passRate` | passes / k | Raw success rate |
+| `passAtK` | 1 - (1 - passRate)^k | Capability — can it solve this at all? |
+| `passExpK` | passRate^k | Reliability — does it solve this every time? |
+| `flakiness` | passAtK - passExpK | Gap between capability and reliability |
+
+## How to Compare
+
+1. Load two (or more) JSONL files
+2. Index results by prompt `id`
+3. Compute aggregate metrics per run
+4. Bootstrap for confidence intervals
+5. Output comparison as structured JSON
+
+## Reference Implementation
+
+**[compare.ts](references/compare.ts)** — Complete comparison script
+
+Takes two JSONL file paths as arguments, loads and indexes them, computes per-run and per-prompt metrics, runs bootstrap resampling for confidence intervals, and outputs a structured comparison report.
+
+**[bootstrap.ts](references/bootstrap.ts)** — Bootstrap sampling utility
+
+Reusable bootstrap function for computing confidence intervals on any metric. Used by the comparison script for reliable statistical comparisons.
+
+## Usage Pattern
+
+```bash
+# Agent writes and runs a comparison script
+bun run compare.ts baseline.jsonl challenger.jsonl > report.json
+
+# Or analyze in-process via the library API (TypeScript, not a shell command):
+#   const results = await runTrial({ adapter, prompts, k: 10, grader })
+#   // Agent analyzes results array directly — no file round-trip needed
+```
+
+## Key Points for Agents
+
+- `TrialResult` files are JSONL (one JSON object per line)
+- Always match results by `id` — prompts may arrive in different order
+- Bootstrap needs at least 30 samples for reliable CIs (use 1000+ resamples)
+- Flakiness = passAtK - passExpK measures inconsistency
+- Token usage is optional — only present if the adapter reports it
+- Comparison is agent-written code, not a built-in command
diff --git a/.agents/skills/compare-trials/references/bootstrap.ts b/.agents/skills/compare-trials/references/bootstrap.ts
new file mode 100644
index 0000000..dbc9a19
--- /dev/null
+++ b/.agents/skills/compare-trials/references/bootstrap.ts
@@ -0,0 +1,58 @@
+/**
+ * Bootstrap sampling utility for confidence intervals.
+ *
+ * @remarks
+ * Reusable bootstrap function — computes confidence intervals for any
+ * numeric metric by resampling with replacement.
+ */
+
+/**
+ * Compute a percentile-bootstrap confidence interval for a metric.
+ *
+ * @param values - Array of observed metric values (resampled with replacement)
+ * @param statFn - Function to compute the statistic on each resample (default: mean)
+ * @param options - `resamples` (default 1000) and `confidence` in [0, 1] (default 0.95)
+ * @returns [lower, upper] bounds; [0, 0] for empty input, [v, v] for a single value
+ */
+export const bootstrap = (
+  values: number[],
+  statFn: (samples: number[]) => number = mean,
+  options: { resamples?: number; confidence?: number } = {},
+): [number, number] => {
+  const { resamples = 1000, confidence = 0.95 } = options
+
+  if (values.length === 0) return [0, 0]
+  if (values.length === 1) return [values[0] ?? 0, values[0] ?? 0]
+
+  const stats: number[] = []
+
+  for (let i = 0; i < resamples; i++) {
+    const sample = Array.from({ length: values.length }, () => {
+      const v = values[Math.floor(Math.random() * values.length)]
+      return v ?? 0
+    })
+    stats.push(statFn(sample))
+  }
+
+  stats.sort((a, b) => a - b)
+
+  const alpha = (1 - confidence) / 2
+  const last = stats.length - 1 // clamp target: confidence = 1 would otherwise index one past the end
+  const lower = stats[Math.min(last, Math.max(0, Math.floor(alpha * stats.length)))] ?? 0
+  const upper = stats[Math.min(last, Math.max(0, Math.floor((1 - alpha) * stats.length)))] ?? 0
+  return [lower, upper]
+}
+
+/** Arithmetic mean of `values`; returns 0 for an empty array. */
+export const mean = (values: number[]): number => {
+  if (values.length === 0) return 0
+  return values.reduce((sum, v) => sum + v, 0) / values.length
+}
+
+/** Median of `values` (input need not be sorted — a sorted copy is made); returns 0 for an empty array. */
+export const median = (values: number[]): number => {
+  if (values.length === 0) return 0
+  const sorted = [...values].sort((a, b) => a - b) // copy first so the caller's array is not reordered
+  const mid = Math.floor(sorted.length / 2)
+  return sorted.length % 2 === 0 ? ((sorted[mid - 1] ?? 0) + (sorted[mid] ?? 0)) / 2 : (sorted[mid] ?? 0) // even length: average the two middle values
+}
diff --git a/.agents/skills/compare-trials/references/compare.ts b/.agents/skills/compare-trials/references/compare.ts
new file mode 100644
index 0000000..0131f4a
--- /dev/null
+++ b/.agents/skills/compare-trials/references/compare.ts
@@ -0,0 +1,195 @@
+/**
+ * Reference comparison script for TrialResult JSONL files.
+ *
+ * @remarks
+ * Takes two JSONL file paths as arguments, computes aggregate and
+ * per-prompt metrics, runs bootstrap resampling for confidence intervals,
+ * and outputs a structured JSON comparison report.
+ *
+ * Usage: bun run compare.ts baseline.jsonl challenger.jsonl
+ */
+
+import { bootstrap, mean, median } from './bootstrap.ts'
+
+// ============================================================================
+// Types (inline — this is a standalone script)
+// ============================================================================
+
+type TrialEntry = { // one attempt at a prompt; only a subset of fields is declared here
+  trialNum: number
+  output: string
+  duration: number // wall-clock ms
+  pass?: boolean
+  score?: number
+}
+
+type TrialResult = { // one JSONL line: all trials for a single prompt
+  id: string // prompt identifier; used as the join key between runs
+  input: string | string[]
+  k: number // trials per prompt
+  passRate?: number // passes / k
+  passAtK?: number // 1 - (1 - passRate)^k
+  passExpK?: number // passRate^k
+  trials: TrialEntry[]
+  metadata?: Record<string, unknown>
+}
+
+type RunMetrics = { // aggregate statistics for one run (one JSONL file)
+  label: string
+  promptCount: number
+  avgPassRate: number
+  avgPassAtK: number
+  avgPassExpK: number
+  avgFlakiness: number // mean of (passAtK - passExpK) per prompt
+  avgDuration: number // mean over every trial of every prompt
+  medianDuration: number
+  passRateCI: [number, number] // bootstrap [lower, upper] over per-prompt pass rates
+  passAtKCI: [number, number] // bootstrap [lower, upper] over per-prompt passAtK
+}
+
+type PerPromptComparison = { // side-by-side metrics for one prompt id
+  id: string
+  baselinePassRate: number | null // null when the run lacks this prompt or the metric
+  challengerPassRate: number | null
+  baselinePassAtK: number | null
+  challengerPassAtK: number | null
+  winner: string | null // 'baseline' or 'challenger'; null on a tie or when either passAtK is missing
+}
+
+type ComparisonReport = { // top-level document that the script prints as JSON
+  baseline: RunMetrics
+  challenger: RunMetrics
+  perPrompt: PerPromptComparison[]
+  summary: {
+    baselineWins: number
+    challengerWins: number
+    ties: number
+    totalPrompts: number // union of ids across both runs; can exceed wins + ties when a prompt is unscored
+  }
+}
+
+// ============================================================================
+// Loading
+// ============================================================================
+
+const loadJsonl = async (path: string): Promise<TrialResult[]> => { // read a JSONL file, one TrialResult per line
+  const content = await Bun.file(path).text()
+  return content
+    .trim()
+    .split('\n')
+    .filter(Boolean) // skip empty lines
+    .map((line) => JSON.parse(line) as TrialResult) // NOTE: unchecked cast — no schema validation; malformed JSON throws
+}
+
+const indexById = (results: TrialResult[]): Map<string, TrialResult> => { // build an id -> result lookup
+  const map = new Map<string, TrialResult>()
+  for (const r of results) map.set(r.id, r) // on duplicate ids the last entry wins
+  return map
+}
+
+// ============================================================================
+// Metrics Computation
+// ============================================================================
+
+const computeRunMetrics = (label: string, results: TrialResult[]): RunMetrics => { // aggregate one run's per-prompt results
+  const passRates = results.map((r) => r.passRate ?? 0) // NOTE: a missing metric is counted as 0, not excluded
+  const passAtKs = results.map((r) => r.passAtK ?? 0)
+  const passExpKs = results.map((r) => r.passExpK ?? 0)
+  const flakiness = results.map((r) => (r.passAtK ?? 0) - (r.passExpK ?? 0))
+  const durations = results.flatMap((r) => r.trials.map((t) => t.duration)) // one entry per trial, across all prompts
+
+  return {
+    label,
+    promptCount: results.length,
+    avgPassRate: mean(passRates),
+    avgPassAtK: mean(passAtKs),
+    avgPassExpK: mean(passExpKs),
+    avgFlakiness: mean(flakiness),
+    avgDuration: mean(durations),
+    medianDuration: median(durations),
+    passRateCI: bootstrap(passRates), // bootstrap defaults: 1000 resamples, 0.95 confidence
+    passAtKCI: bootstrap(passAtKs),
+  }
+}
+
+// ============================================================================
+// Comparison
+// ============================================================================
+
+const compare = (baseline: TrialResult[], challenger: TrialResult[]): ComparisonReport => { // join two runs by prompt id and tally winners
+  const baselineIndex = indexById(baseline)
+  const challengerIndex = indexById(challenger)
+
+  // Union of prompt IDs from both runs, so a prompt present in only one run still gets a row
+  const allIds = new Set([...baselineIndex.keys(), ...challengerIndex.keys()])
+
+  const perPrompt: PerPromptComparison[] = []
+  let baselineWins = 0
+  let challengerWins = 0
+  let ties = 0
+
+  for (const id of allIds) {
+    const b = baselineIndex.get(id)
+    const c = challengerIndex.get(id)
+
+    const bPassAtK = b?.passAtK ?? null // null: prompt absent from this run, or passAtK not recorded
+    const cPassAtK = c?.passAtK ?? null
+
+    let winner: string | null = null
+    if (bPassAtK !== null && cPassAtK !== null) { // only prompts with passAtK in both runs are scored
+      if (bPassAtK > cPassAtK) {
+        winner = 'baseline'
+        baselineWins++
+      } else if (cPassAtK > bPassAtK) {
+        winner = 'challenger'
+        challengerWins++
+      } else {
+        ties++ // equal passAtK; winner stays null
+      }
+    }
+
+    perPrompt.push({
+      id,
+      baselinePassRate: b?.passRate ?? null,
+      challengerPassRate: c?.passRate ?? null,
+      baselinePassAtK: bPassAtK,
+      challengerPassAtK: cPassAtK,
+      winner,
+    })
+  }
+
+  return {
+    baseline: computeRunMetrics('baseline', baseline),
+    challenger: computeRunMetrics('challenger', challenger),
+    perPrompt,
+    summary: {
+      baselineWins,
+      challengerWins,
+      ties,
+      totalPrompts: allIds.size, // includes prompts that were not scored above
+    },
+  }
+}
+
+// ============================================================================
+// Main
+// ============================================================================
+
+const main = async () => { // CLI entry: compare two JSONL files and print the report as JSON
+  const [baselinePath, challengerPath] = process.argv.slice(2) // first two argv entries are not user arguments
+
+  if (!baselinePath || !challengerPath) {
+    console.error('Usage: bun run compare.ts <baseline.jsonl> <challenger.jsonl>')
+    process.exit(1) // usage error
+  }
+
+  const baseline = await loadJsonl(baselinePath)
+  const challenger = await loadJsonl(challengerPath)
+
+  const report = compare(baseline, challenger)
+
+  // biome-ignore lint/suspicious/noConsole: CLI stdout output
+  console.log(JSON.stringify(report, null, 2))
+}
+
+await main()
diff --git a/.agents/skills/trial-adapters/SKILL.md b/.agents/skills/trial-adapters/SKILL.md
new file mode 100644
index 0000000..b8feb28
--- /dev/null
+++ b/.agents/skills/trial-adapters/SKILL.md
@@ -0,0 +1,258 @@
+---
+name: trial-adapters
+description: Write adapter scripts for the trial runner. Adapters wrap any CLI agent as a polyglot script (TypeScript module or executable) using the stdin/stdout JSON contract. Includes patterns for rich trajectory capture and token usage reporting.
+license: ISC
+---
+
+# Trial Adapters
+
+## Purpose
+
+Write adapter scripts that wrap any CLI agent for the trial runner. Adapters follow the polyglot pattern — TypeScript modules or executables with stdin/stdout JSON.
+
+**Use this when:**
+- Wrapping a new CLI agent for evaluation
+- Creating adapters for different agent configurations
+- Building adapters that capture rich trajectory data
+- Integrating external tools (Gemini CLI, local models, A2A agents)
+
+## Adapter Contract
+
+### TypeScript Module
+
+Export an `adapt` function matching the `Adapter` type:
+
+```typescript
+import type { Adapter } from './src/trial.schemas.ts'
+
+export const adapt: Adapter = async ({ prompt, cwd }) => {
+  const text = Array.isArray(prompt) ? prompt.join('\n') : prompt
+  const proc = Bun.spawn(['my-agent', '--prompt', text], { cwd, stdout: 'pipe', stderr: 'pipe' })
+  const output = await new Response(proc.stdout).text()
+  const exitCode = await proc.exited
+  return { output: output.trim(), exitCode }
+}
+```
+
+### Executable Script
+
+Any executable — reads `AdapterInput` from stdin, writes `AdapterResult` to stdout:
+
+```python
+#!/usr/bin/env python3
+import json, sys, subprocess
+
+data = json.load(sys.stdin)
+prompt = data["prompt"] if isinstance(data["prompt"], str) else "\n".join(data["prompt"])
+cwd = data.get("cwd")
+
+result = subprocess.run(
+    ["my-agent", prompt],
+    capture_output=True, text=True, cwd=cwd
+)
+
+print(json.dumps({
+    "output": result.stdout.strip(),
+    "exitCode": result.returncode
+}))
+```
+
+### Input Type
+
+```typescript
+type AdapterInput = {
+  prompt: string | string[]  // Single or multi-turn
+  cwd?: string               // Working directory
+}
+```
+
+### Output Type
+
+```typescript
+type AdapterResult = {
+  output: string                   // Final agent response (required)
+  trajectory?: TrajectoryStep[]    // Optional structured trajectory
+  timing?: {
+    total?: number                 // Adapter-measured duration (ms)
+    inputTokens?: number           // Input tokens consumed
+    outputTokens?: number          // Output tokens generated
+  }
+  exitCode?: number | null         // Process exit code (null if signaled)
+  timedOut?: boolean               // Whether the adapter timed out
+}
+```
+
+## Loading
+
+The trial runner loads adapters via `loadAdapter()`:
+
+```typescript
+import { loadAdapter } from './src/trial.utils.ts'
+
+// TS module: imports and extracts 'adapt' export
+const adapter = await loadAdapter('./my-adapter.ts')
+
+// Executable: wraps as stdin/stdout JSON subprocess
+const adapter = await loadAdapter('./my-adapter.py')
+```
+
+Detection is by file extension: `.ts`, `.js`, `.mjs`, `.cjs` are imported as ES modules. Everything else is spawned as a subprocess.
+
+## Patterns
+
+### Minimal Adapter (Output Only)
+
+Simplest possible adapter — just captures text output:
+
+```typescript
+import type { Adapter } from './src/trial.schemas.ts'
+
+export const adapt: Adapter = async ({ prompt }) => {
+  const text = Array.isArray(prompt) ? prompt.join('\n') : prompt
+  const result = await Bun.$`echo ${text} | my-agent`.text()
+  return { output: result.trim() }
+}
+```
+
+### Rich Adapter (Trajectory + Timing)
+
+Captures structured trajectory for detailed analysis:
+
+```typescript
+import type { Adapter } from './src/trial.schemas.ts'
+import type { TrajectoryStep } from './src/trial.schemas.ts'
+
+export const adapt: Adapter = async ({ prompt, cwd }) => {
+  const text = Array.isArray(prompt) ? prompt.join('\n') : prompt
+  const start = Date.now()
+
+  const proc = Bun.spawn(
+    ['my-agent', '--prompt', text, '--output-format', 'json'],
+    { cwd, stdout: 'pipe', stderr: 'pipe' },
+  )
+
+  const raw = await new Response(proc.stdout).text()
+  const exitCode = await proc.exited
+  const elapsed = Date.now() - start
+
+  // Parse agent's JSON output into trajectory
+  const events = raw.trim().split('\n').filter(Boolean).map((l) => JSON.parse(l))
+  const trajectory: TrajectoryStep[] = []
+  let output = ''
+
+  for (const event of events) {
+    if (event.type === 'thinking') {
+      trajectory.push({ type: 'thought', content: event.text, timestamp: Date.now() })
+    } else if (event.type === 'tool_use') {
+      trajectory.push({
+        type: 'tool_call',
+        name: event.name,
+        status: 'completed',
+        input: event.input,
+        output: event.result,
+        timestamp: Date.now(),
+      })
+    } else if (event.type === 'text') {
+      output += event.text
+      trajectory.push({ type: 'message', content: event.text, timestamp: Date.now() })
+    }
+  }
+
+  return {
+    output,
+    trajectory,
+    timing: {
+      total: elapsed,
+      inputTokens: events.find((e) => e.usage)?.usage?.input_tokens,
+      outputTokens: events.find((e) => e.usage)?.usage?.output_tokens,
+    },
+    exitCode,
+  }
+}
+```
+
+### Multi-Turn Adapter
+
+Handles `prompt: string[]` by sending each turn sequentially:
+
+```typescript
+import type { Adapter } from './src/trial.schemas.ts'
+
+export const adapt: Adapter = async ({ prompt, cwd }) => {
+  const turns = Array.isArray(prompt) ? prompt : [prompt]
+  const outputs: string[] = []
+
+  for (const turn of turns) {
+    const result = await Bun.$`my-agent --prompt ${turn} --cwd ${cwd ?? '.'}`.text()
+    outputs.push(result.trim())
+  }
+
+  return { output: outputs.at(-1) ?? '' }
+}
+```
+
+### Timeout-Aware Adapter
+
+Reports its own timeout detection:
+
+```typescript
+import type { Adapter } from './src/trial.schemas.ts'
+
+export const adapt: Adapter = async ({ prompt, cwd }) => {
+  const text = Array.isArray(prompt) ? prompt.join('\n') : prompt
+  const timeout = 30_000
+  let timedOut = false // set only when our own timer fires
+
+  const proc = Bun.spawn(['my-agent', '--prompt', text], { cwd, stdout: 'pipe' })
+  const timer = setTimeout(() => { timedOut = true; proc.kill() }, timeout)
+
+  const output = await new Response(proc.stdout).text()
+  const exitCode = await proc.exited
+  clearTimeout(timer)
+
+  return { output: timedOut ? '' : output.trim(), exitCode, timedOut }
+}
+```
+
+### In-Process Adapter (No Subprocess)
+
+For agents with a library API — no process spawning needed:
+
+```typescript
+import type { Adapter } from './src/trial.schemas.ts'
+
+export const adapt: Adapter = async ({ prompt, cwd }) => {
+  // Call agent library directly
+  const agent = createAgent({ workspace: cwd })
+  const response = await agent.run(Array.isArray(prompt) ? prompt.join('\n') : prompt)
+  return {
+    output: response.text,
+    trajectory: response.steps,
+    timing: { inputTokens: response.usage.input, outputTokens: response.usage.output },
+  }
+}
+```
+
+## Usage with Trial Runner
+
+```bash
+# CLI: path-based loading
+agent-eval-harness trials '{"adapterPath":"./my-adapter.ts","promptsPath":"prompts.jsonl","k":5}'
+
+# Library: function-based (primary) — TypeScript, not a shell command:
+#   import { runTrial } from './src/trial.ts'
+#   const results = await runTrial({ adapter: myAdapter, prompts, k: 5 })
+```
+
+## Tips
+
+- Return `trajectory` for richer analysis (thought steps, tool calls)
+- Return `timing.inputTokens`/`outputTokens` if the agent exposes usage
+- Set `timedOut: true` if the adapter detects its own timeout
+- Use `cwd` for workspace-isolated code generation tasks
+- For multi-turn, the runner sends the full `prompt: string[]` — the adapter decides how to sequence turns
+
+## Related
+
+- **[trial-runner](../trial-runner/SKILL.md)** — Running trials with adapters
+- **[compare-trials](../compare-trials/SKILL.md)** — Comparing trial results
diff --git a/.agents/skills/trial-runner/SKILL.md b/.agents/skills/trial-runner/SKILL.md
new file mode 100644
index 0000000..c1c87ef
--- /dev/null
+++ b/.agents/skills/trial-runner/SKILL.md
@@ -0,0 +1,182 @@
+---
+name: trial-runner
+description: Run trials against adapters (any CLI/agent), capture trajectories, and optionally grade results. Library-first API with CLI secondary. Supports pass@k reliability analysis, workspace isolation, and polyglot graders.
+license: ISC
+---
+
+# Trial Runner
+
+## Purpose
+
+Run prompts against any adapter, capture structured results, and optionally grade them. The fundamental operation is a **trial** — running k attempts per prompt and measuring pass@k (capability) and pass^k (reliability).
+
+**The runner executes trials. You provide adapters and graders.**
+
+| Runner Provides | You Provide |
+|-----------------|-------------|
+| Trial execution (k runs per prompt) | Adapter script (wraps your agent CLI) |
+| Structured JSONL output | Grader script (scores output) |
+| pass@k/pass^k metrics | Prompts (JSONL) |
+| Concurrent execution + workspace isolation | Comparison analysis scripts |
+
+**Use this when:**
+- Evaluating agent quality with pass@k reliability metrics
+- Capturing trajectories for downstream scoring or training
+- Comparing agents across configurations (via `compare-trials` skill)
+- Orchestrating distillation pipelines
+
+## Library API (Primary)
+
+The in-process API is the primary interface. Agents call `runTrial()` directly:
+
+```typescript
+import { runTrial } from './src/trial.ts'
+import type { Adapter, Grader } from './src/trial.schemas.ts'
+
+const adapter: Adapter = async ({ prompt, cwd }) => {
+  const proc = Bun.spawn(['my-agent', '--prompt', prompt], { cwd })
+  const output = await new Response(proc.stdout).text()
+  return { output }
+}
+
+const results = await runTrial({
+  adapter,
+  prompts: [{ id: 'p1', input: 'Create a button component' }],
+  grader: async ({ output, cwd }) => {
+    const tests = await Bun.$`cd ${cwd} && bun test`.nothrow()
+    return { pass: tests.exitCode === 0, score: tests.exitCode === 0 ? 1 : 0 }
+  },
+  k: 10,
+  concurrency: 4,
+  workspaceDir: './workspaces',
+})
+// results[0].passRate, results[0].passAtK, results[0].passExpK
+```
+
+### runTrial Config
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `adapter` | `Adapter` | required | Adapter function (the CLI loads it from `adapterPath`) |
+| `prompts` | `PromptCase[]` | required | Prompt cases to run |
+| `grader` | `Grader` | none | Optional grading function |
+| `k` | `number` | 1 | Trials per prompt |
+| `outputPath` | `string` | none | JSONL output file (stdout if absent) |
+| `cwd` | `string` | none | Working directory for adapter |
+| `timeout` | `number` | 60000 | Timeout per prompt in ms |
+| `concurrency` | `number` | 1 | Concurrent workers |
+| `workspaceDir` | `string` | none | Per-prompt workspace isolation base dir |
+| `progress` | `boolean` | false | Show progress to stderr |
+| `append` | `boolean` | false | Append to output file |
+
+## CLI (Secondary)
+
+The CLI resolves file paths to functions, then delegates to `runTrial`:
+
+```bash
+# Basic trial
+agent-eval-harness trials '{"adapterPath":"./adapter.ts","promptsPath":"prompts.jsonl","k":5}'
+
+# With grader and progress
+agent-eval-harness trials '{"adapterPath":"./adapter.ts","promptsPath":"prompts.jsonl","k":10,"graderPath":"./grader.ts","concurrency":4,"progress":true}'
+
+# Schema discovery
+agent-eval-harness trials --schema input
+agent-eval-harness trials --schema output
+agent-eval-harness trials --help
+```
+
+## Input Format (prompts.jsonl)
+
+```jsonl
+{"id":"test-001","input":"Create a button","hint":"should contain <button>"}
+{"id":"test-002","input":["Create a button","Make it blue"],"metadata":{"category":"ui"}}
+```
+
+| Field | Required | Description |
+|-------|----------|-------------|
+| `id` | Yes | Unique identifier |
+| `input` | Yes | Single prompt (string) or multi-turn (string[]) |
+| `hint` | No | Context passed to the grader (e.g. expected content) |
+| `reference` | No | Reference solution |
+| `metadata` | No | Tags, category, difficulty |
+| `timeout` | No | Per-case timeout override (ms) |
+
+## Output Format (TrialResult JSONL)
+
+Each line is a `TrialResult`:
+
+```jsonl
+{"id":"test-001","input":"Create a button","k":5,"passRate":0.8,"passAtK":0.99,"passExpK":0.33,"trials":[{"trialNum":1,"output":"...","duration":1234,"pass":true,"score":1.0},...],"metadata":{"category":"ui"}}
+```
+
+### TrialEntry Fields
+
+| Field | Always | Description |
+|-------|--------|-------------|
+| `trialNum` | Yes | Trial number (1-indexed) |
+| `output` | Yes | Agent response text |
+| `duration` | Yes | Wall-clock ms |
+| `trajectory` | No | Structured trajectory (adapter-dependent) |
+| `timing` | No | Adapter-reported timing + token counts |
+| `pass` | No | Pass/fail (with grader) |
+| `score` | No | Numeric score 0-1 (with grader) |
+| `reasoning` | No | Grader explanation |
+
+## Key Metrics
+
+| Metric | Formula | Meaning |
+|--------|---------|---------|
+| `passRate` | passes / k | Raw success rate |
+| `passAtK` | 1 - (1 - passRate)^k | Capability — can it solve this at all? |
+| `passExpK` | passRate^k | Reliability — does it solve this every time? |
+
+## Grader Contract
+
+Graders follow the polyglot pattern — TS module (`export const grade`) or executable (stdin/stdout JSON).
+
+### Test-Based Grading (Coding Tasks)
+
+```typescript
+import type { Grader } from './src/trial.schemas.ts'
+
+export const grade: Grader = async ({ output, hint, cwd }) => {
+  if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' }
+  const tests = await Bun.$`cd ${cwd} && bun test`.nothrow()
+  return {
+    pass: tests.exitCode === 0,
+    score: tests.exitCode === 0 ? 1 : 0,
+    reasoning: `Tests: ${tests.exitCode === 0 ? 'pass' : 'fail'}`,
+  }
+}
+```
+
+### Executable Graders
+
+Any executable — reads JSON from stdin, writes `GraderResult` to stdout:
+
+```python
+#!/usr/bin/env python3
+import json, sys
+data = json.load(sys.stdin)
+output = data.get("output", "").lower()
+hint = (data.get("hint") or "").lower()
+passed = hint in output if hint else True
+print(json.dumps({"pass": passed, "score": 1.0 if passed else 0.0}))
+```
+
+## Schema Exports
+
+```typescript
+import { TrialResultSchema, TrialInputSchema, TrialOutputSchema } from './src/trial.ts'
+import { PromptCaseSchema, GraderResultSchema, AdapterResultSchema } from './src/trial.schemas.ts'
+import * as z from 'zod'
+
+// Generate JSON Schema (Zod 4 native)
+const jsonSchema = z.toJSONSchema(TrialResultSchema)
+```
+
+## Related
+
+- **[trial-adapters](../trial-adapters/SKILL.md)** — Writing adapter scripts for the trial runner
+- **[compare-trials](../compare-trials/SKILL.md)** — Statistical comparison of trial results
diff --git a/bin/cli.ts b/bin/cli.ts
deleted file mode 100644
index d69d782..0000000
--- a/bin/cli.ts
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env bun
-
-/**
- * Agent Eval Harness CLI - Agent evaluation toolkit.
- *
- * @remarks
- * Router for harness commands. Thin wrapper that delegates to command modules.
- *
- * Commands:
- * - capture: Core trajectory capture
- * - trials: Multi-run pass@k/pass^k analysis
- * - summarize: Derive compact views from results
- * - calibrate: Sample failures for grader review
- * - validate-refs: Check reference solutions
- * - balance: Analyze test set coverage
- * - schemas: Export JSON schemas for non-TS users
- * - headless: Schema-driven adapter for any headless CLI agent
- */
-
-import { balance } from '../src/commands/balance.ts'
-import { calibrate } from '../src/commands/calibrate.ts'
-import { capture } from '../src/commands/capture.ts'
-import { summarize } from '../src/commands/summarize.ts'
-import { trials } from '../src/commands/trials.ts'
-import { validateRefs } from '../src/commands/validate-refs.ts'
-import { headless } from '../src/headless.ts'
-import { compare, extract, format, grade, run } from '../src/pipeline.ts'
-import { schemasCli } from '../src/schemas/schemas-cli.ts'
-
-const [command, ...args] = Bun.argv.slice(2)
-
-const printHelp = () => {
-  console.log(`
-agent-eval-harness - CLI tool for agent evaluation
-
-Commands:
-  capture          Capture trajectories from CLI agents
-  trials           Run prompts multiple times for pass@k/pass^k metrics
-  summarize        Derive compact views from results
-  calibrate        Sample failures for grader review
-  validate-refs    Check reference solutions against grader
-  balance          Analyze test set coverage
-  schemas          Export JSON schemas for non-TypeScript users
-  headless         Schema-driven adapter for any headless CLI agent
-
-Pipeline Commands (Unix-style composable):
-  run              Execute prompts and output raw results
-  extract          Parse raw output into trajectories
-  grade            Apply grader to extracted results
-  format           Convert results to different output formats
-  compare          Compare multiple runs of the same prompts
-
-Run 'agent-eval-harness <command> --help' for command-specific help.
-
-Examples:
-  # Basic capture with schema
-  agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl
-
-  # With grader
-  agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl
-
-  # Multi-run trials
-  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl
-
-  # Derive summary view
-  agent-eval-harness summarize results.jsonl -o summary.jsonl
-
-  # Pipeline workflow
-  cat prompts.jsonl | \\
-    agent-eval-harness run -s claude.json | \\
-    agent-eval-harness extract -s claude.json | \\
-    agent-eval-harness grade -g ./grader.ts | \\
-    agent-eval-harness format -f markdown > report.md
-
-  # Compare multiple runs
-  agent-eval-harness compare run1.jsonl run2.jsonl -g ./compare-grader.ts
-
-Documentation: https://github.com/plaited/agent-eval-harness
-`)
-}
-
-const main = async () => {
-  switch (command) {
-    case 'capture':
-      await capture(args)
-      break
-
-    case 'trials':
-      await trials(args)
-      break
-
-    case 'summarize':
-      await summarize(args)
-      break
-
-    case 'calibrate':
-      await calibrate(args)
-      break
-
-    case 'validate-refs':
-      await validateRefs(args)
-      break
-
-    case 'balance':
-      await balance(args)
-      break
-
-    case 'schemas':
-      await schemasCli(args)
-      break
-
-    case 'headless':
-      await headless(args)
-      break
-
-    // Pipeline commands
-    case 'run':
-      await run(args)
-      break
-
-    case 'extract':
-      await extract(args)
-      break
-
-    case 'grade':
-      await grade(args)
-      break
-
-    case 'format':
-      await format(args)
-      break
-
-    case 'compare':
-      await compare(args)
-      break
-
-    case '-h':
-    case '--help':
-    case undefined:
-      printHelp()
-      break
-
-    case '-v':
-    case '--version': {
-      const { version } = await import('../package.json')
-      console.log(version)
-      break
-    }
-
-    default:
-      console.error(`Unknown command: ${command}`)
-      console.error("Run 'agent-eval-harness --help' for usage")
-      process.exit(1)
-  }
-}
-
-main().catch((error) => {
-  console.error('Error:', error instanceof Error ? error.message : error)
-  process.exit(1)
-})
diff --git a/bin/tests/cli.spec.ts b/bin/tests/cli.spec.ts
deleted file mode 100644
index 957b294..0000000
--- a/bin/tests/cli.spec.ts
+++ /dev/null
@@ -1,529 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-import { join } from 'node:path'
-import { z } from 'zod'
-
-/**
- * Tests for the agent-eval-harness CLI.
- *
- * @remarks
- * Tests CLI argument parsing, help output, and output format schemas.
- * Integration tests requiring an actual CLI agent are in *.docker.ts files.
- */
-
-const CLI_PATH = join(import.meta.dir, '..', 'cli.ts')
-
-// ============================================================================
-// CLI Invocation Tests
-// ============================================================================
-
-describe('CLI invocation', () => {
-  test('shows help with --help flag', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH, '--help'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stdout = await new Response(proc.stdout).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).toBe(0)
-    expect(stdout).toContain('agent-eval-harness')
-    expect(stdout).toContain('Commands:')
-    expect(stdout).toContain('capture')
-    expect(stdout).toContain('trials')
-    expect(stdout).toContain('summarize')
-  })
-
-  test('shows help with -h flag', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH, '-h'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stdout = await new Response(proc.stdout).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).toBe(0)
-    expect(stdout).toContain('agent-eval-harness')
-  })
-
-  test('shows help when no arguments provided', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stdout = await new Response(proc.stdout).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).toBe(0) // Exits cleanly when showing help
-    expect(stdout).toContain('agent-eval-harness')
-  })
-
-  test('help shows example commands', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH, '--help'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stdout = await new Response(proc.stdout).text()
-
-    expect(stdout).toContain('--schema')
-    expect(stdout).toContain('prompts.jsonl')
-    expect(stdout).toContain('results.jsonl')
-  })
-
-  test('help shows available commands', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH, '--help'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stdout = await new Response(proc.stdout).text()
-
-    expect(stdout).toContain('capture')
-    expect(stdout).toContain('trials')
-    expect(stdout).toContain('summarize')
-    expect(stdout).toContain('calibrate')
-    expect(stdout).toContain('balance')
-    expect(stdout).toContain('schemas')
-  })
-
-  test('fails with non-existent schema file', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH, 'capture', 'prompts.jsonl', '--schema', 'nonexistent.json'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('Schema file not found')
-  })
-
-  test('fails when no schema provided', async () => {
-    const tmpFile = `/tmp/test-prompts-${Date.now()}.jsonl`
-    await Bun.write(tmpFile, '{"id":"test-001","input":"test"}\n')
-
-    const proc = Bun.spawn(['bun', CLI_PATH, 'capture', tmpFile], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).toBe(1)
-    expect(stderr).toContain('--schema is required')
-  })
-
-  test('fails with unknown command', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH, 'unknown-command'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).toBe(1)
-    expect(stderr).toContain('Unknown command')
-  })
-
-  test('capture command shows help with --help', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH, 'capture', '--help'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stdout = await new Response(proc.stdout).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).toBe(0)
-    expect(stdout).toContain('capture')
-    expect(stdout).toContain('prompts.jsonl')
-    expect(stdout).toContain('--output')
-  })
-
-  test('trials command shows help with --help', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH, 'trials', '--help'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stdout = await new Response(proc.stdout).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).toBe(0)
-    expect(stdout).toContain('trials')
-    expect(stdout).toContain('-k')
-    expect(stdout).toContain('pass@k')
-  })
-})
-
-// ============================================================================
-// Output Format Schemas (for downstream validation)
-// ============================================================================
-
-const SummaryResultSchema = z.object({
-  id: z.string(),
-  input: z.string(),
-  output: z.string(),
-  toolCalls: z.array(z.string()),
-  duration: z.number(),
-})
-
-const TrajectoryStepSchema = z.discriminatedUnion('type', [
-  z.object({
-    type: z.literal('thought'),
-    content: z.string(),
-    timestamp: z.number(),
-    stepId: z.string().optional(),
-  }),
-  z.object({
-    type: z.literal('message'),
-    content: z.string(),
-    timestamp: z.number(),
-    stepId: z.string().optional(),
-  }),
-  z.object({
-    type: z.literal('tool_call'),
-    name: z.string(),
-    status: z.string(),
-    input: z.unknown().optional(),
-    output: z.unknown().optional(),
-    duration: z.number().optional(),
-    timestamp: z.number(),
-    stepId: z.string().optional(),
-  }),
-  z.object({
-    type: z.literal('plan'),
-    entries: z.array(z.unknown()),
-    timestamp: z.number(),
-    stepId: z.string().optional(),
-  }),
-])
-
-const CaptureResultSchema = z.object({
-  id: z.string(),
-  input: z.string(),
-  output: z.string(),
-  expected: z.string().optional(),
-  trajectory: z.array(TrajectoryStepSchema),
-  metadata: z.record(z.string(), z.unknown()),
-  timing: z.object({
-    start: z.number(),
-    end: z.number(),
-    firstResponse: z.number().optional(),
-  }),
-  toolErrors: z.boolean(),
-  errors: z.array(z.string()).optional(),
-})
-
-// ============================================================================
-// Sample Output Data (matches harness output format)
-// ============================================================================
-
-const SAMPLE_SUMMARY_JSONL = `{"id":"test-001","input":"Create a button","output":"I created the button","toolCalls":["Write"],"duration":1234}
-{"id":"test-002","input":"Fix the bug","output":"I fixed the bug","toolCalls":["Read","Edit"],"duration":2567}
-{"id":"test-003","input":"Broken test","output":"","toolCalls":[],"duration":500}`
-
-const SAMPLE_CAPTURE_JSONL = `{"id":"test-001","input":"Create a button","output":"I created the button","trajectory":[{"type":"thought","content":"I'll create a button template","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"export const Button = () => <button>Click</button>"},"output":"File written","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui","agent":"claude-headless"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"toolErrors":false}
-{"id":"test-002","input":"Fix the bug","output":"I fixed the bug","trajectory":[{"type":"tool_call","name":"Read","status":"completed","input":{"file_path":"src/app.ts"},"output":"file contents...","duration":100,"timestamp":50,"stepId":"test-002-step-1"},{"type":"tool_call","name":"Edit","status":"completed","input":{"file_path":"src/app.ts","old_string":"bug","new_string":"fix"},"duration":150,"timestamp":200,"stepId":"test-002-step-2"},{"type":"message","content":"I fixed the bug","timestamp":400,"stepId":"test-002-step-3"}],"metadata":{"category":"bugfix","agent":"claude-headless"},"timing":{"start":1704067300000,"end":1704067302567},"toolErrors":false}`
-
-// ============================================================================
-// Downstream Pattern Tests
-// ============================================================================
-
-describe('downstream patterns: summary JSONL', () => {
-  const parseResults = (jsonl: string) =>
-    jsonl
-      .trim()
-      .split('\n')
-      .map((line) => JSON.parse(line))
-
-  test('parses summary JSONL correctly', () => {
-    const results = parseResults(SAMPLE_SUMMARY_JSONL)
-
-    expect(results).toHaveLength(3)
-    for (const result of results) {
-      expect(() => SummaryResultSchema.parse(result)).not.toThrow()
-    }
-  })
-
-  test('filters by output presence (jq pattern)', () => {
-    const results = parseResults(SAMPLE_SUMMARY_JSONL)
-    const withOutput = results.filter((r) => r.output.length > 0)
-
-    expect(withOutput).toHaveLength(2)
-  })
-
-  test('calculates average duration (jq pattern)', () => {
-    const results = parseResults(SAMPLE_SUMMARY_JSONL)
-    const avg = results.reduce((sum, r) => sum + r.duration, 0) / results.length
-
-    expect(avg).toBeCloseTo(1433.67, 0)
-  })
-
-  test('counts tool usage (jq pattern)', () => {
-    const results = parseResults(SAMPLE_SUMMARY_JSONL)
-    const allTools = results.flatMap((r) => r.toolCalls)
-    const toolCounts = allTools.reduce<Record<string, number>>((acc, tool) => {
-      acc[tool] = (acc[tool] ?? 0) + 1
-      return acc
-    }, {})
-
-    expect(toolCounts).toEqual({ Write: 1, Read: 1, Edit: 1 })
-  })
-
-  test('calculates success rate by output presence', () => {
-    const results = parseResults(SAMPLE_SUMMARY_JSONL)
-    const withOutput = results.filter((r) => r.output.length > 0).length
-    const total = results.length
-
-    expect(withOutput).toBe(2)
-    expect(total).toBe(3)
-    expect(withOutput / total).toBeCloseTo(0.667, 2)
-  })
-})
-
-describe('downstream patterns: capture JSONL', () => {
-  const parseResults = (jsonl: string) =>
-    jsonl
-      .trim()
-      .split('\n')
-      .map((line) => JSON.parse(line))
-
-  test('parses capture JSONL with trajectories', () => {
-    const results = parseResults(SAMPLE_CAPTURE_JSONL)
-
-    expect(results).toHaveLength(2)
-    for (const result of results) {
-      expect(() => CaptureResultSchema.parse(result)).not.toThrow()
-    }
-  })
-
-  test('step IDs follow expected format', () => {
-    const results = parseResults(SAMPLE_CAPTURE_JSONL)
-
-    for (const result of results) {
-      for (const step of result.trajectory) {
-        expect(step.stepId).toMatch(new RegExp(`^${result.id}-step-\\d+$`))
-      }
-    }
-  })
-
-  test('step-level retrieval pattern works', () => {
-    const results = parseResults(SAMPLE_CAPTURE_JSONL)
-
-    // Build step index (pattern from downstream.md)
-    const stepIndex = new Map<string, unknown>()
-    for (const result of results) {
-      for (const step of result.trajectory) {
-        stepIndex.set(step.stepId, step)
-      }
-    }
-
-    // Retrieve specific step by ID
-    const step = stepIndex.get('test-001-step-2') as { name: string; input: { file_path: string } }
-    expect(step).toBeDefined()
-    expect(step.name).toBe('Write')
-    expect(step.input.file_path).toBe('src/button.tsx')
-  })
-
-  test('extracts tool calls from trajectory', () => {
-    const results = parseResults(SAMPLE_CAPTURE_JSONL)
-    const result = results[1] // test-002
-
-    const toolCalls = result.trajectory.filter((s: { type: string }) => s.type === 'tool_call')
-    expect(toolCalls).toHaveLength(2)
-    expect(toolCalls.map((t: { name: string }) => t.name)).toEqual(['Read', 'Edit'])
-  })
-
-  test('filters by metadata category', () => {
-    const results = parseResults(SAMPLE_CAPTURE_JSONL)
-    const uiResults = results.filter((r) => r.metadata.category === 'ui')
-
-    expect(uiResults).toHaveLength(1)
-    expect(uiResults[0]?.id).toBe('test-001')
-  })
-
-  test('identifies results with tool errors', () => {
-    const results = parseResults(SAMPLE_CAPTURE_JSONL)
-    const withErrors = results.filter((r) => r.toolErrors)
-
-    expect(withErrors).toHaveLength(0) // Sample data has no errors
-  })
-})
-
-describe('downstream patterns: advanced filtering', () => {
-  const parseResults = (jsonl: string) =>
-    jsonl
-      .trim()
-      .split('\n')
-      .map((line) => JSON.parse(line))
-
-  test('filters by tool usage (jq contains pattern)', () => {
-    const results = parseResults(SAMPLE_SUMMARY_JSONL)
-    const withWrite = results.filter((r) => r.toolCalls.includes('Write'))
-
-    expect(withWrite).toHaveLength(1)
-    expect(withWrite[0]?.id).toBe('test-001')
-  })
-
-  test('filters by duration threshold (slow evaluations)', () => {
-    const results = parseResults(SAMPLE_SUMMARY_JSONL)
-    const slow = results.filter((r) => r.duration > 2000)
-
-    expect(slow).toHaveLength(1)
-    expect(slow[0]?.id).toBe('test-002')
-  })
-
-  test('finds slowest evaluations (sorted)', () => {
-    const results = parseResults(SAMPLE_SUMMARY_JSONL)
-    const sorted = [...results].sort((a, b) => b.duration - a.duration)
-    const top2 = sorted.slice(0, 2)
-
-    expect(top2[0]?.id).toBe('test-002')
-    expect(top2[1]?.id).toBe('test-001')
-  })
-
-  test('deduplicates by ID keeping latest (merge pattern)', () => {
-    const combinedJsonl = `${SAMPLE_SUMMARY_JSONL}
-{"id":"test-001","input":"Create a button v2","output":"I created the button v2","toolCalls":["Write","Edit"],"duration":1500}`
-
-    const results = parseResults(combinedJsonl)
-
-    // Group by ID and keep last occurrence (simulates jq group_by + last)
-    const byId = new Map<string, unknown>()
-    for (const result of results) {
-      byId.set(result.id, result)
-    }
-    const deduped = Array.from(byId.values())
-
-    expect(deduped).toHaveLength(3) // test-001, test-002, test-003
-    const test001 = deduped.find((r) => (r as { id: string }).id === 'test-001') as { input: string }
-    expect(test001?.input).toBe('Create a button v2')
-  })
-
-  test('groups by category and counts', () => {
-    const results = parseResults(SAMPLE_CAPTURE_JSONL)
-
-    // Group by category (simulates jq group_by pattern)
-    const grouped = results.reduce<Record<string, number>>((acc, r) => {
-      const cat = r.metadata.category as string
-      acc[cat] = (acc[cat] ?? 0) + 1
-      return acc
-    }, {})
-
-    expect(grouped).toEqual({ ui: 1, bugfix: 1 })
-  })
-
-  test('extracts timing information', () => {
-    const results = parseResults(SAMPLE_CAPTURE_JSONL)
-    const result = results[0]
-
-    expect(result.timing.start).toBe(1704067200000)
-    expect(result.timing.end).toBe(1704067201234)
-    expect(result.timing.firstResponse).toBe(100)
-    expect(result.timing.end - result.timing.start).toBe(1234) // matches duration
-  })
-})
-
-// ============================================================================
-// MCP Server Config Parsing Tests
-// ============================================================================
-
-describe('MCP server config parsing', () => {
-  test('parses stdio MCP server config', () => {
-    const json = '{"type":"stdio","name":"fs","command":"mcp-filesystem","args":["/data"],"env":[]}'
-    const proc = Bun.spawn(
-      ['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', '--schema', './test-schema.json', '--mcp-server', json, '--help'],
-      {
-        stdout: 'pipe',
-        stderr: 'pipe',
-      },
-    )
-
-    // If it doesn't crash, the parsing worked
-    expect(proc.exited).resolves.toBeDefined()
-  })
-
-  test('parses http MCP server config', () => {
-    const json =
-      '{"type":"http","name":"api","url":"https://example.com/mcp","headers":[{"name":"Authorization","value":"Bearer token"}]}'
-    const proc = Bun.spawn(
-      ['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', '--schema', './test-schema.json', '--mcp-server', json, '--help'],
-      {
-        stdout: 'pipe',
-        stderr: 'pipe',
-      },
-    )
-
-    // If it doesn't crash, the parsing worked
-    expect(proc.exited).resolves.toBeDefined()
-  })
-
-  test('accepts multiple MCP servers', () => {
-    const json1 = '{"type":"stdio","name":"fs","command":"mcp-filesystem","args":[],"env":[]}'
-    const json2 = '{"type":"http","name":"api","url":"https://example.com","headers":[]}'
-    const proc = Bun.spawn(
-      [
-        'bun',
-        CLI_PATH,
-        'capture',
-        '/tmp/test.jsonl',
-        '--schema',
-        './test-schema.json',
-        '--mcp-server',
-        json1,
-        '--mcp-server',
-        json2,
-        '--help',
-      ],
-      {
-        stdout: 'pipe',
-        stderr: 'pipe',
-      },
-    )
-
-    // If it doesn't crash, the parsing worked
-    expect(proc.exited).resolves.toBeDefined()
-  })
-})
-
-// ============================================================================
-// Error Handling Tests
-// ============================================================================
-
-describe('error handling', () => {
-  test('fails when schema file does not exist', async () => {
-    const tmpFile = `/tmp/invalid-${Date.now()}.jsonl`
-    await Bun.write(tmpFile, '{"id": "t1", "input": "test"}\n')
-
-    const proc = Bun.spawn(['bun', CLI_PATH, 'capture', tmpFile, '--schema', 'nonexistent-schema.json'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('Schema file not found')
-  })
-
-  test('capture command requires prompts path', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH, 'capture'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).toBe(1)
-    expect(stderr).toContain('prompts.jsonl path is required')
-  })
-
-  test('summarize command requires input path', async () => {
-    const proc = Bun.spawn(['bun', CLI_PATH, 'summarize'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).toBe(1)
-    expect(stderr).toContain('results.jsonl path is required')
-  })
-})
diff --git a/bun.lock b/bun.lock
index 983a915..5737ff7 100644
--- a/bun.lock
+++ b/bun.lock
@@ -5,7 +5,6 @@
     "": {
       "name": "@plaited/acp",
       "dependencies": {
-        "@plaited/development-skills": "0.8.0",
         "zod": "^4.3.6",
       },
       "devDependencies": {
@@ -52,8 +51,6 @@
 
     "@nodelib/fs.walk": ["@nodelib/fs.walk@1.2.8", "", { "dependencies": { "@nodelib/fs.scandir": "2.1.5", "fastq": "^1.6.0" } }, "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg=="],
 
-    "@plaited/development-skills": ["@plaited/development-skills@0.8.0", "", { "peerDependencies": { "typescript-language-server": "^5.1.3" }, "bin": { "development-skills": "bin/cli.ts" } }, "sha512-1dXWKPco9fkFLJhFuuOJB1aktF1qY97v/evthfNsun11VG1gA8efD44s3txxUIPs4FcL5KSvcvvlaqbmHQk7ew=="],
-
     "@types/bun": ["@types/bun@1.3.9", "", { "dependencies": { "bun-types": "1.3.9" } }, "sha512-KQ571yULOdWJiMH+RIWIOZ7B2RXQGpL1YQrBtLIV3FqDcCu6FsbFUBwhdKUlCKUpS3PJDsHlJ1QKlpxoVR+xtw=="],
 
     "@types/node": ["@types/node@25.0.9", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-/rpCXHlCWeqClNBwUhDcusJxXYDjZTyE8v5oTO7WbL8eij2nKhUeU89/6xgjU7N4/Vh3He0BtyhJdQbDyhiXAw=="],
@@ -218,8 +215,6 @@
 
     "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
 
-    "typescript-language-server": ["typescript-language-server@5.1.3", "", { "bin": { "typescript-language-server": "lib/cli.mjs" } }, "sha512-r+pAcYtWdN8tKlYZPwiiHNA2QPjXnI02NrW5Sf2cVM3TRtuQ3V9EKKwOxqwaQ0krsaEXk/CbN90I5erBuf84Vg=="],
-
     "undici-types": ["undici-types@7.16.0", "", {}, "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="],
 
     "universalify": ["universalify@2.0.1", "", {}, "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw=="],
diff --git a/package.json b/package.json
index 352c0cb..af150e1 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.13.0",
-  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
+  "version": "1.0.0",
+  "description": "General-purpose eval harness for running trials against CLI agents",
   "license": "ISC",
   "engines": {
     "bun": ">= v1.2.9"
@@ -15,22 +15,17 @@
   },
   "homepage": "https://github.com/plaited/agent-eval-harness/tree/main#readme",
   "bin": {
-    "agent-eval-harness": "./bin/cli.ts"
+    "agent-eval-harness": "./src/cli.ts"
   },
   "type": "module",
   "exports": {
-    ".": "./src/harness.ts",
-    "./schemas": "./src/schemas.ts",
-    "./headless": "./src/headless.ts",
-    "./pipeline": "./src/pipeline.ts"
+    ".": "./src/trial.ts",
+    "./schemas": "./src/trial.schemas.ts"
   },
   "files": [
     "./src/**",
-    "./bin/**",
     "!./src/**/tests/*",
-    "!./src/**/*.spec.ts",
-    "!./bin/**/tests/*",
-    "!./bin/**/*.spec.ts"
+    "!./src/**/*.spec.ts"
   ],
   "publishConfig": {
     "access": "public"
@@ -42,10 +37,7 @@
     "check:types": "tsc --noEmit",
     "check:write": "biome check --write && format-package --write",
     "prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
-    "test": "bun run test:bin && bun test:src",
-    "test:bin": "bun test bin/tests/*.spec.ts",
-    "test:integration": "bun test ./**/integration_tests/*.spec.ts",
-    "test:src": "bun test src/**/tests/*.spec.ts"
+    "test": "bun test src/"
   },
   "lint-staged": {
     "*.{js,cjs,jsx,tsx,ts}": [
@@ -56,7 +48,6 @@
     ]
   },
   "dependencies": {
-    "@plaited/development-skills": "0.8.0",
     "zod": "^4.3.6"
   },
   "devDependencies": {
diff --git a/src/cli.ts b/src/cli.ts
new file mode 100644
index 0000000..7821670
--- /dev/null
+++ b/src/cli.ts
@@ -0,0 +1,22 @@
+#!/usr/bin/env bun
+import { trialCli } from './trial.ts'
+
+// Dispatch on the first CLI argument; the remaining arguments go to the subcommand.
+const [command, ...args] = process.argv.slice(2)
+
+switch (command) {
+  case 'trials':
+    await trialCli(args)
+    break
+  case 'compare':
+  case 'calibrate':
+    // Reserved subcommands: fail with exit code 1 until they are implemented.
+    console.error(`${command} command not yet implemented`)
+    process.exit(1)
+    break
+  default:
+    // With no arguments `command` is undefined; avoid printing "Unknown command: undefined".
+    console.error(command === undefined ? 'No command provided' : `Unknown command: ${command}`)
+    console.error('Available commands: trials, compare, calibrate')
+    process.exit(1)
+}
diff --git a/src/cli.utils.ts b/src/cli.utils.ts
new file mode 100644
index 0000000..6567d8e
--- /dev/null
+++ b/src/cli.utils.ts
@@ -0,0 +1,99 @@
+/**
+ * Shared CLI utilities for the eval harness.
+ *
+ * @remarks
+ * Implements the CLI tool pattern: JSON positional arg or stdin pipe,
+ * `--schema input|output` for discovery, `--help` for usage, exit codes 0/1/2.
+ *
+ * @internal
+ */
+
+import * as z from 'zod'
+
+// ============================================================================
+// Raw Input Extraction (shared plumbing)
+// ============================================================================
+
+/**
+ * Resolve the raw JSON payload for a subcommand and decode it.
+ *
+ * @remarks
+ * Checks, in order: `--help`/`-h` (usage on stderr, exit 0), `--schema`
+ * (JSON Schema on stdout, exit 0), the first argument not starting with
+ * `--`, then non-TTY stdin. A missing payload or invalid JSON exits 2.
+ *
+ * @internal
+ */
+const parseRawCliInput = async (
+  args: string[],
+  schema: z.ZodSchema,
+  options: { name: string; outputSchema?: z.ZodSchema },
+): Promise<unknown> => {
+  const wantsHelp = args.some((flag) => flag === '--help' || flag === '-h')
+  if (wantsHelp) {
+    console.error(`Usage: agent-eval-harness ${options.name} '<json>' | --schema input`)
+    process.exit(0)
+  }
+
+  const flagAt = args.indexOf('--schema')
+  if (flagAt !== -1) {
+    // Only `--schema output` with a registered output schema selects it; anything else emits the input schema.
+    const emitted = (args[flagAt + 1] === 'output' && options.outputSchema) || schema
+    console.log(JSON.stringify(z.toJSONSchema(emitted), null, 2))
+    process.exit(0)
+  }
+
+  // The first argument that does not start with `--` is taken as the JSON payload.
+  const [firstPositional] = args.filter((candidate) => !candidate.startsWith('--'))
+  let payload: string | undefined = firstPositional
+
+  if (firstPositional === undefined && !process.stdin.isTTY) {
+    // No positional argument: fall back to stdin, ignoring whitespace-only input.
+    const piped = (await Bun.stdin.text()).trim()
+    if (piped) payload = piped
+  }
+
+  if (!payload) {
+    // Nothing usable from either source (an empty positional string also lands here).
+    console.error(`Usage: agent-eval-harness ${options.name} '<json>' | --schema input`)
+    process.exit(2)
+  }
+
+  try {
+    return JSON.parse(payload)
+  } catch {
+    // The parse error detail is intentionally dropped in favour of a fixed message.
+    console.error('Invalid JSON input')
+    process.exit(2)
+  }
+}
+
+// ============================================================================
+// CLI Input Parser
+// ============================================================================
+
+/**
+ * Read a subcommand's JSON input and validate it against `schema`.
+ *
+ * @remarks
+ * - `--help` / `-h`: prints usage to stderr, exits 0
+ * - `--schema input`: emits input JSON Schema, exits 0
+ * - `--schema output`: emits output JSON Schema (input schema if none given), exits 0
+ * - First positional arg or stdin pipe: JSON validated with Zod
+ * - Exit 2 on missing, unparseable, or schema-invalid input
+ *
+ * @internal
+ */
+export const parseCli = async <T extends z.ZodSchema>(
+  args: string[],
+  schema: T,
+  options: { name: string; outputSchema?: z.ZodSchema },
+): Promise<z.infer<T>> => {
+  // Meta flags and JSON decoding are handled by the shared helper.
+  const decoded = await parseRawCliInput(args, schema, options)
+  const outcome = schema.safeParse(decoded)
+  if (outcome.success) return outcome.data
+  // Validation failed: dump every Zod issue as JSON on stderr, then exit 2.
+  console.error(JSON.stringify(outcome.error.issues, null, 2))
+  process.exit(2)
+}
diff --git a/src/commands.ts b/src/commands.ts
deleted file mode 100644
index cd1969c..0000000
--- a/src/commands.ts
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * CLI command implementations for agent evaluation harness.
- *
- * @remarks
- * Re-exports all CLI commands for programmatic use.
- * For CLI usage, run `agent-eval-harness <command> --help`.
- *
- * @packageDocumentation
- */
-
-// Balance command
-export type { BalanceConfig } from './commands/balance.ts'
-export { balance, runBalance } from './commands/balance.ts'
-
-// Calibrate command
-export type { CalibrateConfig } from './commands/calibrate.ts'
-export { calibrate, runCalibrate } from './commands/calibrate.ts'
-
-// Capture command
-export type { CaptureConfig } from './commands/capture.ts'
-export { capture, runCapture } from './commands/capture.ts'
-
-// Summarize command
-export type { SummarizeConfig } from './commands/summarize.ts'
-export { runSummarize, summarize } from './commands/summarize.ts'
-
-// Trials command
-export type { TrialsConfig } from './commands/trials.ts'
-export { runTrials, trials } from './commands/trials.ts'
-
-// Validate-refs command
-export type { ValidateRefsConfig } from './commands/validate-refs.ts'
-export { runValidateRefs, validateRefs } from './commands/validate-refs.ts'
diff --git a/src/commands/balance.ts b/src/commands/balance.ts
deleted file mode 100644
index 115cfd2..0000000
--- a/src/commands/balance.ts
+++ /dev/null
@@ -1,245 +0,0 @@
-/**
- * Balance command - analyze test set coverage.
- *
- * @remarks
- * Analyzes the distribution of test cases by metadata categories.
- * Identifies underrepresented categories and suggests improvements.
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import { loadPrompts, resolvePath } from '../core.ts'
-import type { BalanceAnalysis, CategoryDistribution, PromptCase } from '../schemas.ts'
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** Configuration for balance command */
-export type BalanceConfig = {
-  /** Path to prompts.jsonl file */
-  promptsPath: string
-  /** Output file path */
-  outputPath?: string
-  /** Metadata key to analyze (default: 'category') */
-  key?: string
-  /** Threshold for underrepresentation (percentage) */
-  threshold?: number
-}
-
-/**
- * Analyze category distribution across prompts.
- *
- * @param prompts - Array of prompt cases
- * @param key - Metadata key to analyze
- * @returns Array of category distributions sorted by count descending
- *
- * @public
- */
-export const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribution[] => {
-  const counts = new Map<string, number>()
-
-  for (const prompt of prompts) {
-    const value = prompt.metadata?.[key]
-    const category = value !== undefined ? String(value) : '(uncategorized)'
-    counts.set(category, (counts.get(category) ?? 0) + 1)
-  }
-
-  const total = prompts.length
-  const distributions: CategoryDistribution[] = []
-
-  for (const [name, count] of counts) {
-    distributions.push({
-      name,
-      count,
-      percentage: Math.round((count / total) * 100),
-    })
-  }
-
-  // Sort by count descending
-  distributions.sort((a, b) => b.count - a.count)
-
-  return distributions
-}
-
-/**
- * Identify underrepresented categories.
- *
- * @param distributions - Array of category distributions
- * @param threshold - Percentage threshold relative to even distribution
- * @returns Array of underrepresented category names
- *
- * @public
- */
-export const findUnderrepresented = (distributions: CategoryDistribution[], threshold: number): string[] => {
-  // Expected percentage if evenly distributed
-  const evenPercentage = 100 / distributions.length
-
-  return distributions.filter((d) => d.percentage < evenPercentage * (threshold / 100)).map((d) => d.name)
-}
-
-/**
- * Generate suggestions for improving test set balance.
- *
- * @param distributions - Array of category distributions
- * @param underrepresented - Array of underrepresented category names
- * @param total - Total number of test cases
- * @returns Array of suggestion strings
- *
- * @public
- */
-export const generateSuggestions = (
-  distributions: CategoryDistribution[],
-  underrepresented: string[],
-  total: number,
-): string[] => {
-  const suggestions: string[] = []
-
-  if (underrepresented.length > 0) {
-    suggestions.push(`Consider adding more test cases for: ${underrepresented.join(', ')}`)
-  }
-
-  // Check for category with > 50% of cases
-  const dominant = distributions.find((d) => d.percentage > 50)
-  if (dominant) {
-    suggestions.push(`Category '${dominant.name}' has ${dominant.percentage}% of cases - consider diversifying`)
-  }
-
-  // Check for very small categories
-  const tiny = distributions.filter((d) => d.count < 3)
-  if (tiny.length > 0) {
-    suggestions.push(`Categories with < 3 cases may not be reliable: ${tiny.map((d) => d.name).join(', ')}`)
-  }
-
-  // Check total test count
-  if (total < 20) {
-    suggestions.push(`Consider expanding test set (currently ${total} cases) for more statistical significance`)
-  }
-
-  if (suggestions.length === 0) {
-    suggestions.push('Test set appears well-balanced')
-  }
-
-  return suggestions
-}
-
-// ============================================================================
-// Balance Implementation
-// ============================================================================
-
-/**
- * Execute balance analysis with configuration object.
- *
- * @param config - Balance configuration
- * @returns Balance analysis result
- */
-export const runBalance = async (config: BalanceConfig): Promise<BalanceAnalysis> => {
-  const { promptsPath, outputPath, key = 'category', threshold = 50 } = config
-
-  // Load prompts
-  const prompts = await loadPrompts(promptsPath)
-
-  console.error(`Analyzing ${prompts.length} prompts by '${key}' metadata...`)
-
-  // Analyze distribution
-  const categories = analyzeCategories(prompts, key)
-  const underrepresented = findUnderrepresented(categories, threshold)
-  const suggestions = generateSuggestions(categories, underrepresented, prompts.length)
-
-  const analysis: BalanceAnalysis = {
-    totalCases: prompts.length,
-    categories,
-    underrepresented,
-    suggestions,
-  }
-
-  // Format output
-  const output = JSON.stringify(analysis, null, 2)
-
-  // Write output
-  if (outputPath) {
-    await Bun.write(resolvePath(outputPath), output)
-  } else {
-    console.log(output)
-  }
-
-  // Summary to stderr
-  console.error('\nCategory Distribution:')
-  for (const cat of categories) {
-    const bar = '█'.repeat(Math.round(cat.percentage / 5))
-    console.error(`  ${cat.name}: ${cat.count} (${cat.percentage}%) ${bar}`)
-  }
-
-  if (underrepresented.length > 0) {
-    console.error(`\nUnderrepresented: ${underrepresented.join(', ')}`)
-  }
-
-  console.error('\nSuggestions:')
-  for (const suggestion of suggestions) {
-    console.error(`  - ${suggestion}`)
-  }
-
-  return analysis
-}
-
-// ============================================================================
-// CLI Entry Point
-// ============================================================================
-
-/**
- * Balance command CLI handler.
- *
- * @param args - Command line arguments (after 'balance')
- */
-export const balance = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      output: { type: 'string', short: 'o' },
-      key: { type: 'string', short: 'k', default: 'category' },
-      threshold: { type: 'string', short: 't', default: '50' },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness balance <prompts.jsonl> [options]
-
-Arguments:
-  prompts.jsonl     Input file with prompts
-
-Options:
-  -o, --output      Output file (default: stdout)
-  -k, --key         Metadata key to analyze (default: 'category')
-  -t, --threshold   Underrepresentation threshold % (default: 50)
-  -h, --help        Show this help message
-
-Output:
-  JSON with category distribution, underrepresented categories, and suggestions.
-
-Examples:
-  # Analyze by default 'category' key
-  agent-eval-harness balance prompts.jsonl -o balance.json
-
-  # Analyze by custom metadata key
-  agent-eval-harness balance prompts.jsonl --key difficulty -o balance.json
-`)
-    return
-  }
-
-  const promptsPath = positionals[0]
-  if (!promptsPath) {
-    console.error('Error: prompts.jsonl path is required')
-    process.exit(1)
-  }
-
-  await runBalance({
-    promptsPath,
-    outputPath: values.output,
-    key: values.key ?? 'category',
-    threshold: Number.parseInt(values.threshold ?? '50', 10),
-  })
-}
diff --git a/src/commands/calibrate.ts b/src/commands/calibrate.ts
deleted file mode 100644
index 4bd9466..0000000
--- a/src/commands/calibrate.ts
+++ /dev/null
@@ -1,304 +0,0 @@
-/**
- * Calibrate command - sample failures for grader review.
- *
- * @remarks
- * Helps identify grader bugs by sampling failures for human review.
- * Can optionally re-score with a different grader for comparison.
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import { loadResults, resolvePath } from '../core.ts'
-import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from '../schemas/constants.ts'
-import { loadGraderOrExit } from '../schemas/grader-loader.ts'
-import type { CalibrationSample, Grader, GraderResult, TrajectoryStep } from '../schemas.ts'
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** Configuration for calibrate command */
-export type CalibrateConfig = {
-  /** Path to results.jsonl file */
-  resultsPath: string
-  /** Output file path */
-  outputPath?: string
-  /** Number of samples to include */
-  sample?: number
-  /** Optional grader for re-scoring */
-  grader?: Grader
-}
-
-/**
- * Randomly sample n elements from an array using Fisher-Yates shuffle.
- *
- * @param arr - Array to sample from
- * @param n - Number of samples to take
- * @returns Array of sampled elements in random order
- *
- * @remarks
- * Uses Fisher-Yates (Knuth) shuffle for uniform distribution.
- * Creates a copy to avoid mutating the input array.
- * O(n) time complexity with O(n) space for the copy.
- * Not cryptographically secure (uses Math.random).
- *
- * @public
- */
-export const sampleArray = <T>(arr: T[], n: number): T[] => {
-  if (n <= 0) return []
-  if (n >= arr.length) return [...arr]
-
-  const copy = [...arr]
-
-  // Fisher-Yates shuffle working backwards through array
-  // Only shuffle enough elements to get n samples
-  const limit = copy.length - n
-  for (let i = copy.length - 1; i >= limit && i > 0; i--) {
-    // Random index from 0 to i (inclusive)
-    const j = Math.floor(Math.random() * (i + 1))
-    // Swap elements
-    ;[copy[i], copy[j]] = [copy[j]!, copy[i]!]
-  }
-
-  return copy.slice(-n)
-}
-
-/**
- * Get snippet of trajectory for review.
- *
- * @remarks
- * Includes first 2 steps, middle step, and last 2 steps.
- *
- * @param trajectory - Full trajectory
- * @param maxSteps - Maximum number of steps to include
- * @returns Trajectory snippet
- *
- * @public
- */
-export const getTrajectorySnippet = (trajectory: TrajectoryStep[], maxSteps = 5): TrajectoryStep[] => {
-  // Include first and last steps, plus some from the middle
-  if (trajectory.length <= maxSteps) return trajectory
-
-  const result: TrajectoryStep[] = []
-
-  // First 2 steps
-  result.push(...trajectory.slice(0, 2))
-
-  // Middle step
-  const mid = Math.floor(trajectory.length / 2)
-  result.push(trajectory[mid] as TrajectoryStep)
-
-  // Last 2 steps
-  result.push(...trajectory.slice(-2))
-
-  return result
-}
-
-/** Format calibration sample as markdown */
-const formatCalibrationMarkdown = (samples: CalibrationSample[]): string => {
-  const lines: string[] = [
-    '# Grader Calibration Report',
-    '',
-    `Generated: ${new Date().toISOString()}`,
-    `Samples: ${samples.length}`,
-    '',
-    '## Instructions',
-    '',
-    'Review each failure below and mark whether:',
-    '- [ ] **Valid failure** - Grader correctly identified a problem',
-    '- [ ] **Grader bug** - Output was actually correct, grader was wrong',
-    '- [ ] **Ambiguous** - Unclear if the output is correct or not',
-    '',
-    '---',
-    '',
-  ]
-
-  for (let i = 0; i < samples.length; i++) {
-    const sample = samples[i]
-    if (!sample) continue
-
-    lines.push(`## Sample ${i + 1}: ${sample.id}`)
-    lines.push('')
-    lines.push(`**Input:** ${sample.input}`)
-    lines.push('')
-
-    if (sample.hint) {
-      lines.push(`**Hint:** ${sample.hint}`)
-      lines.push('')
-    }
-
-    lines.push(`**Output:** ${sample.output.slice(0, 500)}${sample.output.length > 500 ? '...' : ''}`)
-    lines.push('')
-
-    lines.push(`**Original Score:** ${sample.originalScore.pass ? 'PASS' : 'FAIL'} (${sample.originalScore.score})`)
-    if (sample.originalScore.reasoning) {
-      lines.push(`**Reasoning:** ${sample.originalScore.reasoning}`)
-    }
-    lines.push('')
-
-    if (sample.rescoredResult) {
-      lines.push(`**Re-scored:** ${sample.rescoredResult.pass ? 'PASS' : 'FAIL'} (${sample.rescoredResult.score})`)
-      if (sample.rescoredResult.reasoning) {
-        lines.push(`**Re-score Reasoning:** ${sample.rescoredResult.reasoning}`)
-      }
-      lines.push('')
-    }
-
-    lines.push('**Trajectory Snippet:**')
-    lines.push('```')
-    for (const step of sample.trajectorySnippet) {
-      if (step.type === 'tool_call') {
-        lines.push(`[${step.type}] ${step.name}: ${step.status}`)
-      } else if (step.type === 'message' || step.type === 'thought') {
-        lines.push(`[${step.type}] ${step.content.slice(0, 100)}...`)
-      } else if (step.type === 'plan') {
-        lines.push(`[${step.type}] ${(step.entries as Array<{ content: string }>).length} entries`)
-      }
-    }
-    lines.push('```')
-    lines.push('')
-
-    lines.push('**Review:**')
-    lines.push('- [ ] Valid failure')
-    lines.push('- [ ] Grader bug')
-    lines.push('- [ ] Ambiguous')
-    lines.push('')
-    lines.push('---')
-    lines.push('')
-  }
-
-  return lines.join('\n')
-}
-
-// ============================================================================
-// Calibrate Implementation
-// ============================================================================
-
-/**
- * Execute calibrate with configuration object.
- *
- * @param config - Calibrate configuration
- * @returns Calibration samples
- */
-export const runCalibrate = async (config: CalibrateConfig): Promise<CalibrationSample[]> => {
-  const { resultsPath, outputPath, sample = DEFAULT_CALIBRATION_SAMPLE_SIZE, grader } = config
-
-  // Load results
-  const results = await loadResults(resultsPath)
-
-  // Filter to failures (or results without scores)
-  const failures = results.filter((r) => r.score && !r.score.pass)
-
-  if (failures.length === 0) {
-    console.error('No failures found in results')
-    return []
-  }
-
-  // Sample failures
-  const sampled = sampleArray(failures, Math.min(sample, failures.length))
-
-  // Build calibration samples
-  const samples: CalibrationSample[] = []
-
-  for (const result of sampled) {
-    const calibrationSample: CalibrationSample = {
-      id: result.id,
-      input: result.input,
-      output: result.output,
-      hint: result.hint,
-      originalScore: result.score as GraderResult,
-      trajectorySnippet: getTrajectorySnippet(result.trajectory),
-    }
-
-    // Re-score with different grader if provided
-    if (grader) {
-      calibrationSample.rescoredResult = await grader({
-        input: result.input,
-        output: result.output,
-        hint: result.hint,
-        trajectory: result.trajectory,
-        metadata: result.metadata,
-      })
-    }
-
-    samples.push(calibrationSample)
-  }
-
-  // Format as markdown
-  const markdown = formatCalibrationMarkdown(samples)
-
-  // Write output
-  if (outputPath) {
-    await Bun.write(resolvePath(outputPath), markdown)
-  } else {
-    console.log(markdown)
-  }
-
-  return samples
-}
-
-// ============================================================================
-// CLI Entry Point
-// ============================================================================
-
-/**
- * Calibrate command CLI handler.
- *
- * @param args - Command line arguments (after 'calibrate')
- */
-export const calibrate = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      output: { type: 'string', short: 'o' },
-      sample: { type: 'string', short: 's', default: String(DEFAULT_CALIBRATION_SAMPLE_SIZE) },
-      grader: { type: 'string', short: 'g' },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness calibrate <results.jsonl> [options]
-
-Arguments:
-  results.jsonl     Input file with scored capture results
-
-Options:
-  -o, --output      Output file (default: stdout)
-  -s, --sample      Number of failures to sample (default: ${DEFAULT_CALIBRATION_SAMPLE_SIZE})
-  -g, --grader      Path to alternative grader (.ts/.js module or executable script)
-  -h, --help        Show this help message
-
-Output:
-  Markdown report with sampled failures for human review.
-  Includes checkboxes for labeling (valid failure / grader bug / ambiguous).
-
-Examples:
-  # Sample failures for review
-  agent-eval-harness calibrate results.jsonl --sample 10 -o calibration.md
-
-  # Re-score with different grader to compare
-  agent-eval-harness calibrate results.jsonl --grader ./loose-grader.ts -o comparison.md
-`)
-    return
-  }
-
-  const resultsPath = positionals[0]
-  if (!resultsPath) {
-    console.error('Error: results.jsonl path is required')
-    process.exit(1)
-  }
-
-  // Load grader if specified
-  const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
-
-  await runCalibrate({
-    resultsPath,
-    outputPath: values.output,
-    sample: Number.parseInt(values.sample ?? String(DEFAULT_CALIBRATION_SAMPLE_SIZE), 10),
-    grader,
-  })
-}
diff --git a/src/commands/capture.ts b/src/commands/capture.ts
deleted file mode 100644
index c78f863..0000000
--- a/src/commands/capture.ts
+++ /dev/null
@@ -1,391 +0,0 @@
-/**
- * Core trajectory capture command.
- *
- * @remarks
- * Executes prompts against a CLI agent and captures full trajectories.
- * This is the foundational command - all other views derive from its output.
- *
- * Output format is always full trajectory JSONL (`CaptureResultSchema`).
- * Use `summarize` command to derive compact views.
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import {
-  createWorkspaceDir,
-  detectTrajectoryRichness,
-  extractOutput,
-  extractTrajectory,
-  getInputPreview,
-  hasToolErrors,
-  logProgress,
-  readStdinPrompts,
-} from '../core.ts'
-import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
-import type { ProcessExitInfo, PromptResult } from '../headless/headless-session-manager.ts'
-import { loadGraderOrExit } from '../schemas/grader-loader.ts'
-import type { CaptureResult, PromptCase, TrajectoryRichness } from '../schemas.ts'
-import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
-
-// ============================================================================
-// Re-exports for backward compatibility
-// ============================================================================
-
-// These functions are now in core/ but re-exported here for existing consumers
-export {
-  detectTrajectoryRichness,
-  extractContent,
-  extractFilePath,
-  extractOutput,
-  extractTrajectory,
-  hasToolErrors,
-  headTailPreview,
-  loadPrompts,
-} from '../core.ts'
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** Configuration for capture command */
-export type CaptureConfig = BaseExecutionConfig
-
-// ============================================================================
-// Capture Implementation
-// ============================================================================
-
-/**
- * Execute capture with configuration object.
- *
- * @remarks
- * Creates a fresh session for each JSONL entry to ensure isolation.
- * Supports multi-turn conversations via `input: string[]`.
- *
- * @param config - Capture configuration
- * @returns Array of capture results
- */
-export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
-  const ctx = await prepareExecution(config)
-  const {
-    schema,
-    prompts,
-    sessions,
-    resolvedOutputPath,
-    resolvedWorkspaceDir,
-    defaultWorkingDir,
-    progress,
-    grader,
-    debug,
-  } = ctx
-
-  // Log progress info
-  logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
-  logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
-  logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
-  if (ctx.concurrency > 1) {
-    logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
-  }
-  if (resolvedWorkspaceDir) {
-    logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
-  }
-  if (resolvedOutputPath) {
-    logProgress(`Output: ${resolvedOutputPath}`, progress)
-  }
-  if (debug) {
-    logProgress(`Debug mode: enabled`, progress)
-  }
-
-  // Process a single prompt (used by worker pool)
-  const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise<CaptureResult> => {
-    // Determine working directory (per-prompt workspace or default)
-    const workingDir = resolvedWorkspaceDir
-      ? await createWorkspaceDir(resolvedWorkspaceDir, promptCase.id)
-      : defaultWorkingDir
-
-    logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
-
-    const startTime = Date.now()
-    let result: CaptureResult
-    let sessionId: string | undefined
-
-    try {
-      // Create fresh session for each entry (ensures isolation)
-      const sessionStart = Date.now()
-      const session = await sessions.create(workingDir)
-      sessionId = session.id
-      const sessionCreation = Date.now() - sessionStart
-      logProgress(`  Session: ${session.id}`, progress)
-
-      // Handle string or array input
-      const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
-      const turnCount = inputs.length
-
-      // Collect all updates from all turns
-      const allUpdates: ParsedUpdate[] = []
-      let lastExitInfo: ProcessExitInfo | undefined
-      let lastOutput = ''
-
-      // Execute each turn sequentially in the same session
-      for (const turnInput of inputs) {
-        const turnResult: PromptResult = await sessions.prompt(session.id, turnInput)
-        allUpdates.push(...turnResult.updates)
-        lastExitInfo = turnResult.exitInfo
-        lastOutput = turnResult.output
-      }
-
-      const endTime = Date.now()
-      const trajectory = extractTrajectory(allUpdates, startTime)
-
-      // Use last turn's output or extract from trajectory
-      const output = lastOutput || extractOutput(trajectory)
-      const toolErrors = hasToolErrors(trajectory) || (lastExitInfo?.timedOut ?? false)
-      const trajectoryRichness = detectTrajectoryRichness(trajectory)
-
-      result = {
-        id: promptCase.id,
-        input: promptCase.input,
-        output,
-        ...(promptCase.hint && { hint: promptCase.hint }),
-        trajectory,
-        metadata: {
-          ...promptCase.metadata,
-          agent: schema.name,
-          trajectoryRichness,
-          turnCount,
-          ...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
-          ...(lastExitInfo && {
-            exitCode: lastExitInfo.exitCode,
-            signal: lastExitInfo.signal,
-            timedOut: lastExitInfo.timedOut,
-          }),
-        },
-        timing: {
-          start: startTime,
-          end: endTime,
-          firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined,
-          sessionCreation,
-          total: endTime - startTime,
-        },
-        toolErrors,
-      }
-
-      // Apply grader if provided
-      if (grader) {
-        const graderResult = await grader({
-          input: promptCase.input,
-          output,
-          hint: promptCase.hint,
-          trajectory,
-          metadata: promptCase.metadata,
-          cwd: session.cwd,
-        })
-
-        result.score = graderResult
-
-        if (graderResult.outcome) {
-          result.outcome = graderResult.outcome
-        }
-      }
-    } catch (error) {
-      const endTime = Date.now()
-      const message = error instanceof Error ? error.message : String(error)
-      const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
-
-      result = {
-        id: promptCase.id,
-        input: promptCase.input,
-        output: '',
-        trajectory: [],
-        metadata: {
-          ...promptCase.metadata,
-          agent: schema.name,
-          trajectoryRichness: 'minimal' as TrajectoryRichness,
-          turnCount: inputs.length,
-          ...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
-        },
-        timing: {
-          start: startTime,
-          end: endTime,
-          sessionCreation: 0,
-          total: endTime - startTime,
-        },
-        toolErrors: true,
-        errors: [message],
-      }
-    } finally {
-      // Always clean up session if it was created
-      if (sessionId) {
-        sessions.destroy(sessionId)
-      }
-    }
-
-    // Write result immediately (coordinated via mutex for concurrent writes)
-    await ctx.writeResult(result)
-
-    const statusIcon = result.toolErrors ? '!' : '✓'
-    const exitInfo = result.metadata?.timedOut
-      ? ' - TIMEOUT'
-      : result.metadata?.exitCode && result.metadata.exitCode !== 0
-        ? ` - exit ${result.metadata.exitCode}`
-        : ''
-    logProgress(`  ${statusIcon} ${promptCase.id} (${result.timing.total}ms)${exitInfo}`, progress)
-
-    return result
-  }
-
-  // Run with worker pool
-  return executePrompts(ctx, processPrompt)
-}
-
-// ============================================================================
-// CLI Entry Point
-// ============================================================================
-
-/**
- * Capture command CLI handler.
- *
- * @param args - Command line arguments (after 'capture')
- */
-export const capture = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      schema: { type: 'string', short: 's' },
-      output: { type: 'string', short: 'o' },
-      cwd: { type: 'string', short: 'c' },
-      timeout: { type: 'string', short: 't' },
-      progress: { type: 'boolean', default: false },
-      append: { type: 'boolean', default: false },
-      grader: { type: 'string', short: 'g' },
-      debug: { type: 'boolean', default: false },
-      stdin: { type: 'boolean', default: false },
-      concurrency: { type: 'string', short: 'j' },
-      'workspace-dir': { type: 'string' },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]
-       cat prompts.jsonl | agent-eval-harness capture --stdin --schema <schema.json> [options]
-
-Arguments:
-  prompts.jsonl     Input file with evaluation prompts
-
-Options:
-  -s, --schema      Path to agent schema JSON file (required)
-  -o, --output      Output file (default: stdout)
-  -c, --cwd         Working directory for agent
-  -t, --timeout     Request timeout in ms (overrides schema default)
-  -j, --concurrency Number of concurrent workers (default: 1)
-  --stdin           Read prompts from stdin (mutually exclusive with file arg)
-  --workspace-dir   Base directory for per-prompt workspace isolation
-  --progress        Show progress to stderr
-  --append          Append to output file instead of overwriting
-  -g, --grader      Path to grader (.ts/.js module or executable script)
-  --debug           Enable debug mode (shows raw output, JSONPath matching)
-  -h, --help        Show this help message
-
-Output Format:
-  Full trajectory JSONL with toolErrors indicator.
-  Use 'agent-eval-harness summarize' to derive compact views.
-
-Exit Info (in metadata):
-  exitCode      Process exit code (null if killed/timed out)
-  signal        Signal that killed process (if any)
-  timedOut      true if process was killed due to timeout
-
-Graders:
-  TS/JS modules must export a 'grade' function.
-  Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
-
-Parallelization:
-  Use -j/--concurrency to run multiple prompts in parallel.
-  Each prompt gets its own agent session for isolation.
-  Results are written as they complete (order may differ from input).
-
-  Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
-  at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
-  In memory-constrained environments (Docker, CI) this can cause OOM kills.
-  Use --stdin to pipe prompts for container-level orchestration.
-
-Workspace Isolation:
-  Use --workspace-dir to create per-prompt directories.
-  Each prompt runs in {workspace-dir}/prompt-{id}/.
-  Useful for code generation tasks requiring filesystem isolation.
-
-Examples:
-  # Basic capture with schema
-  agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl
-
-  # Run 4 prompts in parallel
-  agent-eval-harness capture prompts.jsonl -s claude.json -j 4 -o results.jsonl
-
-  # With workspace isolation for code generation
-  agent-eval-harness capture prompts.jsonl -s claude.json -j 4 \\
-    --workspace-dir ./workspaces -o results.jsonl
-
-  # With TypeScript grader
-  agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl
-
-  # With debug mode
-  agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
-
-  # Read prompts from stdin (container orchestration)
-  cat prompts.jsonl | agent-eval-harness capture --stdin -s claude.json -o results.jsonl
-`)
-    return
-  }
-
-  const promptsPath = positionals[0]
-  const useStdin = values.stdin ?? false
-
-  // Mutual exclusivity: --stdin and positional file
-  if (useStdin && promptsPath) {
-    console.error('Error: --stdin and prompts file argument are mutually exclusive')
-    process.exit(1)
-  }
-
-  if (!useStdin && !promptsPath) {
-    console.error('Error: prompts.jsonl path is required (or use --stdin)')
-    process.exit(1)
-  }
-
-  if (!values.schema) {
-    console.error('Error: --schema is required')
-    console.error('Example: agent-eval-harness capture prompts.jsonl --schema ./claude.json')
-    process.exit(1)
-  }
-
-  // Read prompts from stdin if requested
-  let prompts: PromptCase[] | undefined
-  if (useStdin) {
-    const stdinPrompts = await readStdinPrompts()
-    if (!stdinPrompts || stdinPrompts.length === 0) {
-      console.error('Error: no prompts received on stdin')
-      process.exit(1)
-    }
-    prompts = stdinPrompts
-  }
-
-  // Load grader if specified
-  const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
-
-  await runCapture({
-    promptsPath: promptsPath ?? undefined,
-    prompts,
-    schemaPath: values.schema,
-    outputPath: values.output,
-    cwd: values.cwd,
-    timeout: values.timeout ? Number.parseInt(values.timeout, 10) : undefined,
-    progress: values.progress ?? false,
-    append: values.append ?? false,
-    grader,
-    debug: values.debug ?? false,
-    concurrency: parseConcurrency(values.concurrency),
-    workspaceDir: values['workspace-dir'],
-  })
-}
diff --git a/src/commands/execution.ts b/src/commands/execution.ts
deleted file mode 100644
index 4a565fe..0000000
--- a/src/commands/execution.ts
+++ /dev/null
@@ -1,245 +0,0 @@
-/**
- * Shared execution utilities for capture and trials commands.
- *
- * @remarks
- * Extracts common setup logic: schema loading, prompt loading, path resolution,
- * session manager creation, output initialization, and worker pool execution.
- *
- * @packageDocumentation
- */
-
-import { mkdir } from 'node:fs/promises'
-import { createWriteMutex, loadPrompts, logProgress, resolvePath, runWorkerPool, writeOutput } from '../core.ts'
-import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
-import { createSessionManager, type SessionManager } from '../headless/headless-session-manager.ts'
-import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
-import type { Grader, PromptCase } from '../schemas.ts'
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** Base configuration shared by capture and trials commands */
-export type BaseExecutionConfig = {
-  /** Path to prompts.jsonl file (required unless prompts provided) */
-  promptsPath?: string
-  /** Path to agent schema JSON file */
-  schemaPath: string
-  /** Pre-loaded prompt cases (from stdin); skips file loading when set */
-  prompts?: PromptCase[]
-  /** Output file path (undefined for stdout) */
-  outputPath?: string
-  /** Working directory for agent */
-  cwd?: string
-  /** Timeout per prompt in milliseconds (overrides schema default) */
-  timeout?: number
-  /** Show progress to stderr */
-  progress?: boolean
-  /** Append to output file instead of overwriting */
-  append?: boolean
-  /** Optional grader function */
-  grader?: Grader
-  /** Enable debug mode */
-  debug?: boolean
-  /** Number of concurrent workers (default: 1 for sequential) */
-  concurrency?: number
-  /** Base directory for per-prompt workspace isolation */
-  workspaceDir?: string
-}
-
-/** Prepared execution context returned by prepareExecution */
-export type ExecutionContext = {
-  /** Parsed and validated headless adapter schema */
-  schema: HeadlessAdapterConfig
-  /** Loaded and validated prompt cases */
-  prompts: PromptCase[]
-  /** Session manager for creating/destroying agent sessions */
-  sessions: SessionManager
-  /** Resolved absolute output path (undefined for stdout) */
-  resolvedOutputPath?: string
-  /** Resolved absolute workspace directory path */
-  resolvedWorkspaceDir?: string
-  /** Effective timeout in milliseconds */
-  effectiveTimeout: number
-  /** Default working directory for agent sessions */
-  defaultWorkingDir: string
-  /** Number of concurrent workers */
-  concurrency: number
-  /** Whether to show progress output */
-  progress: boolean
-  /** Optional grader function */
-  grader?: Grader
-  /** Whether debug mode is enabled */
-  debug: boolean
-  /** Write a result object as JSONL, coordinated via mutex */
-  writeResult: (result: unknown) => Promise<void>
-}
-
-// ============================================================================
-// Execution Setup
-// ============================================================================
-
-/**
- * Prepare execution context from base configuration.
- *
- * @remarks
- * Handles all shared setup: schema loading/validation, prompt loading,
- * path resolution, session manager creation, output file initialization,
- * workspace directory creation, and write mutex coordination.
- *
- * @param config - Base execution configuration
- * @returns Prepared execution context
- * @throws Error if schema file not found, invalid, or prompts missing
- *
- * @public
- */
-export const prepareExecution = async (config: BaseExecutionConfig): Promise<ExecutionContext> => {
-  const {
-    promptsPath,
-    schemaPath,
-    outputPath,
-    cwd,
-    timeout,
-    progress = false,
-    append = false,
-    grader,
-    debug = false,
-    concurrency = 1,
-    workspaceDir,
-  } = config
-
-  // Validate prompt source
-  if (!config.prompts && !promptsPath) {
-    throw new Error('Either promptsPath or prompts must be provided')
-  }
-
-  // Load and validate schema
-  const schemaFile = Bun.file(schemaPath)
-  if (!(await schemaFile.exists())) {
-    throw new Error(`Schema file not found: ${schemaPath}`)
-  }
-
-  let schema: HeadlessAdapterConfig
-  try {
-    const rawSchema = await schemaFile.json()
-    schema = parseHeadlessConfig(rawSchema)
-  } catch (error) {
-    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
-  }
-
-  // Load prompts
-  const prompts = config.prompts ?? (await loadPrompts(promptsPath!))
-
-  // Resolve paths
-  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
-  const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
-
-  // Determine effective timeout (CLI flag > schema default > harness default)
-  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
-  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
-
-  // Create session manager
-  const sessions = createSessionManager({
-    schema,
-    timeout: effectiveTimeout,
-    verbose: progress,
-    debug,
-  })
-
-  // Initialize output file (clear if not appending)
-  if (resolvedOutputPath && !append) {
-    await Bun.write(resolvedOutputPath, '')
-  }
-
-  // Create workspace base directory if specified
-  if (resolvedWorkspaceDir) {
-    await mkdir(resolvedWorkspaceDir, { recursive: true })
-  }
-
-  const defaultWorkingDir = cwd ?? process.cwd()
-
-  // Create write mutex with closure for coordinated result writing
-  const writeMutex = createWriteMutex()
-  let isFirstOutput = true
-
-  const writeResult = async (result: unknown) => {
-    await writeMutex.write(async () => {
-      const formatted = JSON.stringify(result)
-      await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
-      isFirstOutput = false
-    })
-  }
-
-  return {
-    schema,
-    prompts,
-    sessions,
-    resolvedOutputPath,
-    resolvedWorkspaceDir,
-    effectiveTimeout,
-    defaultWorkingDir,
-    concurrency,
-    progress,
-    grader,
-    debug,
-    writeResult,
-  }
-}
-
-// ============================================================================
-// Worker Pool Execution
-// ============================================================================
-
-/**
- * Execute prompts through a worker pool with progress logging.
- *
- * @remarks
- * Common wrapper for the runWorkerPool pattern used by both capture and trials.
- * Handles progress callbacks, error logging, and completion logging.
- *
- * @param ctx - Execution context from prepareExecution
- * @param processFn - Function to process each prompt
- * @returns Array of results
- *
- * @public
- */
-export const executePrompts = async <T>(
-  ctx: ExecutionContext,
-  processFn: (promptCase: PromptCase, index: number) => Promise<T>,
-): Promise<T[]> => {
-  const { results, errors } = await runWorkerPool(ctx.prompts, processFn, {
-    concurrency: ctx.concurrency,
-    onProgress: (completed, total) => {
-      logProgress(`Progress: ${completed}/${total} prompts completed`, ctx.progress)
-    },
-  })
-
-  if (errors.length > 0) {
-    logProgress(`Completed with ${errors.length} error(s)`, ctx.progress)
-  }
-
-  logProgress('Done!', ctx.progress)
-  return results
-}
-
-// ============================================================================
-// CLI Helpers
-// ============================================================================
-
-/**
- * Parse and validate concurrency CLI argument.
- *
- * @param value - Raw string value from parseArgs
- * @returns Validated positive integer (default: 1)
- *
- * @public
- */
-export const parseConcurrency = (value: string | undefined): number => {
-  if (!value) return 1
-  const parsed = Number.parseInt(value, 10)
-  if (Number.isNaN(parsed) || parsed < 1) {
-    console.error('Error: --concurrency must be a positive integer')
-    process.exit(1)
-  }
-  return parsed
-}
diff --git a/src/commands/summarize.ts b/src/commands/summarize.ts
deleted file mode 100644
index 81499f7..0000000
--- a/src/commands/summarize.ts
+++ /dev/null
@@ -1,226 +0,0 @@
-/**
- * Summarize command - derive compact views from full trajectory results.
- *
- * @remarks
- * Transforms full trajectory JSONL into:
- * - Summary JSONL: Compact format for jq analysis
- * - Markdown: Human-readable format for LLM-as-judge workflows
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import { extractContent, extractFilePath, headTailPreview, loadResults, resolvePath } from '../core.ts'
-import { HEAD_LINES, MAX_CONTENT_LENGTH, TAIL_LINES } from '../schemas/constants.ts'
-import type { CaptureResult, SummaryResult } from '../schemas.ts'
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** Configuration for summarize command */
-export type SummarizeConfig = {
-  /** Path to results.jsonl file */
-  resultsPath: string
-  /** Output file path */
-  outputPath?: string
-  /** Output as markdown instead of JSONL */
-  markdown?: boolean
-}
-
-/**
- * Format capture result as compact summary.
- *
- * @param result - Full capture result
- * @returns Compact summary result
- *
- * @public
- */
-export const formatSummary = (result: CaptureResult): SummaryResult => {
-  const inputText = Array.isArray(result.input) ? result.input.join('\n') : result.input
-  return {
-    id: result.id,
-    input: inputText,
-    output: result.output,
-    toolCalls: result.trajectory.flatMap((s) => (s.type === 'tool_call' ? [s.name] : [])),
-    duration: result.timing.end - result.timing.start,
-  }
-}
-
-/**
- * Format capture result as markdown with step IDs.
- *
- * @param result - Full capture result
- * @returns Markdown formatted string
- *
- * @public
- */
-export const formatMarkdown = (result: CaptureResult): string => {
-  const inputText = Array.isArray(result.input) ? result.input.join('\n') : result.input
-  const lines: string[] = [`## Evaluation Record: ${result.id}`, '', `**Input:** ${inputText}`, '', '**Trajectory:**']
-
-  let stepNum = 1
-  for (const step of result.trajectory) {
-    const stepId = `${result.id}-step-${stepNum}`
-
-    if (step.type === 'thought') {
-      const preview = step.content.slice(0, 100)
-      const truncated = step.content.length > 100 ? '...' : ''
-      lines.push(`${stepNum}. [THOUGHT] ${preview}${truncated} [→${stepId}]`)
-      stepNum++
-    } else if (step.type === 'tool_call') {
-      const duration = step.duration ? ` (${step.duration}ms)` : ''
-      const filePath = extractFilePath(step.input)
-      const content = extractContent(step.input)
-
-      lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)
-
-      // Add file path if present
-      if (filePath) {
-        const charCount = content?.length ?? 0
-        lines.push(`   File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
-      }
-
-      // Add head/tail preview for content-producing tools
-      if (content && content.length > 0) {
-        const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content, HEAD_LINES, TAIL_LINES) : content
-        // Detect file extension for syntax highlighting
-        const ext = filePath?.split('.').pop() ?? 'typescript'
-        lines.push(`   \`\`\`${ext}`)
-        lines.push(`   ${preview.split('\n').join('\n   ')}`)
-        lines.push('   ```')
-      }
-      stepNum++
-    } else if (step.type === 'plan') {
-      const entries = step.entries as Array<{ content: string; status: string }>
-      const planSummary = entries.map((e) => `${e.content}: ${e.status}`).join(', ')
-      const truncated = planSummary.length > 80 ? '...' : ''
-      lines.push(`${stepNum}. [PLAN] ${planSummary.slice(0, 80)}${truncated} [→${stepId}]`)
-      stepNum++
-    } else if (step.type === 'message') {
-      const preview = step.content.slice(0, 100)
-      const truncated = step.content.length > 100 ? '...' : ''
-      lines.push(`${stepNum}. [MESSAGE] ${preview}${truncated} [→${stepId}]`)
-      stepNum++
-    }
-  }
-
-  lines.push('')
-  const outputPreview = result.output.slice(0, 200)
-  const outputTruncated = result.output.length > 200 ? '...' : ''
-  lines.push(`**Output:** ${outputPreview}${outputTruncated}`)
-  lines.push('')
-
-  const metadataStr = Object.entries(result.metadata)
-    .map(([k, v]) => `${k}=${v}`)
-    .join(', ')
-  lines.push(`**Metadata:** ${metadataStr}`)
-  lines.push(`**Tool Errors:** ${result.toolErrors}`)
-  lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)
-
-  if (result.score) {
-    lines.push(`**Score:** ${result.score.pass ? 'PASS' : 'FAIL'} (${result.score.score})`)
-    if (result.score.reasoning) {
-      lines.push(`**Reasoning:** ${result.score.reasoning}`)
-    }
-  }
-
-  lines.push('')
-  lines.push('---')
-  lines.push('')
-
-  return lines.join('\n')
-}
-
-// ============================================================================
-// Summarize Implementation
-// ============================================================================
-
-/**
- * Execute summarize with configuration object.
- *
- * @param config - Summarize configuration
- * @returns Formatted output string
- */
-export const runSummarize = async (config: SummarizeConfig): Promise<string> => {
-  const { resultsPath, outputPath, markdown = false } = config
-
-  // Load results
-  const results = await loadResults(resultsPath)
-
-  // Format output
-  let output: string
-  if (markdown) {
-    output = results.map(formatMarkdown).join('\n')
-  } else {
-    output = results.map((r) => JSON.stringify(formatSummary(r))).join('\n')
-  }
-
-  // Write output
-  if (outputPath) {
-    await Bun.write(resolvePath(outputPath), output)
-  } else {
-    console.log(output)
-  }
-
-  return output
-}
-
-// ============================================================================
-// CLI Entry Point
-// ============================================================================
-
-/**
- * Summarize command CLI handler.
- *
- * @param args - Command line arguments (after 'summarize')
- */
-export const summarize = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      output: { type: 'string', short: 'o' },
-      markdown: { type: 'boolean', short: 'm', default: false },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness summarize <results.jsonl> [options]
-
-Arguments:
-  results.jsonl     Input file with capture results
-
-Options:
-  -o, --output      Output file (default: stdout)
-  -m, --markdown    Output as markdown instead of JSONL
-  -h, --help        Show this help message
-
-Output Formats:
-  JSONL (default): Compact summary with id, input, output, toolCalls, duration
-  Markdown (-m):   Human-readable format with step IDs for LLM-as-judge
-
-Examples:
-  # Summary JSONL for jq analysis
-  agent-eval-harness summarize results.jsonl -o summary.jsonl
-
-  # Markdown for LLM evaluation
-  agent-eval-harness summarize results.jsonl --markdown -o results.md
-`)
-    return
-  }
-
-  const resultsPath = positionals[0]
-  if (!resultsPath) {
-    console.error('Error: results.jsonl path is required')
-    process.exit(1)
-  }
-
-  await runSummarize({
-    resultsPath,
-    outputPath: values.output,
-    markdown: values.markdown ?? false,
-  })
-}
diff --git a/src/commands/tests/balance-helpers.spec.ts b/src/commands/tests/balance-helpers.spec.ts
deleted file mode 100644
index 6641c34..0000000
--- a/src/commands/tests/balance-helpers.spec.ts
+++ /dev/null
@@ -1,279 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-import type { CategoryDistribution, PromptCase } from '../../schemas.ts'
-import { analyzeCategories, findUnderrepresented, generateSuggestions } from '../balance.ts'
-
-// ============================================================================
-// analyzeCategories
-// ============================================================================
-
-describe('analyzeCategories', () => {
-  test('counts prompts by category', () => {
-    const prompts: PromptCase[] = [
-      { id: '1', input: 'test', metadata: { category: 'math' } },
-      { id: '2', input: 'test', metadata: { category: 'math' } },
-      { id: '3', input: 'test', metadata: { category: 'code' } },
-    ]
-
-    const result = analyzeCategories(prompts, 'category')
-
-    expect(result).toHaveLength(2)
-    const math = result.find((d) => d.name === 'math')
-    const code = result.find((d) => d.name === 'code')
-
-    expect(math?.count).toBe(2)
-    expect(code?.count).toBe(1)
-  })
-
-  test('calculates percentages correctly', () => {
-    const prompts: PromptCase[] = [
-      { id: '1', input: 'test', metadata: { category: 'a' } },
-      { id: '2', input: 'test', metadata: { category: 'a' } },
-      { id: '3', input: 'test', metadata: { category: 'b' } },
-      { id: '4', input: 'test', metadata: { category: 'b' } },
-    ]
-
-    const result = analyzeCategories(prompts, 'category')
-
-    expect(result[0]?.percentage).toBe(50)
-    expect(result[1]?.percentage).toBe(50)
-  })
-
-  test('sorts by count descending', () => {
-    const prompts: PromptCase[] = [
-      { id: '1', input: 'test', metadata: { category: 'small' } },
-      { id: '2', input: 'test', metadata: { category: 'large' } },
-      { id: '3', input: 'test', metadata: { category: 'large' } },
-      { id: '4', input: 'test', metadata: { category: 'large' } },
-      { id: '5', input: 'test', metadata: { category: 'medium' } },
-      { id: '6', input: 'test', metadata: { category: 'medium' } },
-    ]
-
-    const result = analyzeCategories(prompts, 'category')
-
-    expect(result[0]?.name).toBe('large')
-    expect(result[1]?.name).toBe('medium')
-    expect(result[2]?.name).toBe('small')
-  })
-
-  test('handles missing metadata as (uncategorized)', () => {
-    const prompts: PromptCase[] = [
-      { id: '1', input: 'test', metadata: { category: 'known' } },
-      { id: '2', input: 'test' }, // No metadata
-      { id: '3', input: 'test', metadata: {} }, // Empty metadata
-    ]
-
-    const result = analyzeCategories(prompts, 'category')
-
-    const uncategorized = result.find((d) => d.name === '(uncategorized)')
-    expect(uncategorized?.count).toBe(2)
-  })
-
-  test('handles different metadata keys', () => {
-    const prompts: PromptCase[] = [
-      { id: '1', input: 'test', metadata: { difficulty: 'easy', category: 'math' } },
-      { id: '2', input: 'test', metadata: { difficulty: 'hard', category: 'math' } },
-      { id: '3', input: 'test', metadata: { difficulty: 'easy', category: 'code' } },
-    ]
-
-    const byDifficulty = analyzeCategories(prompts, 'difficulty')
-    const byCategory = analyzeCategories(prompts, 'category')
-
-    expect(byDifficulty.find((d) => d.name === 'easy')?.count).toBe(2)
-    expect(byCategory.find((d) => d.name === 'math')?.count).toBe(2)
-  })
-
-  test('converts non-string metadata values to strings', () => {
-    const prompts: PromptCase[] = [
-      { id: '1', input: 'test', metadata: { level: 1 } },
-      { id: '2', input: 'test', metadata: { level: 1 } },
-      { id: '3', input: 'test', metadata: { level: 2 } },
-    ]
-
-    const result = analyzeCategories(prompts, 'level')
-
-    expect(result.find((d) => d.name === '1')?.count).toBe(2)
-    expect(result.find((d) => d.name === '2')?.count).toBe(1)
-  })
-
-  test('handles empty prompts array', () => {
-    const result = analyzeCategories([], 'category')
-    expect(result).toEqual([])
-  })
-
-  test('rounds percentages to integers', () => {
-    const prompts: PromptCase[] = [
-      { id: '1', input: 'test', metadata: { category: 'a' } },
-      { id: '2', input: 'test', metadata: { category: 'b' } },
-      { id: '3', input: 'test', metadata: { category: 'c' } },
-    ]
-
-    const result = analyzeCategories(prompts, 'category')
-
-    // 1/3 = 33.33... should round to 33
-    for (const dist of result) {
-      expect(Number.isInteger(dist.percentage)).toBe(true)
-    }
-  })
-})
-
-// ============================================================================
-// findUnderrepresented
-// ============================================================================
-
-describe('findUnderrepresented', () => {
-  test('identifies categories below threshold', () => {
-    const distributions: CategoryDistribution[] = [
-      { name: 'large', count: 50, percentage: 50 },
-      { name: 'medium', count: 30, percentage: 30 },
-      { name: 'small', count: 20, percentage: 20 },
-    ]
-
-    // Even distribution would be 33.3% each
-    // With 50% threshold, anything below 16.65% is underrepresented
-    const result = findUnderrepresented(distributions, 50)
-
-    // At 50% threshold, 20% is above 16.65%, so nothing should be underrepresented
-    expect(result).toEqual([])
-  })
-
-  test('returns underrepresented categories at stricter threshold', () => {
-    const distributions: CategoryDistribution[] = [
-      { name: 'large', count: 80, percentage: 80 },
-      { name: 'small', count: 20, percentage: 20 },
-    ]
-
-    // Even distribution would be 50% each
-    // With 50% threshold, anything below 25% is underrepresented
-    const result = findUnderrepresented(distributions, 50)
-
-    expect(result).toContain('small')
-    expect(result).not.toContain('large')
-  })
-
-  test('handles even distribution (no underrepresentation)', () => {
-    const distributions: CategoryDistribution[] = [
-      { name: 'a', count: 25, percentage: 25 },
-      { name: 'b', count: 25, percentage: 25 },
-      { name: 'c', count: 25, percentage: 25 },
-      { name: 'd', count: 25, percentage: 25 },
-    ]
-
-    const result = findUnderrepresented(distributions, 50)
-    expect(result).toEqual([])
-  })
-
-  test('handles single category (never underrepresented)', () => {
-    const distributions: CategoryDistribution[] = [{ name: 'only', count: 100, percentage: 100 }]
-
-    const result = findUnderrepresented(distributions, 50)
-    expect(result).toEqual([])
-  })
-
-  test('threshold affects sensitivity', () => {
-    const distributions: CategoryDistribution[] = [
-      { name: 'large', count: 70, percentage: 70 },
-      { name: 'small', count: 30, percentage: 30 },
-    ]
-
-    // Even = 50%, at 50% threshold: below 25% is underrepresented
-    const strict = findUnderrepresented(distributions, 50)
-    expect(strict).toEqual([])
-
-    // At 80% threshold: below 40% is underrepresented
-    const lenient = findUnderrepresented(distributions, 80)
-    expect(lenient).toContain('small')
-  })
-
-  test('handles empty distributions', () => {
-    const result = findUnderrepresented([], 50)
-    expect(result).toEqual([])
-  })
-})
-
-// ============================================================================
-// generateSuggestions
-// ============================================================================
-
-describe('generateSuggestions', () => {
-  test('suggests adding cases for underrepresented categories', () => {
-    const distributions: CategoryDistribution[] = [
-      { name: 'math', count: 80, percentage: 80 },
-      { name: 'code', count: 20, percentage: 20 },
-    ]
-    const underrepresented = ['code']
-
-    const suggestions = generateSuggestions(distributions, underrepresented, 100)
-
-    expect(suggestions.some((s) => s.includes('code'))).toBe(true)
-    expect(suggestions.some((s) => s.toLowerCase().includes('add'))).toBe(true)
-  })
-
-  test('warns about dominant category (>50%)', () => {
-    const distributions: CategoryDistribution[] = [
-      { name: 'dominant', count: 60, percentage: 60 },
-      { name: 'other', count: 40, percentage: 40 },
-    ]
-
-    const suggestions = generateSuggestions(distributions, [], 100)
-
-    expect(suggestions.some((s) => s.includes('dominant') && s.includes('60%'))).toBe(true)
-    expect(suggestions.some((s) => s.toLowerCase().includes('diversify'))).toBe(true)
-  })
-
-  test('warns about tiny categories (<3 cases)', () => {
-    const distributions: CategoryDistribution[] = [
-      { name: 'large', count: 97, percentage: 97 },
-      { name: 'tiny', count: 2, percentage: 2 },
-      { name: 'also_tiny', count: 1, percentage: 1 },
-    ]
-
-    const suggestions = generateSuggestions(distributions, [], 100)
-
-    expect(suggestions.some((s) => s.includes('tiny') || s.includes('also_tiny'))).toBe(true)
-    expect(suggestions.some((s) => s.includes('< 3 cases'))).toBe(true)
-  })
-
-  test('suggests expanding small test sets (<20 cases)', () => {
-    const distributions: CategoryDistribution[] = [
-      { name: 'a', count: 5, percentage: 50 },
-      { name: 'b', count: 5, percentage: 50 },
-    ]
-
-    const suggestions = generateSuggestions(distributions, [], 10)
-
-    expect(suggestions.some((s) => s.includes('10 cases') && s.toLowerCase().includes('expand'))).toBe(true)
-  })
-
-  test('returns "well-balanced" when no issues found', () => {
-    const distributions: CategoryDistribution[] = [
-      { name: 'a', count: 25, percentage: 25 },
-      { name: 'b', count: 25, percentage: 25 },
-      { name: 'c', count: 25, percentage: 25 },
-      { name: 'd', count: 25, percentage: 25 },
-    ]
-
-    const suggestions = generateSuggestions(distributions, [], 100)
-
-    expect(suggestions.some((s) => s.toLowerCase().includes('well-balanced'))).toBe(true)
-  })
-
-  test('combines multiple suggestions', () => {
-    const distributions: CategoryDistribution[] = [
-      { name: 'huge', count: 8, percentage: 80 },
-      { name: 'tiny', count: 2, percentage: 20 },
-    ]
-    const underrepresented = ['tiny']
-
-    const suggestions = generateSuggestions(distributions, underrepresented, 10)
-
-    // Should have multiple suggestions: underrepresented, dominant, tiny count, small test set
-    expect(suggestions.length).toBeGreaterThanOrEqual(2)
-  })
-
-  test('handles empty distributions', () => {
-    const suggestions = generateSuggestions([], [], 0)
-
-    // Should suggest expanding (0 cases)
-    expect(suggestions.some((s) => s.includes('0 cases'))).toBe(true)
-  })
-})
diff --git a/src/commands/tests/calibrate-helpers.spec.ts b/src/commands/tests/calibrate-helpers.spec.ts
deleted file mode 100644
index becdff1..0000000
--- a/src/commands/tests/calibrate-helpers.spec.ts
+++ /dev/null
@@ -1,226 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-import type { TrajectoryStep } from '../../schemas.ts'
-import { getTrajectorySnippet, sampleArray } from '../calibrate.ts'
-
-// ============================================================================
-// sampleArray
-// ============================================================================
-
-describe('sampleArray', () => {
-  test('returns n elements from array', () => {
-    const arr = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-    const result = sampleArray(arr, 3)
-
-    expect(result).toHaveLength(3)
-  })
-
-  test('returns all elements when n >= array length', () => {
-    const arr = [1, 2, 3]
-    const result = sampleArray(arr, 5)
-
-    expect(result).toHaveLength(3)
-    expect(new Set(result)).toEqual(new Set(arr))
-  })
-
-  test('returns empty array for empty input', () => {
-    const result = sampleArray([], 5)
-    expect(result).toEqual([])
-  })
-
-  test('returns empty array when n is 0', () => {
-    const arr = [1, 2, 3]
-    const result = sampleArray(arr, 0)
-
-    expect(result).toEqual([])
-  })
-
-  test('does not modify original array', () => {
-    const arr = [1, 2, 3, 4, 5]
-    const original = [...arr]
-    sampleArray(arr, 3)
-
-    expect(arr).toEqual(original)
-  })
-
-  test('returns unique elements (no duplicates)', () => {
-    const arr = [1, 2, 3, 4, 5]
-    const result = sampleArray(arr, 3)
-
-    const uniqueResult = new Set(result)
-    expect(uniqueResult.size).toBe(result.length)
-  })
-
-  test('all returned elements exist in original array', () => {
-    const arr = ['a', 'b', 'c', 'd', 'e']
-    const result = sampleArray(arr, 3)
-
-    for (const item of result) {
-      expect(arr).toContain(item)
-    }
-  })
-
-  test('works with objects', () => {
-    const arr = [{ id: 1 }, { id: 2 }, { id: 3 }, { id: 4 }]
-    const result = sampleArray(arr, 2)
-
-    expect(result).toHaveLength(2)
-    for (const item of result) {
-      expect(arr).toContainEqual(item)
-    }
-  })
-
-  test('produces different results on multiple calls (randomness)', () => {
-    const arr = Array.from({ length: 100 }, (_, i) => i)
-    const results = new Set<string>()
-
-    // Run multiple times and check we get different orderings
-    for (let i = 0; i < 10; i++) {
-      const sample = sampleArray(arr, 10)
-      results.add(sample.join(','))
-    }
-
-    // With 100 elements, sampling 10, we should get different results
-    // This is probabilistic but extremely unlikely to fail
-    expect(results.size).toBeGreaterThan(1)
-  })
-})
-
-// ============================================================================
-// getTrajectorySnippet
-// ============================================================================
-
-describe('getTrajectorySnippet', () => {
-  const createStep = (index: number): TrajectoryStep => ({
-    type: 'message',
-    content: `Step ${index}`,
-    timestamp: index * 100,
-  })
-
-  test('returns full trajectory when length <= maxSteps', () => {
-    const trajectory: TrajectoryStep[] = [createStep(1), createStep(2), createStep(3)]
-
-    const result = getTrajectorySnippet(trajectory, 5)
-
-    expect(result).toHaveLength(3)
-    expect(result).toEqual(trajectory)
-  })
-
-  test('returns maxSteps elements for longer trajectories', () => {
-    const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1))
-
-    const result = getTrajectorySnippet(trajectory, 5)
-
-    expect(result).toHaveLength(5)
-  })
-
-  test('includes first two steps', () => {
-    const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1))
-
-    const result = getTrajectorySnippet(trajectory, 5)
-
-    expect(result[0]).toEqual(createStep(1))
-    expect(result[1]).toEqual(createStep(2))
-  })
-
-  test('includes last two steps', () => {
-    const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1))
-
-    const result = getTrajectorySnippet(trajectory, 5)
-
-    expect(result[3]).toEqual(createStep(9))
-    expect(result[4]).toEqual(createStep(10))
-  })
-
-  test('includes middle step', () => {
-    const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1))
-
-    const result = getTrajectorySnippet(trajectory, 5)
-
-    // Middle of 10 is index 5 (0-indexed), which is Step 6
-    expect(result[2]).toEqual(createStep(6))
-  })
-
-  test('handles empty trajectory', () => {
-    const result = getTrajectorySnippet([], 5)
-    expect(result).toEqual([])
-  })
-
-  test('handles single element trajectory', () => {
-    const trajectory: TrajectoryStep[] = [createStep(1)]
-
-    const result = getTrajectorySnippet(trajectory, 5)
-
-    expect(result).toEqual([createStep(1)])
-  })
-
-  test('handles trajectory exactly at maxSteps boundary', () => {
-    const trajectory: TrajectoryStep[] = Array.from({ length: 5 }, (_, i) => createStep(i + 1))
-
-    const result = getTrajectorySnippet(trajectory, 5)
-
-    expect(result).toHaveLength(5)
-    expect(result).toEqual(trajectory)
-  })
-
-  test('respects custom maxSteps parameter', () => {
-    const trajectory: TrajectoryStep[] = Array.from({ length: 20 }, (_, i) => createStep(i + 1))
-
-    const result3 = getTrajectorySnippet(trajectory, 3)
-    const result7 = getTrajectorySnippet(trajectory, 7)
-
-    // With maxSteps=3, should still return 5 (first 2 + middle + last 2)
-    // because the algorithm always takes first 2, middle 1, last 2
-    // But the function returns full trajectory if <= maxSteps
-    expect(result3.length).toBeLessThanOrEqual(trajectory.length)
-    expect(result7.length).toBeLessThanOrEqual(trajectory.length)
-  })
-
-  test('works with different step types', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'thought', content: 'Thinking...', timestamp: 0 },
-      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 100 },
-      { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 200 },
-      { type: 'tool_call', name: 'Bash', status: 'completed', timestamp: 300 },
-      { type: 'tool_call', name: 'Grep', status: 'completed', timestamp: 400 },
-      { type: 'tool_call', name: 'Glob', status: 'completed', timestamp: 500 },
-      { type: 'plan', entries: [{ content: 'Plan', status: 'done' }], timestamp: 600 },
-      { type: 'message', content: 'Done!', timestamp: 700 },
-    ]
-
-    const result = getTrajectorySnippet(trajectory, 5)
-
-    expect(result).toHaveLength(5)
-    // First two
-    expect(result[0]?.type).toBe('thought')
-    expect(result[1]?.type).toBe('tool_call')
-    // Last two
-    expect(result[3]?.type).toBe('plan')
-    expect(result[4]?.type).toBe('message')
-  })
-
-  test('preserves step content when extracting', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'thought', content: 'First thought', timestamp: 0 },
-      { type: 'message', content: 'First message', timestamp: 100 },
-      { type: 'tool_call', name: 'Read', status: 'completed', input: { file_path: '/test.ts' }, timestamp: 200 },
-      { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 300 },
-      { type: 'tool_call', name: 'Bash', status: 'failed', timestamp: 400 },
-      { type: 'message', content: 'Last message', timestamp: 500 },
-    ]
-
-    const result = getTrajectorySnippet(trajectory, 5)
-
-    // First step should preserve all properties
-    const firstStep = result[0]
-    if (firstStep?.type === 'thought') {
-      expect(firstStep.content).toBe('First thought')
-      expect(firstStep.timestamp).toBe(0)
-    }
-
-    // Last step should preserve all properties
-    const lastStep = result[result.length - 1]
-    if (lastStep?.type === 'message') {
-      expect(lastStep.content).toBe('Last message')
-    }
-  })
-})
diff --git a/src/commands/tests/capture-cli.spec.ts b/src/commands/tests/capture-cli.spec.ts
deleted file mode 100644
index 2e6e76d..0000000
--- a/src/commands/tests/capture-cli.spec.ts
+++ /dev/null
@@ -1,274 +0,0 @@
-import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
-import type { CaptureConfig } from '../capture.ts'
-import { loadPrompts } from '../capture.ts'
-
-// ============================================================================
-// loadPrompts
-// ============================================================================
-
-describe('loadPrompts', () => {
-  const testPromptFile = '/tmp/agent-eval-harness-test-prompts.jsonl'
-
-  beforeEach(async () => {
-    await Bun.$`rm -f ${testPromptFile}`.nothrow()
-  })
-
-  afterEach(async () => {
-    await Bun.$`rm -f ${testPromptFile}`.nothrow()
-  })
-
-  test('loads single-turn prompts', async () => {
-    await Bun.write(
-      testPromptFile,
-      `{"id": "t1", "input": "Hello"}
-{"id": "t2", "input": "World"}`,
-    )
-
-    const prompts = await loadPrompts(testPromptFile)
-
-    expect(prompts).toHaveLength(2)
-    expect(prompts[0]?.id).toBe('t1')
-    expect(prompts[0]?.input).toBe('Hello')
-    expect(prompts[1]?.id).toBe('t2')
-    expect(prompts[1]?.input).toBe('World')
-  })
-
-  test('loads multi-turn prompts', async () => {
-    await Bun.write(testPromptFile, `{"id": "conv1", "input": ["Hi", "How are you?", "Bye"]}`)
-
-    const prompts = await loadPrompts(testPromptFile)
-
-    expect(prompts).toHaveLength(1)
-    expect(prompts[0]?.id).toBe('conv1')
-    expect(Array.isArray(prompts[0]?.input)).toBe(true)
-    expect(prompts[0]?.input).toEqual(['Hi', 'How are you?', 'Bye'])
-  })
-
-  test('loads prompts with hint field', async () => {
-    await Bun.write(testPromptFile, `{"id": "t1", "input": "2+2?", "hint": "4"}`)
-
-    const prompts = await loadPrompts(testPromptFile)
-
-    expect(prompts).toHaveLength(1)
-    expect(prompts[0]?.hint).toBe('4')
-  })
-
-  test('loads prompts with metadata', async () => {
-    await Bun.write(
-      testPromptFile,
-      `{"id": "t1", "input": "Test", "metadata": {"category": "math", "difficulty": "easy"}}`,
-    )
-
-    const prompts = await loadPrompts(testPromptFile)
-
-    expect(prompts).toHaveLength(1)
-    expect(prompts[0]?.metadata).toEqual({ category: 'math', difficulty: 'easy' })
-  })
-
-  test('loads prompts with timeout override', async () => {
-    await Bun.write(testPromptFile, `{"id": "t1", "input": "Slow task", "timeout": 120000}`)
-
-    const prompts = await loadPrompts(testPromptFile)
-
-    expect(prompts).toHaveLength(1)
-    expect(prompts[0]?.timeout).toBe(120000)
-  })
-
-  test('skips empty lines', async () => {
-    await Bun.write(
-      testPromptFile,
-      `{"id": "t1", "input": "First"}
-
-{"id": "t2", "input": "Second"}
-`,
-    )
-
-    const prompts = await loadPrompts(testPromptFile)
-
-    expect(prompts).toHaveLength(2)
-  })
-
-  test('throws on invalid JSON', async () => {
-    await Bun.write(testPromptFile, 'not valid json')
-
-    await expect(loadPrompts(testPromptFile)).rejects.toThrow()
-  })
-
-  test('throws on missing required fields', async () => {
-    await Bun.write(testPromptFile, `{"id": "t1"}`) // missing input
-
-    await expect(loadPrompts(testPromptFile)).rejects.toThrow()
-  })
-})
-
-// ============================================================================
-// runCapture configuration
-// ============================================================================
-
-describe('runCapture configuration', () => {
-  test('CaptureConfig type accepts valid configuration', () => {
-    // Type-level test - if this compiles, the types are correct
-    const config: CaptureConfig = {
-      promptsPath: '/tmp/prompts.jsonl',
-      schemaPath: './schemas/claude-headless.json',
-      outputPath: '/tmp/output.jsonl',
-      cwd: '/tmp',
-      timeout: 30000,
-      progress: true,
-      append: false,
-      debug: false,
-      concurrency: 4,
-      workspaceDir: '/tmp/workspaces',
-    }
-
-    expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
-    expect(config.schemaPath).toBe('./schemas/claude-headless.json')
-    expect(config.concurrency).toBe(4)
-    expect(config.workspaceDir).toBe('/tmp/workspaces')
-  })
-
-  test('CaptureConfig allows minimal configuration', () => {
-    const config: CaptureConfig = {
-      promptsPath: '/tmp/prompts.jsonl',
-      schemaPath: './test-schema.json',
-    }
-
-    expect(config.outputPath).toBeUndefined()
-    expect(config.cwd).toBeUndefined()
-    expect(config.timeout).toBeUndefined()
-    expect(config.progress).toBeUndefined()
-    expect(config.append).toBeUndefined()
-    expect(config.grader).toBeUndefined()
-    expect(config.concurrency).toBeUndefined()
-    expect(config.workspaceDir).toBeUndefined()
-  })
-
-  test('CaptureConfig accepts prompts without promptsPath', () => {
-    const config: CaptureConfig = {
-      schemaPath: './test-schema.json',
-      prompts: [{ id: 't1', input: 'hello' }],
-    }
-
-    expect(config.promptsPath).toBeUndefined()
-    expect(config.prompts).toHaveLength(1)
-  })
-})
-
-// ============================================================================
-// CLI Help Output
-// ============================================================================
-
-describe('capture CLI', () => {
-  test('displays help with --help flag', async () => {
-    const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture', '--help'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-
-    const stdout = await new Response(proc.stdout).text()
-    await proc.exited
-
-    expect(stdout).toContain('Usage: agent-eval-harness capture')
-    expect(stdout).toContain('prompts.jsonl')
-    expect(stdout).toContain('-o, --output')
-    expect(stdout).toContain('-c, --cwd')
-    expect(stdout).toContain('-t, --timeout')
-    expect(stdout).toContain('--progress')
-    expect(stdout).toContain('-g, --grader')
-    expect(stdout).toContain('-s, --schema')
-    expect(stdout).toContain('-j, --concurrency')
-    expect(stdout).toContain('--workspace-dir')
-    expect(stdout).toContain('--stdin')
-  })
-
-  test('shows error for --stdin with positional file', async () => {
-    const proc = Bun.spawn(
-      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
-      {
-        stdout: 'pipe',
-        stderr: 'pipe',
-      },
-    )
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
-  })
-
-  test('shows error for missing prompts file argument', async () => {
-    const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('prompts.jsonl path is required')
-  })
-
-  test('shows error for missing schema argument', async () => {
-    const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('--schema is required')
-  })
-
-  test('shows error for invalid concurrency value', async () => {
-    const proc = Bun.spawn(
-      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
-      {
-        stdout: 'pipe',
-        stderr: 'pipe',
-      },
-    )
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('--concurrency must be a positive integer')
-  })
-
-  test('shows error for zero concurrency', async () => {
-    const proc = Bun.spawn(
-      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
-      {
-        stdout: 'pipe',
-        stderr: 'pipe',
-      },
-    )
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('--concurrency must be a positive integer')
-  })
-
-  test('shows error for negative concurrency', async () => {
-    // Note: Using --concurrency=-1 format because -j -1 is ambiguous to parseArgs
-    const proc = Bun.spawn(
-      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '--concurrency=-1'],
-      {
-        stdout: 'pipe',
-        stderr: 'pipe',
-      },
-    )
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('--concurrency must be a positive integer')
-  })
-})
diff --git a/src/commands/tests/capture-helpers.spec.ts b/src/commands/tests/capture-helpers.spec.ts
deleted file mode 100644
index b772291..0000000
--- a/src/commands/tests/capture-helpers.spec.ts
+++ /dev/null
@@ -1,634 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-import type { ParsedUpdate } from '../../headless/headless-output-parser.ts'
-import type { TrajectoryStep } from '../../schemas.ts'
-import {
-  detectTrajectoryRichness,
-  extractContent,
-  extractFilePath,
-  extractOutput,
-  extractTrajectory,
-  hasToolErrors,
-  headTailPreview,
-  loadPrompts,
-} from '../capture.ts'
-
-// ============================================================================
-// loadPrompts
-// ============================================================================
-
-describe('loadPrompts', () => {
-  test('parses valid JSONL file with string input', async () => {
-    // Create a temporary test file
-    const testPath = '/tmp/test-prompts-valid.jsonl'
-    await Bun.write(
-      testPath,
-      `{"id": "test-1", "input": "What is 2+2?"}
-{"id": "test-2", "input": "Hello world", "hint": "greeting"}`,
-    )
-
-    const prompts = await loadPrompts(testPath)
-
-    expect(prompts).toHaveLength(2)
-    expect(prompts[0]?.id).toBe('test-1')
-    expect(prompts[0]?.input).toBe('What is 2+2?')
-    expect(prompts[1]?.id).toBe('test-2')
-    expect(prompts[1]?.hint).toBe('greeting')
-  })
-
-  test('parses multi-turn input (string array)', async () => {
-    const testPath = '/tmp/test-prompts-multiturn.jsonl'
-    await Bun.write(testPath, `{"id": "test-1", "input": ["Hello", "How are you?", "Goodbye"], "hint": "farewell"}`)
-
-    const prompts = await loadPrompts(testPath)
-
-    expect(prompts).toHaveLength(1)
-    expect(prompts[0]?.id).toBe('test-1')
-    expect(Array.isArray(prompts[0]?.input)).toBe(true)
-    expect(prompts[0]?.input).toEqual(['Hello', 'How are you?', 'Goodbye'])
-    expect(prompts[0]?.hint).toBe('farewell')
-  })
-
-  test('parses prompts with metadata', async () => {
-    const testPath = '/tmp/test-prompts-metadata.jsonl'
-    await Bun.write(
-      testPath,
-      `{"id": "test-1", "input": "Test", "metadata": {"category": "math", "difficulty": "easy"}}`,
-    )
-
-    const prompts = await loadPrompts(testPath)
-
-    expect(prompts).toHaveLength(1)
-    expect(prompts[0]?.metadata?.category).toBe('math')
-    expect(prompts[0]?.metadata?.difficulty).toBe('easy')
-  })
-
-  test('throws on invalid JSON at specific line', async () => {
-    const testPath = '/tmp/test-prompts-invalid.jsonl'
-    await Bun.write(
-      testPath,
-      `{"id": "test-1", "input": "Valid"}
-{invalid json here}
-{"id": "test-3", "input": "Also valid"}`,
-    )
-
-    await expect(loadPrompts(testPath)).rejects.toThrow('Invalid prompt at line 2')
-  })
-
-  test('throws on missing required fields', async () => {
-    const testPath = '/tmp/test-prompts-missing.jsonl'
-    await Bun.write(testPath, `{"id": "test-1"}`)
-
-    await expect(loadPrompts(testPath)).rejects.toThrow('Invalid prompt at line 1')
-  })
-
-  test('handles empty lines gracefully', async () => {
-    const testPath = '/tmp/test-prompts-empty-lines.jsonl'
-    await Bun.write(
-      testPath,
-      `{"id": "test-1", "input": "First"}
-
-{"id": "test-2", "input": "Second"}
-`,
-    )
-
-    const prompts = await loadPrompts(testPath)
-    expect(prompts).toHaveLength(2)
-  })
-})
-
-// ============================================================================
-// extractTrajectory
-// ============================================================================
-
-describe('extractTrajectory', () => {
-  const baseTime = 0
-
-  test('extracts thoughts from thought type updates', () => {
-    const updates: ParsedUpdate[] = [
-      {
-        type: 'thought',
-        content: 'Let me think about this...',
-        timestamp: 100,
-        raw: { type: 'thought', text: 'Let me think about this...' },
-      },
-    ]
-
-    const trajectory = extractTrajectory(updates, baseTime)
-
-    expect(trajectory).toHaveLength(1)
-    expect(trajectory[0]?.type).toBe('thought')
-    const step = trajectory[0]!
-    expect(step.type === 'thought' && step.content).toBe('Let me think about this...')
-  })
-
-  test('extracts messages from message type updates', () => {
-    const updates: ParsedUpdate[] = [
-      {
-        type: 'message',
-        content: 'Here is my answer.',
-        timestamp: 200,
-        raw: { type: 'message', text: 'Here is my answer.' },
-      },
-    ]
-
-    const trajectory = extractTrajectory(updates, baseTime)
-
-    expect(trajectory).toHaveLength(1)
-    expect(trajectory[0]?.type).toBe('message')
-    const step = trajectory[0]!
-    expect(step.type === 'message' && step.content).toBe('Here is my answer.')
-  })
-
-  test('extracts tool calls with title and status', () => {
-    const updates: ParsedUpdate[] = [
-      {
-        type: 'tool_call',
-        title: 'Read',
-        status: 'pending',
-        timestamp: 300,
-        raw: { tool: 'Read', input: { file_path: '/test.ts' } },
-      },
-    ]
-
-    const trajectory = extractTrajectory(updates, baseTime)
-
-    expect(trajectory).toHaveLength(1)
-    expect(trajectory[0]?.type).toBe('tool_call')
-    const step = trajectory[0]!
-    expect(step.type === 'tool_call' && step.name).toBe('Read')
-    expect(step.type === 'tool_call' && step.status).toBe('pending')
-  })
-
-  test('extracts plan type updates', () => {
-    const updates: ParsedUpdate[] = [
-      {
-        type: 'plan',
-        timestamp: 400,
-        raw: {
-          entries: [
-            { content: 'Step 1', status: 'completed' },
-            { content: 'Step 2', status: 'in_progress' },
-          ],
-        },
-      },
-    ]
-
-    const trajectory = extractTrajectory(updates, baseTime)
-
-    expect(trajectory).toHaveLength(1)
-    expect(trajectory[0]?.type).toBe('plan')
-    // Note: extractTrajectory creates plan entries from the update type
-    // but doesn't extract entries from raw (they are captured via output parser mappings)
-    const step = trajectory[0]!
-    expect(step.type === 'plan').toBe(true)
-  })
-
-  test('handles empty updates', () => {
-    const trajectory = extractTrajectory([], baseTime)
-    expect(trajectory).toEqual([])
-  })
-
-  test('assigns timestamps relative to start time', () => {
-    const startTime = 1000
-    const updates: ParsedUpdate[] = [
-      {
-        type: 'message',
-        content: 'First',
-        timestamp: 1500,
-        raw: { type: 'message', text: 'First' },
-      },
-      {
-        type: 'message',
-        content: 'Second',
-        timestamp: 2000,
-        raw: { type: 'message', text: 'Second' },
-      },
-    ]
-
-    const trajectory = extractTrajectory(updates, startTime)
-
-    expect(trajectory[0]?.timestamp).toBe(500)
-    expect(trajectory[1]?.timestamp).toBe(1000)
-  })
-
-  test('handles updates without content for message/thought types', () => {
-    const updates: ParsedUpdate[] = [
-      {
-        type: 'message',
-        content: undefined, // No content - will have empty string
-        timestamp: 100,
-        raw: { type: 'message' },
-      },
-      {
-        type: 'message',
-        content: 'Has content',
-        timestamp: 200,
-        raw: { type: 'message', text: 'Has content' },
-      },
-    ]
-
-    const trajectory = extractTrajectory(updates, baseTime)
-
-    // Both messages are included - ones without content get empty string
-    expect(trajectory).toHaveLength(2)
-    expect(trajectory[0]?.type).toBe('message')
-    expect(trajectory[1]?.type).toBe('message')
-  })
-
-  test('attaches input to new tool call from update', () => {
-    const updates: ParsedUpdate[] = [
-      {
-        type: 'tool_call',
-        title: 'Read',
-        status: 'pending',
-        input: { file_path: '/src/main.ts' },
-        timestamp: 500,
-        raw: {},
-      },
-    ]
-
-    const trajectory = extractTrajectory(updates, baseTime)
-
-    expect(trajectory).toHaveLength(1)
-    const step = trajectory[0]!
-    expect(step.type === 'tool_call' && step.input).toEqual({ file_path: '/src/main.ts' })
-  })
-
-  test('attaches output to tool call on completion', () => {
-    const updates: ParsedUpdate[] = [
-      {
-        type: 'tool_call',
-        title: 'Read',
-        status: 'pending',
-        input: { file_path: '/src/main.ts' },
-        timestamp: 500,
-        raw: {},
-      },
-      {
-        type: 'tool_call',
-        title: 'Read',
-        status: 'completed',
-        output: 'file contents here',
-        timestamp: 800,
-        raw: {},
-      },
-    ]
-
-    const trajectory = extractTrajectory(updates, baseTime)
-
-    expect(trajectory).toHaveLength(1)
-    const step = trajectory[0]!
-    expect(step.type).toBe('tool_call')
-    if (step.type === 'tool_call') {
-      expect(step.input).toEqual({ file_path: '/src/main.ts' })
-      expect(step.output).toBe('file contents here')
-      expect(step.status).toBe('completed')
-      expect(step.duration).toBe(300)
-    }
-  })
-
-  test('handles sequential same-named tool calls independently', () => {
-    const updates: ParsedUpdate[] = [
-      // First Read: pending → completed
-      {
-        type: 'tool_call',
-        title: 'Read',
-        status: 'pending',
-        input: { file_path: '/src/a.ts' },
-        timestamp: 100,
-        raw: {},
-      },
-      {
-        type: 'tool_call',
-        title: 'Read',
-        status: 'completed',
-        output: 'contents of a.ts',
-        timestamp: 300,
-        raw: {},
-      },
-      // Second Read: pending → completed (same tool name, different args)
-      {
-        type: 'tool_call',
-        title: 'Read',
-        status: 'pending',
-        input: { file_path: '/src/b.ts' },
-        timestamp: 500,
-        raw: {},
-      },
-      {
-        type: 'tool_call',
-        title: 'Read',
-        status: 'completed',
-        output: 'contents of b.ts',
-        timestamp: 700,
-        raw: {},
-      },
-    ]
-
-    const trajectory = extractTrajectory(updates, baseTime)
-
-    // Both calls should appear as separate trajectory steps
-    const toolCalls = trajectory.filter((s) => s.type === 'tool_call')
-    expect(toolCalls).toHaveLength(2)
-
-    const first = toolCalls[0]!
-    expect(first.type === 'tool_call' && first.input).toEqual({ file_path: '/src/a.ts' })
-    expect(first.type === 'tool_call' && first.output).toBe('contents of a.ts')
-    expect(first.type === 'tool_call' && first.status).toBe('completed')
-
-    const second = toolCalls[1]!
-    expect(second.type === 'tool_call' && second.input).toEqual({ file_path: '/src/b.ts' })
-    expect(second.type === 'tool_call' && second.output).toBe('contents of b.ts')
-    expect(second.type === 'tool_call' && second.status).toBe('completed')
-  })
-})
-
-// ============================================================================
-// extractOutput
-// ============================================================================
-
-describe('extractOutput', () => {
-  test('joins message contents with newlines', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'message', content: 'First line', timestamp: 0 },
-      { type: 'message', content: 'Second line', timestamp: 100 },
-    ]
-
-    expect(extractOutput(trajectory)).toBe('First line\nSecond line')
-  })
-
-  test('filters out non-message steps', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'thought', content: 'Thinking...', timestamp: 0 },
-      { type: 'message', content: 'Answer', timestamp: 100 },
-      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 200 },
-      { type: 'message', content: 'Done', timestamp: 300 },
-    ]
-
-    expect(extractOutput(trajectory)).toBe('Answer\nDone')
-  })
-
-  test('returns empty string for empty trajectory', () => {
-    expect(extractOutput([])).toBe('')
-  })
-
-  test('returns empty string when no messages', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'thought', content: 'Just thinking', timestamp: 0 },
-      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 100 },
-    ]
-
-    expect(extractOutput(trajectory)).toBe('')
-  })
-
-  test('handles single message', () => {
-    const trajectory: TrajectoryStep[] = [{ type: 'message', content: 'Only message', timestamp: 0 }]
-
-    expect(extractOutput(trajectory)).toBe('Only message')
-  })
-})
-
-// ============================================================================
-// hasToolErrors
-// ============================================================================
-
-describe('hasToolErrors', () => {
-  test('returns false when no tool calls', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'thought', content: 'Thinking', timestamp: 0 },
-      { type: 'message', content: 'Done', timestamp: 100 },
-    ]
-
-    expect(hasToolErrors(trajectory)).toBe(false)
-  })
-
-  test('returns false when all tool calls succeeded', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 },
-      { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 100 },
-    ]
-
-    expect(hasToolErrors(trajectory)).toBe(false)
-  })
-
-  test('returns true when any tool call failed', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 },
-      { type: 'tool_call', name: 'Write', status: 'failed', timestamp: 100 },
-      { type: 'tool_call', name: 'Bash', status: 'completed', timestamp: 200 },
-    ]
-
-    expect(hasToolErrors(trajectory)).toBe(true)
-  })
-
-  test('returns false for empty trajectory', () => {
-    expect(hasToolErrors([])).toBe(false)
-  })
-
-  test('returns true when only tool call failed', () => {
-    const trajectory: TrajectoryStep[] = [{ type: 'tool_call', name: 'Bash', status: 'failed', timestamp: 0 }]
-
-    expect(hasToolErrors(trajectory)).toBe(true)
-  })
-})
-
-// ============================================================================
-// headTailPreview
-// ============================================================================
-
-describe('headTailPreview', () => {
-  test('returns full content when under limit', () => {
-    const content = 'line1\nline2\nline3'
-    expect(headTailPreview(content, 5, 5)).toBe(content)
-  })
-
-  test('truncates with omitted count for long content', () => {
-    const lines = Array.from({ length: 20 }, (_, i) => `line${i + 1}`)
-    const content = lines.join('\n')
-
-    const result = headTailPreview(content, 3, 3)
-
-    expect(result).toContain('line1')
-    expect(result).toContain('line2')
-    expect(result).toContain('line3')
-    expect(result).toContain('line18')
-    expect(result).toContain('line19')
-    expect(result).toContain('line20')
-    expect(result).toContain('14 lines omitted')
-  })
-
-  test('respects custom head line count', () => {
-    const lines = Array.from({ length: 10 }, (_, i) => `line${i + 1}`)
-    const content = lines.join('\n')
-
-    const result = headTailPreview(content, 2, 2)
-
-    expect(result).toContain('line1')
-    expect(result).toContain('line2')
-    expect(result).not.toContain('line3')
-    expect(result).toContain('6 lines omitted')
-  })
-
-  test('respects custom tail line count', () => {
-    const lines = Array.from({ length: 10 }, (_, i) => `line${i + 1}`)
-    const content = lines.join('\n')
-
-    const result = headTailPreview(content, 1, 4)
-
-    expect(result).toContain('line1')
-    expect(result).toContain('line7')
-    expect(result).toContain('line10')
-    expect(result).toContain('5 lines omitted')
-  })
-
-  test('handles content exactly at boundary', () => {
-    const content = 'line1\nline2\nline3\nline4\nline5\nline6'
-    // 6 lines, head=3, tail=3 means no truncation needed
-    expect(headTailPreview(content, 3, 3)).toBe(content)
-  })
-
-  test('handles single line content', () => {
-    const content = 'single line'
-    expect(headTailPreview(content, 3, 3)).toBe(content)
-  })
-
-  test('handles empty content', () => {
-    expect(headTailPreview('', 3, 3)).toBe('')
-  })
-})
-
-// ============================================================================
-// extractFilePath
-// ============================================================================
-
-describe('extractFilePath', () => {
-  test('extracts file_path field', () => {
-    const input = { file_path: '/path/to/file.ts' }
-    expect(extractFilePath(input)).toBe('/path/to/file.ts')
-  })
-
-  test('extracts path field as fallback', () => {
-    const input = { path: '/another/path.js' }
-    expect(extractFilePath(input)).toBe('/another/path.js')
-  })
-
-  test('prefers file_path over path', () => {
-    const input = { file_path: '/preferred.ts', path: '/fallback.ts' }
-    expect(extractFilePath(input)).toBe('/preferred.ts')
-  })
-
-  test('returns undefined for invalid input', () => {
-    expect(extractFilePath(null)).toBeUndefined()
-    expect(extractFilePath(undefined)).toBeUndefined()
-    expect(extractFilePath('string')).toBeUndefined()
-    expect(extractFilePath(123)).toBeUndefined()
-  })
-
-  test('returns undefined when no path fields present', () => {
-    const input = { content: 'some content' }
-    expect(extractFilePath(input)).toBeUndefined()
-  })
-
-  test('handles empty object', () => {
-    expect(extractFilePath({})).toBeUndefined()
-  })
-})
-
-// ============================================================================
-// extractContent
-// ============================================================================
-
-describe('extractContent', () => {
-  test('extracts content field', () => {
-    const input = { content: 'const x = 1;' }
-    expect(extractContent(input)).toBe('const x = 1;')
-  })
-
-  test('extracts new_string field as fallback', () => {
-    const input = { new_string: 'const y = 2;' }
-    expect(extractContent(input)).toBe('const y = 2;')
-  })
-
-  test('prefers content over new_string', () => {
-    const input = { content: 'preferred', new_string: 'fallback' }
-    expect(extractContent(input)).toBe('preferred')
-  })
-
-  test('returns undefined for invalid input', () => {
-    expect(extractContent(null)).toBeUndefined()
-    expect(extractContent(undefined)).toBeUndefined()
-    expect(extractContent('string')).toBeUndefined()
-    expect(extractContent(123)).toBeUndefined()
-  })
-
-  test('returns undefined when no content fields present', () => {
-    const input = { file_path: '/some/path.ts' }
-    expect(extractContent(input)).toBeUndefined()
-  })
-
-  test('handles empty object', () => {
-    expect(extractContent({})).toBeUndefined()
-  })
-
-  test('handles multiline content', () => {
-    const input = { content: 'line1\nline2\nline3' }
-    expect(extractContent(input)).toBe('line1\nline2\nline3')
-  })
-})
-
-// ============================================================================
-// detectTrajectoryRichness
-// ============================================================================
-
-describe('detectTrajectoryRichness', () => {
-  test('returns "full" when trajectory has thoughts', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'thought', content: 'Let me think...', timestamp: 0 },
-      { type: 'message', content: 'Answer', timestamp: 100 },
-    ]
-
-    expect(detectTrajectoryRichness(trajectory)).toBe('full')
-  })
-
-  test('returns "full" when trajectory has tool calls', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 },
-      { type: 'message', content: 'Answer', timestamp: 100 },
-    ]
-
-    expect(detectTrajectoryRichness(trajectory)).toBe('full')
-  })
-
-  test('returns "full" when trajectory has plans', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'plan', entries: [{ content: 'Step 1', status: 'completed' }], timestamp: 0 },
-      { type: 'message', content: 'Answer', timestamp: 100 },
-    ]
-
-    expect(detectTrajectoryRichness(trajectory)).toBe('full')
-  })
-
-  test('returns "messages-only" when trajectory only has messages', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'message', content: 'First', timestamp: 0 },
-      { type: 'message', content: 'Second', timestamp: 100 },
-    ]
-
-    expect(detectTrajectoryRichness(trajectory)).toBe('messages-only')
-  })
-
-  test('returns "minimal" when trajectory is empty', () => {
-    expect(detectTrajectoryRichness([])).toBe('minimal')
-  })
-
-  test('returns "full" when trajectory has mixed rich content', () => {
-    const trajectory: TrajectoryStep[] = [
-      { type: 'thought', content: 'Thinking...', timestamp: 0 },
-      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 50 },
-      { type: 'plan', entries: [], timestamp: 100 },
-      { type: 'message', content: 'Done', timestamp: 150 },
-    ]
-
-    expect(detectTrajectoryRichness(trajectory)).toBe('full')
-  })
-})
diff --git a/src/commands/tests/summarize-helpers.spec.ts b/src/commands/tests/summarize-helpers.spec.ts
deleted file mode 100644
index 9df86d1..0000000
--- a/src/commands/tests/summarize-helpers.spec.ts
+++ /dev/null
@@ -1,339 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-import type { CaptureResult } from '../../schemas.ts'
-import { formatMarkdown, formatSummary } from '../summarize.ts'
-
-// ============================================================================
-// Test Fixtures
-// ============================================================================
-
-const createBasicResult = (overrides?: Partial<CaptureResult>): CaptureResult => ({
-  id: 'test-001',
-  input: 'What is 2+2?',
-  output: 'The answer is 4.',
-  trajectory: [
-    { type: 'thought', content: 'Let me think about this...', timestamp: 0 },
-    { type: 'message', content: 'The answer is 4.', timestamp: 100 },
-  ],
-  metadata: { category: 'math', agent: 'test-agent' },
-  timing: { start: 1000, end: 2000, sessionCreation: 0, total: 1000 },
-  toolErrors: false,
-  ...overrides,
-})
-
-const createResultWithToolCalls = (): CaptureResult => ({
-  id: 'test-002',
-  input: 'Read and summarize file.txt',
-  output: 'File contains important data.',
-  trajectory: [
-    { type: 'thought', content: 'I will read the file first.', timestamp: 0 },
-    {
-      type: 'tool_call',
-      name: 'Read',
-      status: 'completed',
-      input: { file_path: '/path/to/file.txt' },
-      output: 'file contents here',
-      duration: 50,
-      timestamp: 100,
-    },
-    {
-      type: 'tool_call',
-      name: 'Write',
-      status: 'completed',
-      input: { file_path: '/output.md', content: 'Summary here' },
-      duration: 30,
-      timestamp: 200,
-    },
-    { type: 'message', content: 'File contains important data.', timestamp: 300 },
-  ],
-  metadata: { agent: 'test-agent' },
-  timing: { start: 1000, end: 1500, sessionCreation: 0, total: 500 },
-  toolErrors: false,
-})
-
-// ============================================================================
-// formatSummary
-// ============================================================================
-
-describe('formatSummary', () => {
-  test('extracts id, input, and output', () => {
-    const result = createBasicResult()
-    const summary = formatSummary(result)
-
-    expect(summary.id).toBe('test-001')
-    expect(summary.input).toBe('What is 2+2?')
-    expect(summary.output).toBe('The answer is 4.')
-  })
-
-  test('extracts tool call names', () => {
-    const result = createResultWithToolCalls()
-    const summary = formatSummary(result)
-
-    expect(summary.toolCalls).toEqual(['Read', 'Write'])
-  })
-
-  test('calculates duration from timing', () => {
-    const result = createBasicResult()
-    const summary = formatSummary(result)
-
-    expect(summary.duration).toBe(1000) // 2000 - 1000
-  })
-
-  test('handles empty trajectory', () => {
-    const result = createBasicResult({ trajectory: [] })
-    const summary = formatSummary(result)
-
-    expect(summary.toolCalls).toEqual([])
-  })
-
-  test('filters only tool_call steps for toolCalls list', () => {
-    const result = createBasicResult()
-    const summary = formatSummary(result)
-
-    // trajectory has thought and message, but no tool_call
-    expect(summary.toolCalls).toEqual([])
-  })
-
-  test('handles trajectory with only messages', () => {
-    const result = createBasicResult({
-      trajectory: [
-        { type: 'message', content: 'First message', timestamp: 0 },
-        { type: 'message', content: 'Second message', timestamp: 100 },
-      ],
-    })
-    const summary = formatSummary(result)
-
-    expect(summary.toolCalls).toEqual([])
-  })
-
-  test('preserves original input/output exactly', () => {
-    const result = createBasicResult({
-      input: 'Input with\nnewlines and "quotes"',
-      output: 'Output with\ttabs',
-    })
-    const summary = formatSummary(result)
-
-    expect(summary.input).toBe('Input with\nnewlines and "quotes"')
-    expect(summary.output).toBe('Output with\ttabs')
-  })
-})
-
-// ============================================================================
-// formatMarkdown
-// ============================================================================
-
-describe('formatMarkdown', () => {
-  test('includes evaluation record header with id', () => {
-    const result = createBasicResult()
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('## Evaluation Record: test-001')
-  })
-
-  test('includes input field', () => {
-    const result = createBasicResult()
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('**Input:** What is 2+2?')
-  })
-
-  test('includes trajectory section', () => {
-    const result = createBasicResult()
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('**Trajectory:**')
-  })
-
-  test('formats thought steps with truncation', () => {
-    const result = createBasicResult({
-      trajectory: [
-        { type: 'thought', content: 'Short thought', timestamp: 0 },
-        { type: 'thought', content: 'A'.repeat(150), timestamp: 100 },
-      ],
-    })
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('[THOUGHT] Short thought')
-    expect(markdown).toContain(`[THOUGHT] ${'A'.repeat(100)}...`)
-  })
-
-  test('formats tool calls with status and duration', () => {
-    const result = createResultWithToolCalls()
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('[TOOL:Read] -> completed (50ms)')
-    expect(markdown).toContain('[TOOL:Write] -> completed (30ms)')
-  })
-
-  test('includes file path for tool calls', () => {
-    const result = createResultWithToolCalls()
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('File: /path/to/file.txt')
-    expect(markdown).toContain('File: /output.md')
-  })
-
-  test('includes step IDs for reference', () => {
-    const result = createBasicResult()
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('[→test-001-step-1]')
-    expect(markdown).toContain('[→test-001-step-2]')
-  })
-
-  test('formats plan steps', () => {
-    const result = createBasicResult({
-      trajectory: [
-        {
-          type: 'plan',
-          entries: [
-            { content: 'Step 1', status: 'completed' },
-            { content: 'Step 2', status: 'in_progress' },
-          ],
-          timestamp: 0,
-        },
-      ],
-    })
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('[PLAN]')
-    expect(markdown).toContain('Step 1: completed')
-    expect(markdown).toContain('Step 2: in_progress')
-  })
-
-  test('truncates long plan summaries', () => {
-    const result = createBasicResult({
-      trajectory: [
-        {
-          type: 'plan',
-          entries: [
-            { content: 'A very long step description that goes on and on', status: 'completed' },
-            { content: 'Another very long step description', status: 'pending' },
-            { content: 'Yet another step', status: 'pending' },
-          ],
-          timestamp: 0,
-        },
-      ],
-    })
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('...')
-  })
-
-  test('formats message steps', () => {
-    const result = createBasicResult({
-      trajectory: [{ type: 'message', content: 'Here is my response to your question.', timestamp: 0 }],
-    })
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('[MESSAGE] Here is my response')
-  })
-
-  test('includes output preview', () => {
-    const result = createBasicResult()
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('**Output:** The answer is 4.')
-  })
-
-  test('truncates long output', () => {
-    const result = createBasicResult({
-      output: 'X'.repeat(300),
-    })
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain(`${'X'.repeat(200)}...`)
-  })
-
-  test('includes metadata', () => {
-    const result = createBasicResult()
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('**Metadata:**')
-    expect(markdown).toContain('category=math')
-    expect(markdown).toContain('agent=test-agent')
-  })
-
-  test('includes tool errors status', () => {
-    const result = createBasicResult({ toolErrors: true })
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('**Tool Errors:** true')
-  })
-
-  test('includes duration', () => {
-    const result = createBasicResult()
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('**Duration:** 1000ms')
-  })
-
-  test('includes score when present', () => {
-    const result = createBasicResult({
-      score: {
-        pass: true,
-        score: 0.95,
-        reasoning: 'Correct answer provided',
-      },
-    })
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('**Score:** PASS (0.95)')
-    expect(markdown).toContain('**Reasoning:** Correct answer provided')
-  })
-
-  test('handles failed score', () => {
-    const result = createBasicResult({
-      score: {
-        pass: false,
-        score: 0.2,
-        reasoning: 'Incorrect answer',
-      },
-    })
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('**Score:** FAIL (0.2)')
-  })
-
-  test('includes content preview with syntax highlighting', () => {
-    const result: CaptureResult = {
-      id: 'test-003',
-      input: 'Write a function',
-      output: 'Done',
-      trajectory: [
-        {
-          type: 'tool_call',
-          name: 'Write',
-          status: 'completed',
-          input: {
-            file_path: '/src/utils.ts',
-            content: 'export const add = (a: number, b: number) => a + b;',
-          },
-          duration: 20,
-          timestamp: 0,
-        },
-      ],
-      metadata: { agent: 'test' },
-      timing: { start: 0, end: 100, sessionCreation: 0, total: 100 },
-      toolErrors: false,
-    }
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('```ts')
-    expect(markdown).toContain('export const add')
-  })
-
-  test('ends with horizontal rule separator', () => {
-    const result = createBasicResult()
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('---')
-  })
-
-  test('handles empty trajectory', () => {
-    const result = createBasicResult({ trajectory: [] })
-    const markdown = formatMarkdown(result)
-
-    expect(markdown).toContain('**Trajectory:**')
-    expect(markdown).toContain('**Output:**')
-  })
-})
diff --git a/src/commands/tests/trials-calculations.spec.ts b/src/commands/tests/trials-calculations.spec.ts
deleted file mode 100644
index 30ce9ae..0000000
--- a/src/commands/tests/trials-calculations.spec.ts
+++ /dev/null
@@ -1,209 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-import { calculatePassAtK, calculatePassExpK } from '../trials.ts'
-
-// ============================================================================
-// calculatePassAtK
-// ============================================================================
-
-describe('calculatePassAtK', () => {
-  test('returns 1 when all trials pass', () => {
-    expect(calculatePassAtK(5, 5)).toBe(1)
-    expect(calculatePassAtK(10, 10)).toBe(1)
-    expect(calculatePassAtK(1, 1)).toBe(1)
-  })
-
-  test('returns 0 when no trials pass', () => {
-    expect(calculatePassAtK(0, 5)).toBe(0)
-    expect(calculatePassAtK(0, 10)).toBe(0)
-    expect(calculatePassAtK(0, 1)).toBe(0)
-  })
-
-  test('calculates probability correctly for partial passes', () => {
-    // pass@k = 1 - (1 - passRate)^k
-    // For 3 passes out of 5: passRate = 0.6
-    // pass@5 = 1 - (0.4)^5 = 1 - 0.01024 = 0.98976
-    const result = calculatePassAtK(3, 5)
-    expect(result).toBeCloseTo(0.98976, 5)
-  })
-
-  test('k=1 equals the pass rate', () => {
-    // For k=1, pass@1 = 1 - (1 - p)^1 = p
-    expect(calculatePassAtK(1, 1)).toBe(1)
-
-    // More interesting: 0 passes, 1 trial
-    expect(calculatePassAtK(0, 1)).toBe(0)
-  })
-
-  test('higher pass rate yields higher pass@k', () => {
-    const lowPassRate = calculatePassAtK(1, 5) // 20% pass rate
-    const highPassRate = calculatePassAtK(4, 5) // 80% pass rate
-
-    expect(highPassRate).toBeGreaterThan(lowPassRate)
-  })
-
-  test('larger k amplifies probability of at least one pass', () => {
-    // With 50% pass rate, larger k means higher chance of at least one pass
-    // k=2: 1 - (0.5)^2 = 0.75
-    // k=4: 1 - (0.5)^4 = 0.9375
-
-    const k2 = calculatePassAtK(1, 2) // 50% pass rate
-    const k4 = calculatePassAtK(2, 4) // Also 50% pass rate
-
-    expect(k4).toBeGreaterThan(k2)
-  })
-
-  test('handles edge case where passes equals k', () => {
-    expect(calculatePassAtK(3, 3)).toBe(1)
-  })
-
-  test('handles passes greater than k (returns 1)', () => {
-    // This shouldn't happen in practice, but the function handles it
-    expect(calculatePassAtK(10, 5)).toBe(1)
-  })
-
-  test('mathematical verification with known values', () => {
-    // 1 out of 3 passes: passRate = 1/3
-    // pass@3 = 1 - (2/3)^3 = 1 - 8/27 = 19/27 ≈ 0.7037
-    const result = calculatePassAtK(1, 3)
-    expect(result).toBeCloseTo(19 / 27, 5)
-
-    // 2 out of 4 passes: passRate = 0.5
-    // pass@4 = 1 - (0.5)^4 = 1 - 0.0625 = 0.9375
-    const result2 = calculatePassAtK(2, 4)
-    expect(result2).toBeCloseTo(0.9375, 5)
-  })
-})
-
-// ============================================================================
-// calculatePassExpK
-// ============================================================================
-
-describe('calculatePassExpK', () => {
-  test('returns 1 when all trials pass', () => {
-    expect(calculatePassExpK(5, 5)).toBe(1)
-    expect(calculatePassExpK(10, 10)).toBe(1)
-    expect(calculatePassExpK(1, 1)).toBe(1)
-  })
-
-  test('returns 0 when no trials pass', () => {
-    expect(calculatePassExpK(0, 5)).toBe(0)
-    expect(calculatePassExpK(0, 10)).toBe(0)
-    expect(calculatePassExpK(0, 1)).toBe(0)
-  })
-
-  test('calculates probability correctly', () => {
-    // pass^k = passRate^k
-    // For 3 passes out of 5: passRate = 0.6
-    // pass^5 = (0.6)^5 = 0.07776
-    const result = calculatePassExpK(3, 5)
-    expect(result).toBeCloseTo(0.07776, 5)
-  })
-
-  test('k=1 equals the pass rate', () => {
-    // For k=1, pass^1 = p^1 = p
-    expect(calculatePassExpK(1, 1)).toBe(1)
-  })
-
-  test('higher pass rate yields higher pass^k', () => {
-    const lowPassRate = calculatePassExpK(1, 5) // 20% pass rate
-    const highPassRate = calculatePassExpK(4, 5) // 80% pass rate
-
-    expect(highPassRate).toBeGreaterThan(lowPassRate)
-  })
-
-  test('larger k reduces probability of all passing (for non-100% rates)', () => {
-    // With 80% pass rate:
-    // k=2: (0.8)^2 = 0.64
-    // k=5: (0.8)^5 = 0.32768
-
-    // Mathematical verification using known formulas
-    const k2_fair = 0.8 ** 2 // = 0.64
-    const k5_fair = 0.8 ** 5 // = 0.32768
-
-    expect(k5_fair).toBeLessThan(k2_fair)
-
-    // Also verify our function produces consistent results
-    // 4 out of 5 gives 80% pass rate
-    const result = calculatePassExpK(4, 5)
-    expect(result).toBeCloseTo(k5_fair, 5)
-  })
-
-  test('handles edge case where passes equals k', () => {
-    expect(calculatePassExpK(3, 3)).toBe(1)
-  })
-
-  test('mathematical verification with known values', () => {
-    // 1 out of 3 passes: passRate = 1/3
-    // pass^3 = (1/3)^3 = 1/27 ≈ 0.037
-    const result = calculatePassExpK(1, 3)
-    expect(result).toBeCloseTo(1 / 27, 5)
-
-    // 2 out of 4 passes: passRate = 0.5
-    // pass^4 = (0.5)^4 = 0.0625
-    const result2 = calculatePassExpK(2, 4)
-    expect(result2).toBeCloseTo(0.0625, 5)
-
-    // 3 out of 4 passes: passRate = 0.75
-    // pass^4 = (0.75)^4 = 0.31640625
-    const result3 = calculatePassExpK(3, 4)
-    expect(result3).toBeCloseTo(0.31640625, 5)
-  })
-
-  test('pass^k is always less than or equal to pass@k', () => {
-    // For any pass rate < 100%, pass^k <= pass@k
-    // This is because "all pass" is a subset of "at least one passes"
-
-    const testCases = [
-      { passes: 1, k: 5 },
-      { passes: 2, k: 5 },
-      { passes: 3, k: 5 },
-      { passes: 4, k: 5 },
-      { passes: 1, k: 3 },
-      { passes: 2, k: 4 },
-    ]
-
-    for (const { passes, k } of testCases) {
-      const passExpK = calculatePassExpK(passes, k)
-      const passAtK = calculatePassAtK(passes, k)
-      expect(passExpK).toBeLessThanOrEqual(passAtK)
-    }
-  })
-})
-
-// ============================================================================
-// Combined behavior tests
-// ============================================================================
-
-describe('pass@k and pass^k relationship', () => {
-  test('100% pass rate: both metrics equal 1', () => {
-    expect(calculatePassAtK(5, 5)).toBe(1)
-    expect(calculatePassExpK(5, 5)).toBe(1)
-  })
-
-  test('0% pass rate: both metrics equal 0', () => {
-    expect(calculatePassAtK(0, 5)).toBe(0)
-    expect(calculatePassExpK(0, 5)).toBe(0)
-  })
-
-  test('gap between metrics varies with pass rate', () => {
-    // At 50% pass rate, the gap is maximized
-    // At extreme pass rates (0% or 100%), the gap is 0
-
-    // 50% pass rate with k=4
-    const midAtK = calculatePassAtK(2, 4) // 0.9375
-    const midExpK = calculatePassExpK(2, 4) // 0.0625
-    const midGap = midAtK - midExpK // 0.875
-
-    // 80% pass rate with k=5
-    const highAtK = calculatePassAtK(4, 5)
-    const highExpK = calculatePassExpK(4, 5)
-    const highGap = highAtK - highExpK
-
-    // Both gaps should be positive (pass@k > pass^k for partial pass rates)
-    expect(midGap).toBeGreaterThan(0)
-    expect(highGap).toBeGreaterThan(0)
-
-    // Mid-range pass rate has larger gap than high pass rate
-    expect(midGap).toBeGreaterThan(highGap)
-  })
-})
diff --git a/src/commands/tests/trials-cli.spec.ts b/src/commands/tests/trials-cli.spec.ts
deleted file mode 100644
index 8005755..0000000
--- a/src/commands/tests/trials-cli.spec.ts
+++ /dev/null
@@ -1,215 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-import type { TrialsConfig } from '../trials.ts'
-
-// ============================================================================
-// TrialsConfig type
-// ============================================================================
-
-describe('TrialsConfig configuration', () => {
-  test('TrialsConfig type accepts valid configuration', () => {
-    const config: TrialsConfig = {
-      promptsPath: '/tmp/prompts.jsonl',
-      schemaPath: './schemas/claude-headless.json',
-      k: 5,
-      outputPath: '/tmp/output.jsonl',
-      cwd: '/tmp',
-      timeout: 30000,
-      progress: true,
-      append: false,
-      debug: false,
-      concurrency: 4,
-      workspaceDir: '/tmp/workspaces',
-    }
-
-    expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
-    expect(config.schemaPath).toBe('./schemas/claude-headless.json')
-    expect(config.k).toBe(5)
-    expect(config.concurrency).toBe(4)
-    expect(config.workspaceDir).toBe('/tmp/workspaces')
-  })
-
-  test('TrialsConfig allows minimal configuration', () => {
-    const config: TrialsConfig = {
-      promptsPath: '/tmp/prompts.jsonl',
-      schemaPath: './test-schema.json',
-      k: 3,
-    }
-
-    expect(config.outputPath).toBeUndefined()
-    expect(config.cwd).toBeUndefined()
-    expect(config.timeout).toBeUndefined()
-    expect(config.progress).toBeUndefined()
-    expect(config.append).toBeUndefined()
-    expect(config.grader).toBeUndefined()
-    expect(config.concurrency).toBeUndefined()
-    expect(config.workspaceDir).toBeUndefined()
-  })
-
-  test('TrialsConfig accepts prompts without promptsPath', () => {
-    const config: TrialsConfig = {
-      schemaPath: './test-schema.json',
-      k: 3,
-      prompts: [{ id: 't1', input: 'hello' }],
-    }
-
-    expect(config.promptsPath).toBeUndefined()
-    expect(config.prompts).toHaveLength(1)
-  })
-})
-
-// ============================================================================
-// CLI Help Output
-// ============================================================================
-
-describe('trials CLI', () => {
-  test('displays help with --help flag', async () => {
-    const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials', '--help'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-
-    const stdout = await new Response(proc.stdout).text()
-    await proc.exited
-
-    expect(stdout).toContain('Usage: agent-eval-harness trials')
-    expect(stdout).toContain('prompts.jsonl')
-    expect(stdout).toContain('-o, --output')
-    expect(stdout).toContain('-k')
-    expect(stdout).toContain('-c, --cwd')
-    expect(stdout).toContain('-t, --timeout')
-    expect(stdout).toContain('--progress')
-    expect(stdout).toContain('-g, --grader')
-    expect(stdout).toContain('-s, --schema')
-    expect(stdout).toContain('pass@k')
-    expect(stdout).toContain('-j, --concurrency')
-    expect(stdout).toContain('--workspace-dir')
-    expect(stdout).toContain('--stdin')
-  })
-
-  test('shows error for --stdin with positional file', async () => {
-    const proc = Bun.spawn(
-      ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
-      {
-        stdout: 'pipe',
-        stderr: 'pipe',
-      },
-    )
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
-  })
-
-  test('shows error for missing prompts file argument', async () => {
-    const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('prompts.jsonl path is required')
-  })
-
-  test('shows error for missing schema argument', async () => {
-    const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('--schema is required')
-  })
-
-  test('shows error for invalid concurrency value', async () => {
-    const proc = Bun.spawn(
-      ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
-      {
-        stdout: 'pipe',
-        stderr: 'pipe',
-      },
-    )
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('--concurrency must be a positive integer')
-  })
-
-  test('shows error for zero concurrency', async () => {
-    const proc = Bun.spawn(
-      ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
-      {
-        stdout: 'pipe',
-        stderr: 'pipe',
-      },
-    )
-
-    const stderr = await new Response(proc.stderr).text()
-    const exitCode = await proc.exited
-
-    expect(exitCode).not.toBe(0)
-    expect(stderr).toContain('--concurrency must be a positive integer')
-  })
-})
-
-// ============================================================================
-// Schemas CLI
-// ============================================================================
-
-describe('schemas CLI', () => {
-  test('displays help with --help flag', async () => {
-    const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', '--help'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-
-    const stdout = await new Response(proc.stdout).text()
-    await proc.exited
-
-    expect(stdout).toContain('Usage: agent-eval-harness schemas')
-    expect(stdout).toContain('-o, --output')
-    expect(stdout).toContain('-j, --json')
-    expect(stdout).toContain('-s, --split')
-    expect(stdout).toContain('-l, --list')
-    expect(stdout).toContain('Available Schemas')
-  })
-
-  test('lists schemas with --list flag', async () => {
-    const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', '--list'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-
-    const stdout = await new Response(proc.stdout).text()
-    await proc.exited
-
-    expect(stdout).toContain('Available schemas')
-    expect(stdout).toContain('PromptCase')
-    expect(stdout).toContain('CaptureResult')
-    expect(stdout).toContain('GraderResult')
-  })
-
-  test('exports schema as JSON', async () => {
-    const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', 'PromptCase', '--json'], {
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-
-    const stdout = await new Response(proc.stdout).text()
-    await proc.exited
-
-    const schema = JSON.parse(stdout)
-    expect(schema.$schema).toBe('https://json-schema.org/draft/2020-12/schema')
-    expect(schema.title).toBe('PromptCase')
-    expect(schema.type).toBe('object')
-  })
-})
diff --git a/src/commands/trials.ts b/src/commands/trials.ts
deleted file mode 100644
index 9d4adb4..0000000
--- a/src/commands/trials.ts
+++ /dev/null
@@ -1,377 +0,0 @@
-/**
- * Multi-run trials command for pass@k/pass^k analysis.
- *
- * @remarks
- * Runs each prompt k times to measure non-determinism.
- * Without a grader, captures raw trials. With a grader, computes:
- * - passRate: Simple pass rate (passes / k)
- * - passAtK: Probability of at least one pass in k samples
- * - passExpK: Probability of all k samples passing
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import { createWorkspaceDir, extractOutput, extractTrajectory, logProgress, readStdinPrompts } from '../core.ts'
-import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
-import { DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
-import { loadGraderOrExit } from '../schemas/grader-loader.ts'
-import type { PromptCase, TrialEntry, TrialResult } from '../schemas.ts'
-import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
-
-// ============================================================================
-// Pass@k/Pass^k Calculation
-// ============================================================================
-
-/**
- * Calculate pass@k: probability of at least one pass in k samples.
- *
- * @remarks
- * Uses the unbiased estimator: 1 - C(n-c, k) / C(n, k)
- * where n = total samples, c = correct samples, k = samples per trial
- *
- * For our case where n = k (we run exactly k trials per prompt):
- * pass@k = 1 - (1 - passRate)^k (simplified)
- *
- * @param passes - Number of passing trials
- * @param k - Total number of trials
- * @returns Probability of at least one pass
- *
- * @public
- */
-export const calculatePassAtK = (passes: number, k: number): number => {
-  if (passes >= k) return 1
-  if (passes === 0) return 0
-
-  // Simplified formula when n = k
-  const passRate = passes / k
-  return 1 - (1 - passRate) ** k
-}
-
-/**
- * Calculate pass^k: probability of all k samples passing.
- *
- * @remarks
- * This is simply passRate^k
- *
- * @param passes - Number of passing trials
- * @param k - Total number of trials
- * @returns Probability of all k samples passing
- *
- * @public
- */
-export const calculatePassExpK = (passes: number, k: number): number => {
-  if (passes === k) return 1
-  if (passes === 0) return 0
-
-  const passRate = passes / k
-  return passRate ** k
-}
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** Configuration for trials command */
-export type TrialsConfig = BaseExecutionConfig & {
-  /** Number of trials per prompt */
-  k: number
-}
-
-// ============================================================================
-// Trials Implementation
-// ============================================================================
-
-/**
- * Execute trials with configuration object.
- *
- * @param config - Trials configuration
- * @returns Array of trial results
- */
-export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
-  const { k } = config
-  const ctx = await prepareExecution(config)
-  const { schema, prompts, sessions, resolvedWorkspaceDir, defaultWorkingDir, progress, grader } = ctx
-
-  // Log progress info
-  logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
-  logProgress(`Running ${k} trials per prompt (${prompts.length * k} total executions)`, progress)
-  logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
-  logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
-  if (ctx.concurrency > 1) {
-    logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
-  }
-  if (resolvedWorkspaceDir) {
-    logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
-  }
-  if (grader) {
-    logProgress('Grader: enabled (will compute pass@k metrics)', progress)
-  }
-
-  // Process all trials for a single prompt
-  const processPromptTrials = async (promptCase: (typeof prompts)[number], index: number): Promise<TrialResult> => {
-    logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
-
-    const trialEntries: TrialEntry[] = []
-
-    for (let trialNum = 1; trialNum <= k; trialNum++) {
-      // Determine working directory (per-prompt workspace or default)
-      // For trials, include trial number in workspace path for isolation
-      const workingDir = resolvedWorkspaceDir
-        ? await createWorkspaceDir(resolvedWorkspaceDir, `${promptCase.id}-trial-${trialNum}`)
-        : defaultWorkingDir
-
-      // Create fresh session for each trial
-      const session = await sessions.create(workingDir)
-      const startTime = Date.now()
-
-      try {
-        // Handle string or array input
-        const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
-        const allUpdates: ParsedUpdate[] = []
-
-        // Execute each turn sequentially
-        for (const turnInput of inputs) {
-          const turnResult = await sessions.prompt(session.id, turnInput)
-          allUpdates.push(...turnResult.updates)
-        }
-
-        const endTime = Date.now()
-        const trajectory = extractTrajectory(allUpdates, startTime)
-        const output = extractOutput(trajectory)
-
-        const entry: TrialEntry = {
-          trialNum,
-          output,
-          trajectory,
-          duration: endTime - startTime,
-        }
-
-        // Apply grader if provided
-        if (grader) {
-          const graderResult = await grader({
-            input: promptCase.input,
-            output,
-            hint: promptCase.hint,
-            trajectory,
-            metadata: promptCase.metadata,
-            cwd: session.cwd,
-          })
-          entry.pass = graderResult.pass
-          entry.score = graderResult.score
-          entry.reasoning = graderResult.reasoning
-
-          if (graderResult.outcome) {
-            entry.outcome = graderResult.outcome
-          }
-        }
-
-        trialEntries.push(entry)
-        logProgress(
-          `    Trial ${trialNum}/${k}: ${entry.pass !== undefined ? (entry.pass ? '✓' : '✗') : '?'}`,
-          progress,
-        )
-      } catch (error) {
-        const endTime = Date.now()
-        const message = error instanceof Error ? error.message : String(error)
-
-        trialEntries.push({
-          trialNum,
-          output: '',
-          trajectory: [],
-          duration: endTime - startTime,
-          pass: false,
-          reasoning: `Error: ${message}`,
-        })
-        logProgress(`    Trial ${trialNum}/${k}: ! (error)`, progress)
-      } finally {
-        // Always clean up session
-        sessions.destroy(session.id)
-      }
-    }
-
-    // Build result
-    const result: TrialResult = {
-      id: promptCase.id,
-      input: promptCase.input,
-      ...(promptCase.hint && { hint: promptCase.hint }),
-      k,
-      trials: trialEntries,
-      metadata: {
-        ...promptCase.metadata,
-        agent: schema.name,
-        ...(resolvedWorkspaceDir && { workspaceDir: resolvedWorkspaceDir }),
-      },
-    }
-
-    // Calculate metrics if grader was used
-    if (grader) {
-      const passes = trialEntries.filter((t) => t.pass).length
-      result.passRate = passes / k
-      result.passAtK = calculatePassAtK(passes, k)
-      result.passExpK = calculatePassExpK(passes, k)
-    }
-
-    // Write result immediately (coordinated via mutex for concurrent writes)
-    await ctx.writeResult(result)
-
-    if (grader) {
-      logProgress(
-        `  → ${promptCase.id}: passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
-        progress,
-      )
-    }
-
-    return result
-  }
-
-  // Run with worker pool (parallelizes across prompts, trials for each prompt run sequentially)
-  return executePrompts(ctx, processPromptTrials)
-}
-
-// ============================================================================
-// CLI Entry Point
-// ============================================================================
-
-/**
- * Trials command CLI handler.
- *
- * @param args - Command line arguments (after 'trials')
- */
-export const trials = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      schema: { type: 'string', short: 's' },
-      output: { type: 'string', short: 'o' },
-      k: { type: 'string', short: 'k', default: String(DEFAULT_TRIAL_COUNT) },
-      cwd: { type: 'string', short: 'c' },
-      timeout: { type: 'string', short: 't' },
-      progress: { type: 'boolean', default: false },
-      append: { type: 'boolean', default: false },
-      grader: { type: 'string', short: 'g' },
-      debug: { type: 'boolean', default: false },
-      stdin: { type: 'boolean', default: false },
-      concurrency: { type: 'string', short: 'j' },
-      'workspace-dir': { type: 'string' },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]
-       cat prompts.jsonl | agent-eval-harness trials --stdin --schema <schema.json> [options]
-
-Arguments:
-  prompts.jsonl     Input file with evaluation prompts
-
-Options:
-  -s, --schema      Path to agent schema JSON file (required)
-  -o, --output      Output file (default: stdout)
-  -k                Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
-  -c, --cwd         Working directory for agent
-  -t, --timeout     Request timeout in ms (overrides schema default)
-  -j, --concurrency Number of concurrent workers (default: 1)
-  --stdin           Read prompts from stdin (mutually exclusive with file arg)
-  --workspace-dir   Base directory for per-trial workspace isolation
-  --progress        Show progress to stderr
-  --append          Append to output file
-  -g, --grader      Path to grader (.ts/.js module or executable script)
-  --debug           Enable debug mode
-  -h, --help        Show this help message
-
-Output Format:
-  Without grader: Raw trials with trajectories
-  With grader: Trials plus pass@k metrics (passRate, passAtK, passExpK)
-
-Graders:
-  TS/JS modules must export a 'grade' function.
-  Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
-
-Parallelization:
-  Use -j/--concurrency to run multiple prompts' trials in parallel.
-  Each prompt's k trials still run sequentially (required for aggregation).
-  With 151 prompts and -j 4, you get 4 prompts running trials concurrently.
-
-  Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
-  at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
-  In memory-constrained environments (Docker, CI) this can cause OOM kills.
-  Use --stdin to pipe prompts for container-level orchestration.
-
-Workspace Isolation:
-  Use --workspace-dir to create per-trial directories.
-  Each trial runs in {workspace-dir}/prompt-{id}-trial-{n}/.
-  Useful for code generation tasks requiring filesystem isolation.
-
-Examples:
-  # Basic trials
-  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -o trials.jsonl
-
-  # Run 4 prompts' trials in parallel (4x faster for 151 prompts)
-  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -j 4 -o trials.jsonl
-
-  # With workspace isolation for code generation
-  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -j 4 \\
-    --workspace-dir ./workspaces -o trials.jsonl
-
-  # With TypeScript grader
-  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl
-
-  # Read prompts from stdin (container orchestration)
-  cat prompts.jsonl | agent-eval-harness trials --stdin -s claude.json -k 5 -o trials.jsonl
-`)
-    return
-  }
-
-  const promptsPath = positionals[0]
-  const useStdin = values.stdin ?? false
-
-  // Mutual exclusivity: --stdin and positional file
-  if (useStdin && promptsPath) {
-    console.error('Error: --stdin and prompts file argument are mutually exclusive')
-    process.exit(1)
-  }
-
-  if (!useStdin && !promptsPath) {
-    console.error('Error: prompts.jsonl path is required (or use --stdin)')
-    process.exit(1)
-  }
-
-  if (!values.schema) {
-    console.error('Error: --schema is required')
-    console.error('Example: agent-eval-harness trials prompts.jsonl --schema ./claude.json')
-    process.exit(1)
-  }
-
-  // Read prompts from stdin if requested
-  let prompts: PromptCase[] | undefined
-  if (useStdin) {
-    const stdinPrompts = await readStdinPrompts()
-    if (!stdinPrompts || stdinPrompts.length === 0) {
-      console.error('Error: no prompts received on stdin')
-      process.exit(1)
-    }
-    prompts = stdinPrompts
-  }
-
-  // Load grader if specified
-  const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
-
-  await runTrials({
-    promptsPath: promptsPath ?? undefined,
-    prompts,
-    schemaPath: values.schema,
-    k: Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10),
-    outputPath: values.output,
-    cwd: values.cwd,
-    timeout: values.timeout ? Number.parseInt(values.timeout, 10) : undefined,
-    progress: values.progress ?? false,
-    append: values.append ?? false,
-    grader,
-    debug: values.debug ?? false,
-    concurrency: parseConcurrency(values.concurrency),
-    workspaceDir: values['workspace-dir'],
-  })
-}
diff --git a/src/commands/validate-refs.ts b/src/commands/validate-refs.ts
deleted file mode 100644
index 003790d..0000000
--- a/src/commands/validate-refs.ts
+++ /dev/null
@@ -1,171 +0,0 @@
-/**
- * Validate-refs command - check reference solutions against grader.
- *
- * @remarks
- * Validates that reference solutions in prompts.jsonl pass the grader.
- * Helps identify prompts with broken or incorrect reference solutions.
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import { loadPrompts, resolvePath } from '../core.ts'
-import { loadGraderOrExit } from '../schemas/grader-loader.ts'
-import type { Grader, ValidationResult } from '../schemas.ts'
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** Configuration for validate-refs command */
-export type ValidateRefsConfig = {
-  /** Path to prompts.jsonl file */
-  promptsPath: string
-  /** Output file path */
-  outputPath?: string
-  /** Grader function */
-  grader: Grader
-}
-
-// ============================================================================
-// Validate-Refs Implementation
-// ============================================================================
-
-/**
- * Execute validate-refs with configuration object.
- *
- * @param config - Validate-refs configuration
- * @returns Array of validation results
- */
-export const runValidateRefs = async (config: ValidateRefsConfig): Promise<ValidationResult[]> => {
-  const { promptsPath, outputPath, grader } = config
-
-  // Load prompts
-  const prompts = await loadPrompts(promptsPath)
-
-  // Filter to prompts with reference solutions
-  const promptsWithRefs = prompts.filter((p) => p.reference !== undefined)
-
-  if (promptsWithRefs.length === 0) {
-    console.error('No prompts with reference solutions found')
-    return []
-  }
-
-  console.error(`Validating ${promptsWithRefs.length} reference solutions...`)
-
-  const results: ValidationResult[] = []
-
-  for (const prompt of promptsWithRefs) {
-    const graderResult = await grader({
-      input: prompt.input,
-      output: prompt.reference as string,
-      hint: prompt.hint,
-      trajectory: [], // No trajectory for reference validation
-      metadata: prompt.metadata,
-    })
-
-    results.push({
-      id: prompt.id,
-      reference: prompt.reference as string,
-      passes: graderResult.pass,
-      graderResult,
-    })
-
-    const icon = graderResult.pass ? '✓' : '✗'
-    console.error(`  ${icon} ${prompt.id}`)
-  }
-
-  // Format output
-  const output = results.map((r) => JSON.stringify(r)).join('\n')
-
-  // Write output
-  if (outputPath) {
-    await Bun.write(resolvePath(outputPath), output)
-  } else {
-    console.log(output)
-  }
-
-  // Summary
-  const passed = results.filter((r) => r.passes).length
-  const failed = results.length - passed
-  console.error(`\nResults: ${passed} passed, ${failed} failed`)
-
-  if (failed > 0) {
-    console.error('\nFailing references:')
-    for (const result of results.filter((r) => !r.passes)) {
-      console.error(`  - ${result.id}: ${result.graderResult.reasoning ?? 'No reasoning'}`)
-    }
-  }
-
-  return results
-}
-
-// ============================================================================
-// CLI Entry Point
-// ============================================================================
-
-/**
- * Validate-refs command CLI handler.
- *
- * @param args - Command line arguments (after 'validate-refs')
- */
-export const validateRefs = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      output: { type: 'string', short: 'o' },
-      grader: { type: 'string', short: 'g' },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness validate-refs <prompts.jsonl> --grader <grader.ts> [options]
-
-Arguments:
-  prompts.jsonl     Input file with prompts (must have 'reference' field)
-
-Options:
-  -o, --output      Output file (default: stdout)
-  -g, --grader      Path to grader (.ts/.js module or executable script, required)
-  -h, --help        Show this help message
-
-Output:
-  JSONL with validation results for each reference solution.
-
-Prompt Format:
-  {
-    "id": "test-001",
-    "input": "What is 2+2?",
-    "expected": "4",
-    "reference": "The answer is 4."
-  }
-
-Examples:
-  agent-eval-harness validate-refs prompts.jsonl --grader ./grader.ts -o validation.jsonl
-`)
-    return
-  }
-
-  const promptsPath = positionals[0]
-  if (!promptsPath) {
-    console.error('Error: prompts.jsonl path is required')
-    process.exit(1)
-  }
-
-  if (!values.grader) {
-    console.error('Error: --grader is required for validate-refs')
-    process.exit(1)
-  }
-
-  // Load grader
-  const grader = await loadGraderOrExit(values.grader)
-
-  await runValidateRefs({
-    promptsPath,
-    outputPath: values.output,
-    grader,
-  })
-}
diff --git a/src/core.ts b/src/core.ts
deleted file mode 100644
index 5d1df05..0000000
--- a/src/core.ts
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Core utilities re-export.
- *
- * @remarks
- * Public API for core utilities. Import from here for external use.
- *
- * @packageDocumentation
- */
-
-export {
-  // Loading
-  buildResultsIndex,
-  countLines,
-  // Native streaming
-  countLinesStreaming,
-  // Worker pool
-  createWorkspaceDir,
-  createWriteMutex,
-  // Trajectory
-  detectTrajectoryRichness,
-  extractContent,
-  extractFilePath,
-  extractOutput,
-  extractTrajectory,
-  // Output
-  getInputPreview,
-  hasToolErrors,
-  headTailPreview,
-  loadJsonl,
-  loadPrompts,
-  loadResults,
-  logProgress,
-  type ProgressCallback,
-  readStdinPrompts,
-  resolvePath,
-  runWorkerPool,
-  streamJsonl,
-  streamPrompts,
-  streamResults,
-  streamResultsNative,
-  streamTrialResults,
-  type WorkerPoolOptions,
-  type WorkerPoolResult,
-  type WriteMutex,
-  writeOutput,
-} from './core/core.ts'
diff --git a/src/core/core.ts b/src/core/core.ts
deleted file mode 100644
index a36b7bd..0000000
--- a/src/core/core.ts
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Core utilities for agent-eval-harness.
- *
- * @remarks
- * Re-exports shared utilities used across all commands:
- * - Loading: JSONL file parsing for prompts and results
- * - Trajectory: Extraction and analysis of agent trajectories
- * - Output: Writing results, progress logging, path resolution
- *
- * @packageDocumentation
- */
-
-// Loading utilities
-export {
-  buildResultsIndex,
-  countLines,
-  loadJsonl,
-  loadPrompts,
-  loadResults,
-  readStdinPrompts,
-  streamResults,
-} from './loading.ts'
-// Output utilities
-export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
-// Native streaming utilities
-export {
-  countLinesStreaming,
-  streamJsonl,
-  streamPrompts,
-  streamResultsNative,
-  streamTrialResults,
-} from './streaming.ts'
-// Trajectory utilities
-export {
-  detectTrajectoryRichness,
-  extractContent,
-  extractFilePath,
-  extractOutput,
-  extractTrajectory,
-  hasToolErrors,
-} from './trajectory.ts'
-// Worker pool utilities
-export {
-  createWorkspaceDir,
-  createWriteMutex,
-  type ProgressCallback,
-  runWorkerPool,
-  type WorkerPoolOptions,
-  type WorkerPoolResult,
-  type WriteMutex,
-} from './worker-pool.ts'
diff --git a/src/core/loading.ts b/src/core/loading.ts
deleted file mode 100644
index 1bb5b41..0000000
--- a/src/core/loading.ts
+++ /dev/null
@@ -1,207 +0,0 @@
-/**
- * Shared loading utilities for JSONL files.
- *
- * @remarks
- * Provides consistent loading and parsing of prompts and results files.
- * Used by capture, trials, summarize, calibrate, and pipeline commands.
- *
- * @packageDocumentation
- */
-
-import type { CaptureResult, PromptCase } from '../schemas.ts'
-import { CaptureResultSchema, PromptCaseSchema } from '../schemas.ts'
-
-/**
- * Load prompts from a JSONL file.
- *
- * @remarks
- * Each line in the file should be a valid JSON object matching PromptCaseSchema.
- * Supports both single-turn (string input) and multi-turn (string[] input) formats.
- *
- * @param path - Path to the prompts.jsonl file
- * @returns Parsed and validated prompt cases
- * @throws Error if file cannot be read or any line is invalid
- *
- * @public
- */
-export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
-  const content = await Bun.file(path).text()
-  return content
-    .trim()
-    .split('\n')
-    .filter(Boolean)
-    .map((line, index) => {
-      try {
-        return PromptCaseSchema.parse(JSON.parse(line))
-      } catch (error) {
-        throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
-      }
-    })
-}
-
-/**
- * Read prompts from stdin as JSONL.
- *
- * @remarks
- * Reads all data from stdin, parses each line as JSON, and validates against
- * PromptCaseSchema. Returns null when stdin is a TTY (no piped input).
- * Uses chunked Buffer reads matching the pattern in pipeline/run.ts.
- *
- * @returns Parsed and validated prompt cases, or null if stdin is a TTY
- * @throws Error if any line is invalid JSON or fails schema validation
- *
- * @public
- */
-export const readStdinPrompts = async (): Promise<PromptCase[] | null> => {
-  if (process.stdin.isTTY) {
-    return null
-  }
-
-  const chunks: Buffer[] = []
-  for await (const chunk of process.stdin) {
-    chunks.push(chunk)
-  }
-
-  const content = Buffer.concat(chunks).toString('utf-8').trim()
-  if (!content) return null
-
-  return content
-    .split('\n')
-    .filter(Boolean)
-    .map((line, index) => {
-      try {
-        return PromptCaseSchema.parse(JSON.parse(line))
-      } catch (error) {
-        throw new Error(`Invalid stdin prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
-      }
-    })
-}
-
-/**
- * Load capture results from a JSONL file.
- *
- * @remarks
- * Each line should be a valid JSON object matching CaptureResultSchema.
- * Used by summarize, calibrate, and compare commands.
- *
- * @param path - Path to the results.jsonl file
- * @returns Parsed and validated capture results
- * @throws Error if file cannot be read or any line is invalid
- *
- * @public
- */
-export const loadResults = async (path: string): Promise<CaptureResult[]> => {
-  const content = await Bun.file(path).text()
-  return content
-    .trim()
-    .split('\n')
-    .filter(Boolean)
-    .map((line, index) => {
-      try {
-        return CaptureResultSchema.parse(JSON.parse(line))
-      } catch (error) {
-        throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
-      }
-    })
-}
-
-/**
- * Load raw JSONL file as parsed JSON objects.
- *
- * @remarks
- * Lower-level loading without schema validation.
- * Useful for pipeline commands that need flexible input handling.
- *
- * @param path - Path to JSONL file
- * @returns Array of parsed JSON objects
- * @throws Error if file cannot be read or any line is invalid JSON
- *
- * @public
- */
-export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
-  const content = await Bun.file(path).text()
-  return content
-    .trim()
-    .split('\n')
-    .filter(Boolean)
-    .map((line, index) => {
-      try {
-        return JSON.parse(line) as T
-      } catch (error) {
-        throw new Error(`Invalid JSON at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
-      }
-    })
-}
-
-// ============================================================================
-// Streaming Loading
-// ============================================================================
-
-// Re-export native streaming functions for backward compatibility
-export {
-  countLinesStreaming,
-  streamJsonl,
-  streamPrompts,
-  streamResultsNative,
-  streamTrialResults,
-} from './streaming.ts'
-
-/**
- * Stream capture results from a JSONL file.
- *
- * @remarks
- * Memory-efficient alternative to loadResults for large files.
- * Uses native streaming via Bun.file().stream() for O(1) memory usage.
- *
- * @param path - Path to the results.jsonl file
- * @yields Parsed and validated capture results
- * @throws Error if file cannot be read or any line is invalid
- *
- * @public
- */
-export async function* streamResults(path: string): AsyncGenerator<CaptureResult, void, unknown> {
-  const { streamResultsNative } = await import('./streaming.ts')
-  yield* streamResultsNative(path)
-}
-
-/**
- * Build an indexed map of results by ID using streaming.
- *
- * @remarks
- * Memory-efficient for the compare command. Loads results into a Map
- * keyed by ID for O(1) lookups without holding raw file content.
- *
- * For very large files (10k+ results), this is more memory-efficient than
- * loading everything into an array and then building an index.
- *
- * @param path - Path to the results.jsonl file
- * @returns Map of result ID to CaptureResult
- *
- * @public
- */
-export const buildResultsIndex = async (path: string): Promise<Map<string, CaptureResult>> => {
-  const index = new Map<string, CaptureResult>()
-
-  for await (const result of streamResults(path)) {
-    index.set(result.id, result)
-  }
-
-  return index
-}
-
-/**
- * Count lines in a JSONL file without loading content.
- *
- * @remarks
- * Useful for detecting large files that should use streaming mode.
- * Uses native streaming for O(1) memory usage.
- *
- * @param path - Path to the JSONL file
- * @returns Number of non-empty lines
- *
- * @public
- */
-export const countLines = async (path: string): Promise<number> => {
-  const { countLinesStreaming } = await import('./streaming.ts')
-  return countLinesStreaming(path)
-}
diff --git a/src/core/output.ts b/src/core/output.ts
deleted file mode 100644
index c3fdd9f..0000000
--- a/src/core/output.ts
+++ /dev/null
@@ -1,120 +0,0 @@
-/**
- * Shared output utilities for writing results and logging.
- *
- * @remarks
- * Provides consistent output handling across all commands:
- * - Writing to stdout or files
- * - Progress logging to stderr
- * - Path resolution
- * - Content preview (head/tail)
- *
- * @packageDocumentation
- */
-
-import { appendFile } from 'node:fs/promises'
-import { HEAD_LINES, TAIL_LINES } from '../schemas/constants.ts'
-
-/**
- * Write output line to stdout or file.
- *
- * @remarks
- * When writing to a file, supports both overwrite and append modes.
- * When writing to stdout, uses console.log.
- *
- * @param line - Content to write (without trailing newline)
- * @param outputPath - Optional file path (stdout if undefined)
- * @param append - If true, append to file instead of overwrite
- *
- * @public
- */
-export const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
-  if (outputPath) {
-    if (append) {
-      await appendFile(outputPath, `${line}\n`)
-    } else {
-      await Bun.write(outputPath, `${line}\n`)
-    }
-  } else {
-    console.log(line)
-  }
-}
-
-/**
- * Log progress message to stderr.
- *
- * @remarks
- * Progress output goes to stderr to avoid polluting stdout
- * when piping command output.
- *
- * @param message - Progress message to display
- * @param showProgress - If false, message is suppressed
- *
- * @public
- */
-export const logProgress = (message: string, showProgress: boolean): void => {
-  if (showProgress) {
-    console.error(message)
-  }
-}
-
-/**
- * Resolve path relative to process.cwd().
- *
- * @remarks
- * Absolute paths (starting with /) are returned as-is.
- * Relative paths are joined with current working directory.
- *
- * @param path - Path to resolve
- * @returns Absolute path
- *
- * @public
- */
-export const resolvePath = (path: string): string => {
-  if (path.startsWith('/')) return path
-  return `${process.cwd()}/${path}`
-}
-
-/**
- * Create head/tail preview of content.
- *
- * @remarks
- * Shows first N and last M lines with omission indicator in between.
- * Useful for large files/content in markdown output.
- *
- * @param content - Full content string
- * @param headLines - Number of lines from start (default from constants)
- * @param tailLines - Number of lines from end (default from constants)
- * @returns Truncated content with omission indicator
- *
- * @public
- */
-export const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => {
-  const lines = content.split('\n')
-  if (lines.length <= headLines + tailLines) {
-    return content
-  }
-  const head = lines.slice(0, headLines).join('\n')
-  const tail = lines.slice(-tailLines).join('\n')
-  const omitted = lines.length - headLines - tailLines
-  return `${head}\n\n// ... ${omitted} lines omitted ...\n\n${tail}`
-}
-
-/**
- * Get preview text for input (handles string or array).
- *
- * @remarks
- * For arrays (multi-turn), shows turn count and preview of first turn.
- * For strings, shows first 50 characters.
- *
- * @param input - String or array input
- * @returns Preview text suitable for progress display
- *
- * @public
- */
-export const getInputPreview = (input: string | string[]): string => {
-  if (Array.isArray(input)) {
-    const first = input[0] ?? ''
-    return `[${input.length} turns] ${first.slice(0, 40)}...`
-  }
-  return input.slice(0, 50)
-}
diff --git a/src/core/streaming.ts b/src/core/streaming.ts
deleted file mode 100644
index c1ec47a..0000000
--- a/src/core/streaming.ts
+++ /dev/null
@@ -1,172 +0,0 @@
-/**
- * Native streaming utilities for JSONL files.
- *
- * @remarks
- * Provides true memory-efficient streaming using Bun.file().stream().
- * Unlike the batch-then-yield approach in loading.ts, these functions
- * process data chunk-by-chunk, maintaining O(1) memory usage regardless
- * of file size.
- *
- * @packageDocumentation
- */
-
-import type { ZodSchema } from 'zod'
-import type { CaptureResult, PromptCase, TrialResult } from '../schemas.ts'
-import { CaptureResultSchema, PromptCaseSchema, TrialResultSchema } from '../schemas.ts'
-
-/**
- * Stream JSONL file entries with optional schema validation.
- *
- * @remarks
- * Uses Bun's native ReadableStream for true streaming - only holds one
- * chunk in memory at a time. For files with 10k+ results, this provides
- * constant memory usage vs O(file size) for batch loading.
- *
- * @typeParam T - The expected type of each JSON line
- * @param path - Path to the JSONL file
- * @param schema - Optional Zod schema for validation
- * @yields Parsed (and optionally validated) JSON objects
- * @throws Error with line number if JSON parsing or validation fails
- *
- * @public
- */
-export async function* streamJsonl<T>(path: string, schema?: ZodSchema<T>): AsyncGenerator<T, void, unknown> {
-  const file = Bun.file(path)
-  const stream = file.stream()
-  const decoder = new TextDecoder()
-
-  let buffer = ''
-  let lineNum = 0
-
-  /**
-   * Process a single line of JSON.
-   */
-  const processLine = (line: string): T => {
-    const parsed = JSON.parse(line)
-    return schema ? schema.parse(parsed) : (parsed as T)
-  }
-
-  for await (const chunk of stream) {
-    buffer += decoder.decode(chunk, { stream: true })
-
-    let newlineIndex = buffer.indexOf('\n')
-    while (newlineIndex !== -1) {
-      const line = buffer.slice(0, newlineIndex).trim()
-      buffer = buffer.slice(newlineIndex + 1)
-      lineNum++
-
-      if (line) {
-        try {
-          yield processLine(line)
-        } catch (error) {
-          throw new Error(`Invalid JSON at line ${lineNum}: ${error instanceof Error ? error.message : error}`)
-        }
-      }
-
-      newlineIndex = buffer.indexOf('\n')
-    }
-  }
-
-  // Flush remaining buffer content (handles files without trailing newline)
-  buffer += decoder.decode()
-
-  const finalLine = buffer.trim()
-  if (finalLine) {
-    lineNum++
-    try {
-      yield processLine(finalLine)
-    } catch (error) {
-      throw new Error(`Invalid JSON at line ${lineNum}: ${error instanceof Error ? error.message : error}`)
-    }
-  }
-}
-
-/**
- * Stream prompt cases from a JSONL file.
- *
- * @remarks
- * Memory-efficient streaming with PromptCaseSchema validation.
- * Use this for large prompt files when you don't need random access.
- *
- * @param path - Path to the prompts.jsonl file
- * @yields Validated PromptCase objects
- * @throws Error with line number if validation fails
- *
- * @public
- */
-export async function* streamPrompts(path: string): AsyncGenerator<PromptCase, void, unknown> {
-  yield* streamJsonl<PromptCase>(path, PromptCaseSchema)
-}
-
-/**
- * Stream capture results from a JSONL file using native streaming.
- *
- * @remarks
- * True streaming alternative to the batch-then-yield streamResults in loading.ts.
- * Maintains O(1) memory usage regardless of file size.
- *
- * @param path - Path to the results.jsonl file
- * @yields Validated CaptureResult objects
- * @throws Error with line number if validation fails
- *
- * @public
- */
-export async function* streamResultsNative(path: string): AsyncGenerator<CaptureResult, void, unknown> {
-  yield* streamJsonl<CaptureResult>(path, CaptureResultSchema)
-}
-
-/**
- * Stream trial results from a JSONL file.
- *
- * @remarks
- * Memory-efficient streaming with TrialResultSchema validation.
- * Use for large trial result files from the trials command.
- *
- * @param path - Path to the trial results JSONL file
- * @yields Validated TrialResult objects
- * @throws Error with line number if validation fails
- *
- * @public
- */
-export async function* streamTrialResults(path: string): AsyncGenerator<TrialResult, void, unknown> {
-  yield* streamJsonl<TrialResult>(path, TrialResultSchema)
-}
-
-/**
- * Count lines in a JSONL file using streaming.
- *
- * @remarks
- * Counts non-empty lines without loading the entire file into memory.
- * Uses byte-level newline scanning for efficiency.
- *
- * @param path - Path to the JSONL file
- * @returns Number of non-empty lines
- *
- * @public
- */
-export const countLinesStreaming = async (path: string): Promise<number> => {
-  const file = Bun.file(path)
-  const stream = file.stream()
-  const decoder = new TextDecoder()
-
-  let count = 0
-  let buffer = ''
-
-  for await (const chunk of stream) {
-    buffer += decoder.decode(chunk, { stream: true })
-
-    let newlineIndex = buffer.indexOf('\n')
-    while (newlineIndex !== -1) {
-      const line = buffer.slice(0, newlineIndex).trim()
-      buffer = buffer.slice(newlineIndex + 1)
-      if (line) count++
-      newlineIndex = buffer.indexOf('\n')
-    }
-  }
-
-  // Flush and check final line
-  buffer += decoder.decode()
-  if (buffer.trim()) count++
-
-  return count
-}
diff --git a/src/core/tests/core.spec.ts b/src/core/tests/core.spec.ts
deleted file mode 100644
index 1d97f61..0000000
--- a/src/core/tests/core.spec.ts
+++ /dev/null
@@ -1,310 +0,0 @@
-/**
- * Unit tests for core utilities.
- *
- * @remarks
- * Tests for shared utility functions in the core module:
- * - loading: loadPrompts, loadResults, loadJsonl
- * - trajectory: extractTrajectory, extractOutput, hasToolErrors
- * - output: writeOutput, logProgress, headTailPreview
- *
- * @packageDocumentation
- */
-
-import { afterEach, describe, expect, test } from 'bun:test'
-import { unlink, writeFile } from 'node:fs/promises'
-import type { ParsedUpdate } from '../../headless/headless-output-parser.ts'
-import { loadJsonl, loadPrompts, loadResults } from '../loading.ts'
-import { headTailPreview, resolvePath } from '../output.ts'
-import { detectTrajectoryRichness, extractOutput, extractTrajectory, hasToolErrors } from '../trajectory.ts'
-
-// ============================================================================
-// Loading Tests
-// ============================================================================
-
-describe('loadJsonl', () => {
-  const testFile = '/tmp/core-test-jsonl.jsonl'
-
-  afterEach(async () => {
-    try {
-      await unlink(testFile)
-    } catch {
-      // Ignore if file doesn't exist
-    }
-  })
-
-  test('loads and parses JSONL file', async () => {
-    await writeFile(testFile, '{"a":1}\n{"a":2}\n{"a":3}')
-    const results = await loadJsonl<{ a: number }>(testFile)
-    expect(results.length).toBe(3)
-    expect(results[0]?.a).toBe(1)
-    expect(results[2]?.a).toBe(3)
-  })
-
-  test('skips empty lines', async () => {
-    await writeFile(testFile, '{"a":1}\n\n{"a":2}\n')
-    const results = await loadJsonl<{ a: number }>(testFile)
-    expect(results.length).toBe(2)
-  })
-
-  test('handles empty file', async () => {
-    await writeFile(testFile, '')
-    const results = await loadJsonl(testFile)
-    expect(results.length).toBe(0)
-  })
-})
-
-describe('loadPrompts', () => {
-  const testFile = '/tmp/core-test-prompts.jsonl'
-
-  afterEach(async () => {
-    try {
-      await unlink(testFile)
-    } catch {
-      // Ignore
-    }
-  })
-
-  test('loads valid prompts', async () => {
-    await writeFile(testFile, '{"id":"p1","input":"hello"}\n{"id":"p2","input":"world"}')
-    const prompts = await loadPrompts(testFile)
-    expect(prompts.length).toBe(2)
-    expect(prompts[0]?.id).toBe('p1')
-    expect(prompts[0]?.input).toBe('hello')
-  })
-
-  test('loads multi-turn prompts', async () => {
-    await writeFile(testFile, '{"id":"m1","input":["turn1","turn2"]}')
-    const prompts = await loadPrompts(testFile)
-    expect(prompts.length).toBe(1)
-    expect(Array.isArray(prompts[0]?.input)).toBe(true)
-    expect((prompts[0]?.input as string[]).length).toBe(2)
-  })
-})
-
-describe('loadResults', () => {
-  const testFile = '/tmp/core-test-results.jsonl'
-
-  afterEach(async () => {
-    try {
-      await unlink(testFile)
-    } catch {
-      // Ignore
-    }
-  })
-
-  test('loads capture results with full schema', async () => {
-    const result = {
-      id: 'r1',
-      input: 'test',
-      output: 'result',
-      trajectory: [],
-      metadata: {},
-      toolErrors: false,
-      timing: {
-        start: 0,
-        end: 100,
-        total: 100,
-        sessionCreation: 10,
-      },
-    }
-    await writeFile(testFile, JSON.stringify(result))
-    const results = await loadResults(testFile)
-    expect(results.length).toBe(1)
-    expect(results[0]?.id).toBe('r1')
-    expect(results[0]?.output).toBe('result')
-  })
-})
-
-// ============================================================================
-// Trajectory Tests
-// ============================================================================
-
-describe('extractTrajectory', () => {
-  const startTime = 1000
-
-  test('extracts message updates', () => {
-    const updates: ParsedUpdate[] = [{ type: 'message', content: 'Hello', timestamp: 1100, raw: {} }]
-    const trajectory = extractTrajectory(updates, startTime)
-    expect(trajectory.length).toBe(1)
-    expect(trajectory[0]?.type).toBe('message')
-    expect(trajectory[0]?.type === 'message' && trajectory[0]?.content).toBe('Hello')
-  })
-
-  test('extracts thought updates', () => {
-    const updates: ParsedUpdate[] = [{ type: 'thought', content: 'Thinking...', timestamp: 1200, raw: {} }]
-    const trajectory = extractTrajectory(updates, startTime)
-    expect(trajectory.length).toBe(1)
-    expect(trajectory[0]?.type).toBe('thought')
-  })
-
-  test('extracts tool_call with title', () => {
-    const updates: ParsedUpdate[] = [
-      {
-        type: 'tool_call',
-        title: 'Read',
-        status: 'completed',
-        timestamp: 1300,
-        raw: {},
-      },
-    ]
-    const trajectory = extractTrajectory(updates, startTime)
-    expect(trajectory.length).toBe(1)
-    expect(trajectory[0]?.type).toBe('tool_call')
-    const step = trajectory[0]
-    if (step?.type === 'tool_call') {
-      expect(step.name).toBe('Read')
-    }
-  })
-
-  test('handles empty updates', () => {
-    const trajectory = extractTrajectory([], startTime)
-    expect(trajectory.length).toBe(0)
-  })
-})
-
-describe('extractOutput', () => {
-  test('concatenates all message content', () => {
-    const trajectory = [
-      { type: 'thought' as const, content: 'Thinking', timestamp: 50 },
-      { type: 'message' as const, content: 'First message', timestamp: 100 },
-      { type: 'message' as const, content: 'Final answer', timestamp: 150 },
-    ]
-    const output = extractOutput(trajectory)
-    // extractOutput joins all messages with newline
-    expect(output).toBe('First message\nFinal answer')
-  })
-
-  test('returns empty string when no messages', () => {
-    const trajectory = [{ type: 'thought' as const, content: 'Thinking only', timestamp: 50 }]
-    const output = extractOutput(trajectory)
-    expect(output).toBe('')
-  })
-
-  test('handles empty trajectory', () => {
-    const output = extractOutput([])
-    expect(output).toBe('')
-  })
-})
-
-describe('hasToolErrors', () => {
-  test('returns false for successful tool calls', () => {
-    const trajectory = [
-      {
-        type: 'tool_call' as const,
-        name: 'Read',
-        status: 'completed',
-        timestamp: 100,
-      },
-    ]
-    expect(hasToolErrors(trajectory)).toBe(false)
-  })
-
-  test('returns true for failed status', () => {
-    const trajectory = [
-      {
-        type: 'tool_call' as const,
-        name: 'Read',
-        status: 'failed',
-        timestamp: 100,
-      },
-    ]
-    // hasToolErrors checks for status === 'failed'
-    expect(hasToolErrors(trajectory)).toBe(true)
-  })
-
-  test('returns false for error status (not failed)', () => {
-    // The implementation checks for 'failed', not 'error'
-    const trajectory = [
-      {
-        type: 'tool_call' as const,
-        name: 'Read',
-        status: 'error',
-        timestamp: 100,
-      },
-    ]
-    expect(hasToolErrors(trajectory)).toBe(false)
-  })
-
-  test('returns false for empty trajectory', () => {
-    expect(hasToolErrors([])).toBe(false)
-  })
-})
-
-describe('detectTrajectoryRichness', () => {
-  test('returns full when has thoughts', () => {
-    const trajectory = [
-      { type: 'thought' as const, content: 'Let me think', timestamp: 50 },
-      { type: 'message' as const, content: 'Done', timestamp: 150 },
-    ]
-    expect(detectTrajectoryRichness(trajectory)).toBe('full')
-  })
-
-  test('returns full when has tool_calls', () => {
-    const trajectory = [
-      {
-        type: 'tool_call' as const,
-        name: 'Read',
-        status: 'completed',
-        timestamp: 100,
-      },
-      { type: 'message' as const, content: 'Done', timestamp: 150 },
-    ]
-    // Any tool_call means 'full'
-    expect(detectTrajectoryRichness(trajectory)).toBe('full')
-  })
-
-  test('returns messages-only when only messages', () => {
-    const trajectory = [{ type: 'message' as const, content: 'Just a message', timestamp: 100 }]
-    expect(detectTrajectoryRichness(trajectory)).toBe('messages-only')
-  })
-
-  test('returns minimal for empty trajectory', () => {
-    // Empty trajectory returns 'minimal', not 'messages-only'
-    expect(detectTrajectoryRichness([])).toBe('minimal')
-  })
-})
-
-// ============================================================================
-// Output Tests
-// ============================================================================
-
-describe('headTailPreview', () => {
-  test('returns full content when short', () => {
-    const content = 'line1\nline2\nline3'
-    const preview = headTailPreview(content, 5, 3)
-    expect(preview).toBe(content)
-  })
-
-  test('truncates long content with omission indicator', () => {
-    const lines = Array.from({ length: 20 }, (_, i) => `line${i + 1}`).join('\n')
-    const preview = headTailPreview(lines, 3, 2)
-
-    expect(preview).toContain('line1')
-    expect(preview).toContain('line2')
-    expect(preview).toContain('line3')
-    // Actual format uses "// ... N lines omitted ..."
-    expect(preview).toContain('// ... 15 lines omitted ...')
-    expect(preview).toContain('line19')
-    expect(preview).toContain('line20')
-  })
-
-  test('handles exact boundary', () => {
-    const lines = 'line1\nline2\nline3\nline4\nline5'
-    const preview = headTailPreview(lines, 3, 2)
-    // 5 lines is exactly head(3) + tail(2), no truncation needed
-    expect(preview).toBe(lines)
-  })
-})
-
-describe('resolvePath', () => {
-  test('resolves relative path from cwd', () => {
-    const resolved = resolvePath('./test.txt')
-    expect(resolved.endsWith('test.txt')).toBe(true)
-    expect(resolved.startsWith('/')).toBe(true)
-  })
-
-  test('returns absolute path unchanged', () => {
-    const path = '/absolute/path/file.txt'
-    expect(resolvePath(path)).toBe(path)
-  })
-})
diff --git a/src/core/tests/streaming.spec.ts b/src/core/tests/streaming.spec.ts
deleted file mode 100644
index 633254c..0000000
--- a/src/core/tests/streaming.spec.ts
+++ /dev/null
@@ -1,399 +0,0 @@
-/**
- * Unit tests for native streaming utilities.
- *
- * @remarks
- * Tests for memory-efficient streaming functions in streaming.ts:
- * - streamJsonl: Generic JSONL streaming with optional schema validation
- * - streamPrompts: PromptCase streaming
- * - streamResultsNative: CaptureResult streaming
- * - streamTrialResults: TrialResult streaming
- * - countLinesStreaming: Line counting without full file load
- *
- * @packageDocumentation
- */
-
-import { afterEach, describe, expect, test } from 'bun:test'
-import { unlink } from 'node:fs/promises'
-import { z } from 'zod'
-import {
-  countLinesStreaming,
-  streamJsonl,
-  streamPrompts,
-  streamResultsNative,
-  streamTrialResults,
-} from '../streaming.ts'
-
-// ============================================================================
-// streamJsonl Tests
-// ============================================================================
-
-describe('streamJsonl', () => {
-  const testFile = '/tmp/streaming-test-jsonl.jsonl'
-
-  afterEach(async () => {
-    try {
-      await unlink(testFile)
-    } catch {
-      // Ignore if file doesn't exist
-    }
-  })
-
-  test('streams items one at a time', async () => {
-    await Bun.write(testFile, '{"a":1}\n{"a":2}\n{"a":3}')
-
-    const items: Array<{ a: number }> = []
-    for await (const item of streamJsonl<{ a: number }>(testFile)) {
-      items.push(item)
-    }
-
-    expect(items.length).toBe(3)
-    expect(items[0]?.a).toBe(1)
-    expect(items[1]?.a).toBe(2)
-    expect(items[2]?.a).toBe(3)
-  })
-
-  test('handles files without trailing newline', async () => {
-    await Bun.write(testFile, '{"a":1}\n{"a":2}')
-
-    const items: Array<{ a: number }> = []
-    for await (const item of streamJsonl<{ a: number }>(testFile)) {
-      items.push(item)
-    }
-
-    expect(items.length).toBe(2)
-    expect(items[1]?.a).toBe(2)
-  })
-
-  test('validates with schema when provided', async () => {
-    const schema = z.object({ id: z.string(), value: z.number() })
-    await Bun.write(testFile, '{"id":"a","value":1}\n{"id":"b","value":2}')
-
-    const items: Array<{ id: string; value: number }> = []
-    for await (const item of streamJsonl(testFile, schema)) {
-      items.push(item)
-    }
-
-    expect(items.length).toBe(2)
-    expect(items[0]?.id).toBe('a')
-    expect(items[0]?.value).toBe(1)
-  })
-
-  test('throws with line number on invalid JSON', async () => {
-    await Bun.write(testFile, '{"a":1}\ninvalid json\n{"a":3}')
-
-    const items: unknown[] = []
-    let error: Error | undefined
-
-    try {
-      for await (const item of streamJsonl(testFile)) {
-        items.push(item)
-      }
-    } catch (e) {
-      error = e as Error
-    }
-
-    expect(error).toBeDefined()
-    expect(error?.message).toContain('line 2')
-  })
-
-  test('throws with line number on schema validation failure', async () => {
-    const schema = z.object({ id: z.string(), required: z.number() })
-    await Bun.write(testFile, '{"id":"a","required":1}\n{"id":"b"}')
-
-    const items: unknown[] = []
-    let error: Error | undefined
-
-    try {
-      for await (const item of streamJsonl(testFile, schema)) {
-        items.push(item)
-      }
-    } catch (e) {
-      error = e as Error
-    }
-
-    expect(error).toBeDefined()
-    expect(error?.message).toContain('line 2')
-  })
-
-  test('handles empty files', async () => {
-    await Bun.write(testFile, '')
-
-    const items: unknown[] = []
-    for await (const item of streamJsonl(testFile)) {
-      items.push(item)
-    }
-
-    expect(items.length).toBe(0)
-  })
-
-  test('handles single-line files', async () => {
-    await Bun.write(testFile, '{"single":true}')
-
-    const items: Array<{ single: boolean }> = []
-    for await (const item of streamJsonl<{ single: boolean }>(testFile)) {
-      items.push(item)
-    }
-
-    expect(items.length).toBe(1)
-    expect(items[0]?.single).toBe(true)
-  })
-
-  test('skips empty lines', async () => {
-    await Bun.write(testFile, '{"a":1}\n\n\n{"a":2}\n')
-
-    const items: Array<{ a: number }> = []
-    for await (const item of streamJsonl<{ a: number }>(testFile)) {
-      items.push(item)
-    }
-
-    expect(items.length).toBe(2)
-  })
-
-  test('handles whitespace-only lines', async () => {
-    await Bun.write(testFile, '{"a":1}\n   \n{"a":2}')
-
-    const items: Array<{ a: number }> = []
-    for await (const item of streamJsonl<{ a: number }>(testFile)) {
-      items.push(item)
-    }
-
-    expect(items.length).toBe(2)
-  })
-})
-
-// ============================================================================
-// streamPrompts Tests
-// ============================================================================
-
-describe('streamPrompts', () => {
-  const testFile = '/tmp/streaming-test-prompts.jsonl'
-
-  afterEach(async () => {
-    try {
-      await unlink(testFile)
-    } catch {
-      // Ignore
-    }
-  })
-
-  test('yields validated PromptCase objects', async () => {
-    await Bun.write(testFile, '{"id":"p1","input":"hello"}\n{"id":"p2","input":"world"}')
-
-    const prompts = []
-    for await (const prompt of streamPrompts(testFile)) {
-      prompts.push(prompt)
-    }
-
-    expect(prompts.length).toBe(2)
-    expect(prompts[0]?.id).toBe('p1')
-    expect(prompts[0]?.input).toBe('hello')
-  })
-
-  test('handles multi-turn prompts', async () => {
-    await Bun.write(testFile, '{"id":"m1","input":["turn1","turn2"]}')
-
-    const prompts = []
-    for await (const prompt of streamPrompts(testFile)) {
-      prompts.push(prompt)
-    }
-
-    expect(prompts.length).toBe(1)
-    expect(Array.isArray(prompts[0]?.input)).toBe(true)
-  })
-
-  test('throws on schema validation failure', async () => {
-    // Missing required 'id' field
-    await Bun.write(testFile, '{"input":"hello"}')
-
-    let error: Error | undefined
-    try {
-      for await (const _ of streamPrompts(testFile)) {
-        // Consume
-      }
-    } catch (e) {
-      error = e as Error
-    }
-
-    expect(error).toBeDefined()
-    expect(error?.message).toContain('line 1')
-  })
-})
-
-// ============================================================================
-// streamResultsNative Tests
-// ============================================================================
-
-describe('streamResultsNative', () => {
-  const testFile = '/tmp/streaming-test-results.jsonl'
-
-  afterEach(async () => {
-    try {
-      await unlink(testFile)
-    } catch {
-      // Ignore
-    }
-  })
-
-  test('yields validated CaptureResult objects', async () => {
-    const result = {
-      id: 'r1',
-      input: 'test',
-      output: 'result',
-      trajectory: [],
-      metadata: {},
-      toolErrors: false,
-      timing: {
-        start: 0,
-        end: 100,
-        total: 100,
-        sessionCreation: 10,
-      },
-    }
-    await Bun.write(testFile, JSON.stringify(result))
-
-    const results = []
-    for await (const r of streamResultsNative(testFile)) {
-      results.push(r)
-    }
-
-    expect(results.length).toBe(1)
-    expect(results[0]?.id).toBe('r1')
-    expect(results[0]?.output).toBe('result')
-  })
-
-  test('streams multiple results', async () => {
-    const makeResult = (id: string) => ({
-      id,
-      input: 'test',
-      output: 'result',
-      trajectory: [],
-      metadata: {},
-      toolErrors: false,
-      timing: { start: 0, end: 100, total: 100, sessionCreation: 10 },
-    })
-
-    await Bun.write(
-      testFile,
-      `${JSON.stringify(makeResult('r1'))}\n${JSON.stringify(makeResult('r2'))}\n${JSON.stringify(makeResult('r3'))}`,
-    )
-
-    const results = []
-    for await (const r of streamResultsNative(testFile)) {
-      results.push(r)
-    }
-
-    expect(results.length).toBe(3)
-    expect(results.map((r) => r.id)).toEqual(['r1', 'r2', 'r3'])
-  })
-})
-
-// ============================================================================
-// streamTrialResults Tests
-// ============================================================================
-
-describe('streamTrialResults', () => {
-  const testFile = '/tmp/streaming-test-trials.jsonl'
-
-  afterEach(async () => {
-    try {
-      await unlink(testFile)
-    } catch {
-      // Ignore
-    }
-  })
-
-  test('yields validated TrialResult objects', async () => {
-    const trialResult = {
-      id: 't1',
-      input: 'test prompt',
-      k: 3,
-      passRate: 0.67,
-      passAtK: 1,
-      passExpK: 0.7,
-      trials: [
-        { trialNum: 1, output: 'output1', trajectory: [], duration: 100, pass: true },
-        { trialNum: 2, output: 'output2', trajectory: [], duration: 150, pass: true },
-        { trialNum: 3, output: 'output3', trajectory: [], duration: 120, pass: false },
-      ],
-    }
-    await Bun.write(testFile, JSON.stringify(trialResult))
-
-    const results = []
-    for await (const r of streamTrialResults(testFile)) {
-      results.push(r)
-    }
-
-    expect(results.length).toBe(1)
-    expect(results[0]?.id).toBe('t1')
-    expect(results[0]?.k).toBe(3)
-    expect(results[0]?.passRate).toBe(0.67)
-  })
-
-  test('throws on invalid trial result', async () => {
-    // Missing required 'k' field
-    await Bun.write(testFile, '{"id":"t1","input":"test","trials":[]}')
-
-    let error: Error | undefined
-    try {
-      for await (const _ of streamTrialResults(testFile)) {
-        // Consume
-      }
-    } catch (e) {
-      error = e as Error
-    }
-
-    expect(error).toBeDefined()
-    expect(error?.message).toContain('line 1')
-  })
-})
-
-// ============================================================================
-// countLinesStreaming Tests
-// ============================================================================
-
-describe('countLinesStreaming', () => {
-  const testFile = '/tmp/streaming-test-count.jsonl'
-
-  afterEach(async () => {
-    try {
-      await unlink(testFile)
-    } catch {
-      // Ignore
-    }
-  })
-
-  test('counts lines without loading full file', async () => {
-    await Bun.write(testFile, '{"a":1}\n{"a":2}\n{"a":3}')
-
-    const count = await countLinesStreaming(testFile)
-    expect(count).toBe(3)
-  })
-
-  test('handles empty file', async () => {
-    await Bun.write(testFile, '')
-
-    const count = await countLinesStreaming(testFile)
-    expect(count).toBe(0)
-  })
-
-  test('handles file without trailing newline', async () => {
-    await Bun.write(testFile, '{"a":1}\n{"a":2}')
-
-    const count = await countLinesStreaming(testFile)
-    expect(count).toBe(2)
-  })
-
-  test('skips empty lines', async () => {
-    await Bun.write(testFile, '{"a":1}\n\n{"a":2}\n\n')
-
-    const count = await countLinesStreaming(testFile)
-    expect(count).toBe(2)
-  })
-
-  test('handles single-line file', async () => {
-    await Bun.write(testFile, '{"single":true}')
-
-    const count = await countLinesStreaming(testFile)
-    expect(count).toBe(1)
-  })
-})
diff --git a/src/core/tests/worker-pool.spec.ts b/src/core/tests/worker-pool.spec.ts
deleted file mode 100644
index d17ae9c..0000000
--- a/src/core/tests/worker-pool.spec.ts
+++ /dev/null
@@ -1,377 +0,0 @@
-/**
- * Unit tests for worker pool utilities.
- *
- * @remarks
- * Tests for parallel execution utilities:
- * - runWorkerPool: Promise-based worker pool with concurrency limit
- * - createWriteMutex: Coordinates concurrent file writes
- * - createWorkspaceDir: Creates per-prompt workspace directories
- *
- * @packageDocumentation
- */
-
-import { afterEach, describe, expect, test } from 'bun:test'
-import { rm, stat } from 'node:fs/promises'
-import { createWorkspaceDir, createWriteMutex, runWorkerPool } from '../worker-pool.ts'
-
-// Helper to check if a directory exists
-const dirExists = async (path: string): Promise<boolean> => {
-  try {
-    const s = await stat(path)
-    return s.isDirectory()
-  } catch {
-    return false
-  }
-}
-
-// ============================================================================
-// runWorkerPool Tests
-// ============================================================================
-
-describe('runWorkerPool', () => {
-  test('processes items sequentially with concurrency 1', async () => {
-    const order: number[] = []
-    const items = [1, 2, 3, 4, 5]
-
-    const { results } = await runWorkerPool(
-      items,
-      async (item) => {
-        order.push(item)
-        return item * 2
-      },
-      { concurrency: 1 },
-    )
-
-    // With concurrency 1, order should be preserved
-    expect(order).toEqual([1, 2, 3, 4, 5])
-    expect(results).toEqual([2, 4, 6, 8, 10])
-  })
-
-  test('processes items in parallel with concurrency > 1', async () => {
-    const items = [1, 2, 3, 4]
-    const startTimes: number[] = []
-
-    const { results } = await runWorkerPool(
-      items,
-      async (item, index) => {
-        startTimes[index] = Date.now()
-        await Bun.sleep(50) // Simulate work
-        return item * 2
-      },
-      { concurrency: 4 },
-    )
-
-    // All results should be correct
-    expect(results.sort((a, b) => a - b)).toEqual([2, 4, 6, 8])
-
-    // With concurrency 4, all items should start nearly simultaneously
-    const maxDiff = Math.max(...startTimes) - Math.min(...startTimes)
-    expect(maxDiff).toBeLessThan(30) // Should all start within 30ms
-  })
-
-  test('limits concurrency correctly', async () => {
-    let activeCount = 0
-    let maxActive = 0
-    const items = [1, 2, 3, 4, 5, 6]
-
-    await runWorkerPool(
-      items,
-      async (item) => {
-        activeCount++
-        maxActive = Math.max(maxActive, activeCount)
-        await Bun.sleep(20) // Simulate work
-        activeCount--
-        return item
-      },
-      { concurrency: 2 },
-    )
-
-    // Should never exceed concurrency limit
-    expect(maxActive).toBeLessThanOrEqual(2)
-  })
-
-  test('collects errors without stopping other workers', async () => {
-    const items = [1, 2, 3, 4, 5]
-
-    const { results, errors } = await runWorkerPool(
-      items,
-      async (item) => {
-        if (item === 3) {
-          throw new Error('Item 3 failed')
-        }
-        return item * 2
-      },
-      { concurrency: 2 },
-    )
-
-    // Should have 4 results and 1 error
-    expect(results.length).toBe(4)
-    expect(errors.length).toBe(1)
-    expect(errors[0]?.index).toBe(2) // Index of item 3
-    expect(errors[0]?.error.message).toBe('Item 3 failed')
-  })
-
-  test('calls onProgress callback', async () => {
-    const progressCalls: Array<{ completed: number; total: number }> = []
-    const items = [1, 2, 3]
-
-    await runWorkerPool(items, async (item) => item * 2, {
-      concurrency: 1,
-      onProgress: (completed, total) => {
-        progressCalls.push({ completed, total })
-      },
-    })
-
-    expect(progressCalls.length).toBe(3)
-    expect(progressCalls[0]).toEqual({ completed: 1, total: 3 })
-    expect(progressCalls[1]).toEqual({ completed: 2, total: 3 })
-    expect(progressCalls[2]).toEqual({ completed: 3, total: 3 })
-  })
-
-  test('handles empty items array', async () => {
-    const { results, errors } = await runWorkerPool([] as number[], async (item) => item * 2, { concurrency: 4 })
-
-    expect(results).toEqual([])
-    expect(errors).toEqual([])
-  })
-
-  test('skips undefined items in array', async () => {
-    // Create a sparse array with holes
-    const items: (number | undefined)[] = [1, undefined, 3, undefined, 5]
-
-    const { results } = await runWorkerPool(
-      items,
-      async (item) => {
-        if (item === undefined) throw new Error('Should not process undefined')
-        return item * 2
-      },
-      { concurrency: 2 },
-    )
-
-    // Should only process defined items
-    expect(results.sort((a, b) => a - b)).toEqual([2, 6, 10])
-  })
-
-  test('handles concurrency greater than items count', async () => {
-    const items = [1, 2]
-    const { results } = await runWorkerPool(items, async (item) => item * 2, { concurrency: 10 })
-
-    expect(results.sort((a, b) => a - b)).toEqual([2, 4])
-  })
-})
-
-// ============================================================================
-// createWriteMutex Tests
-// ============================================================================
-
-describe('createWriteMutex', () => {
-  test('serializes concurrent writes', async () => {
-    const mutex = createWriteMutex()
-    const order: number[] = []
-
-    // Start multiple writes concurrently
-    const promises = [1, 2, 3, 4, 5].map((n) =>
-      mutex.write(async () => {
-        await Bun.sleep(10) // Simulate write delay
-        order.push(n)
-      }),
-    )
-
-    await Promise.all(promises)
-
-    // All writes should complete in order
-    expect(order).toEqual([1, 2, 3, 4, 5])
-  })
-
-  test('continues after failed write', async () => {
-    const mutex = createWriteMutex()
-    const order: number[] = []
-
-    const promise1 = mutex.write(async () => {
-      order.push(1)
-    })
-
-    const promise2 = mutex.write(async () => {
-      order.push(2)
-      throw new Error('Write 2 failed')
-    })
-
-    const promise3 = mutex.write(async () => {
-      order.push(3)
-    })
-
-    await promise1
-    await promise2.catch(() => {}) // Ignore error
-    await promise3
-
-    // All writes should execute in order, even after failure
-    expect(order).toEqual([1, 2, 3])
-  })
-
-  test('returns promise that resolves when write completes', async () => {
-    const mutex = createWriteMutex()
-    let writeCompleted = false
-
-    const promise = mutex.write(async () => {
-      await Bun.sleep(10)
-      writeCompleted = true
-    })
-
-    expect(writeCompleted).toBe(false)
-    await promise
-    expect(writeCompleted).toBe(true)
-  })
-})
-
-// ============================================================================
-// createWorkspaceDir Tests
-// ============================================================================
-
-describe('createWorkspaceDir', () => {
-  const testBaseDir = '/tmp/worker-pool-test-workspaces'
-
-  afterEach(async () => {
-    try {
-      await rm(testBaseDir, { recursive: true, force: true })
-    } catch {
-      // Ignore if doesn't exist
-    }
-  })
-
-  test('creates workspace directory', async () => {
-    const workspaceDir = await createWorkspaceDir(testBaseDir, 'test-prompt-1')
-
-    expect(workspaceDir).toBe(`${testBaseDir}/prompt-test-prompt-1`)
-    expect(await dirExists(workspaceDir)).toBe(true)
-  })
-
-  test('sanitizes invalid filesystem characters', async () => {
-    const workspaceDir = await createWorkspaceDir(testBaseDir, 'test<>:"/\\|?*prompt')
-
-    // Invalid characters should be replaced with underscore
-    expect(workspaceDir).toBe(`${testBaseDir}/prompt-test_________prompt`)
-    expect(await dirExists(workspaceDir)).toBe(true)
-  })
-
-  test('handles existing directory', async () => {
-    // Create first
-    const dir1 = await createWorkspaceDir(testBaseDir, 'existing')
-    // Create same again
-    const dir2 = await createWorkspaceDir(testBaseDir, 'existing')
-
-    expect(dir1).toBe(dir2)
-    expect(await dirExists(dir1)).toBe(true)
-  })
-
-  test('creates nested base directory', async () => {
-    const nestedBase = `${testBaseDir}/deep/nested/path`
-    const workspaceDir = await createWorkspaceDir(nestedBase, 'prompt-1')
-
-    expect(workspaceDir).toBe(`${nestedBase}/prompt-prompt-1`)
-    expect(await dirExists(workspaceDir)).toBe(true)
-  })
-})
-
-// ============================================================================
-// Integration Tests
-// ============================================================================
-
-describe('worker pool with write mutex integration', () => {
-  test('coordinates writes from concurrent workers', async () => {
-    const mutex = createWriteMutex()
-    const writeOrder: string[] = []
-    const items = ['a', 'b', 'c', 'd', 'e']
-
-    await runWorkerPool(
-      items,
-      async (item) => {
-        // Simulate variable processing time
-        await Bun.sleep(Math.random() * 20)
-
-        // Write with mutex coordination
-        await mutex.write(async () => {
-          writeOrder.push(item)
-        })
-
-        return item
-      },
-      { concurrency: 3 },
-    )
-
-    // All items should be written exactly once
-    expect(writeOrder.sort()).toEqual(['a', 'b', 'c', 'd', 'e'])
-    // Order depends on which worker finishes first, but all should be present
-    expect(writeOrder.length).toBe(5)
-  })
-
-  test('produces valid JSONL with concurrent writes to file', async () => {
-    const mutex = createWriteMutex()
-    const items = Array.from({ length: 10 }, (_, i) => ({ id: `test-${i}`, value: i }))
-
-    // Collect lines in memory, then verify structure
-    const lines: string[] = []
-
-    await runWorkerPool(
-      items,
-      async (item) => {
-        // Simulate variable processing time
-        await Bun.sleep(Math.random() * 30)
-
-        // Write JSONL line with mutex coordination (same pattern as capture.ts)
-        await mutex.write(async () => {
-          const line = JSON.stringify(item)
-          lines.push(line)
-        })
-
-        return item
-      },
-      { concurrency: 4 },
-    )
-
-    // Should have all 10 items
-    expect(lines.length).toBe(10)
-
-    // Each line should be valid JSON
-    const parsed = lines.map((line) => JSON.parse(line))
-    const ids = parsed.map((p) => p.id).sort()
-
-    // All items present (order may vary)
-    expect(ids).toEqual(items.map((i) => i.id).sort())
-  })
-
-  test('creates workspace directories concurrently without collision', async () => {
-    const testBase = '/tmp/worker-pool-workspace-test'
-    const items = ['prompt-1', 'prompt-2', 'prompt-3', 'prompt-4', 'prompt-5']
-
-    // Clean up first
-    try {
-      await rm(testBase, { recursive: true, force: true })
-    } catch {
-      // Ignore
-    }
-
-    const createdDirs: string[] = []
-
-    await runWorkerPool(
-      items,
-      async (promptId) => {
-        const dir = await createWorkspaceDir(testBase, promptId)
-        createdDirs.push(dir)
-        return dir
-      },
-      { concurrency: 5 },
-    )
-
-    // All directories created
-    expect(createdDirs.length).toBe(5)
-
-    // Verify each directory exists
-    for (const dir of createdDirs) {
-      const exists = await dirExists(dir)
-      expect(exists).toBe(true)
-    }
-
-    // Cleanup
-    await rm(testBase, { recursive: true, force: true })
-  })
-})
diff --git a/src/core/trajectory.ts b/src/core/trajectory.ts
deleted file mode 100644
index a448c00..0000000
--- a/src/core/trajectory.ts
+++ /dev/null
@@ -1,172 +0,0 @@
-/**
- * Shared trajectory utilities for extraction and analysis.
- *
- * @remarks
- * Provides functions for extracting trajectory data from parsed updates,
- * detecting richness levels, and checking for tool errors.
- *
- * @packageDocumentation
- */
-
-import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
-import type { TrajectoryRichness, TrajectoryStep } from '../schemas.ts'
-import { ToolInputSchema } from '../schemas.ts'
-
-/**
- * Extract trajectory from parsed updates.
- *
- * @remarks
- * Converts ParsedUpdate stream into TrajectoryStep array.
- * Handles tool call deduplication (start/completion events).
- *
- * @param updates - Parsed updates from output parser
- * @param startTime - Reference time for timestamp calculation
- * @returns Array of trajectory steps with relative timestamps
- *
- * @public
- */
-export const extractTrajectory = (updates: ParsedUpdate[], startTime: number): TrajectoryStep[] => {
-  const trajectory: TrajectoryStep[] = []
-  const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()
-
-  for (const update of updates) {
-    const timestamp = update.timestamp - startTime
-
-    if (update.type === 'thought') {
-      trajectory.push({
-        type: 'thought',
-        content: update.content ?? '',
-        timestamp,
-      })
-    } else if (update.type === 'message') {
-      trajectory.push({
-        type: 'message',
-        content: update.content ?? '',
-        timestamp,
-      })
-    } else if (update.type === 'tool_call') {
-      const toolCallId = update.title ?? `tool_${timestamp}`
-      const existing = toolCallMap.get(toolCallId)
-
-      if (existing && update.status === 'completed') {
-        // Update existing tool call with completion info
-        existing.step.status = update.status
-        existing.step.duration = timestamp - existing.start
-        if (update.output !== undefined) {
-          existing.step.output = update.output
-        }
-        // Remove from map so a subsequent call with the same name starts fresh
-        toolCallMap.delete(toolCallId)
-      } else if (!existing) {
-        // New tool call
-        const step: TrajectoryStep & { type: 'tool_call' } = {
-          type: 'tool_call',
-          name: update.title ?? 'unknown',
-          status: update.status ?? 'pending',
-          ...(update.input !== undefined && { input: update.input }),
-          timestamp,
-        }
-        toolCallMap.set(toolCallId, { start: timestamp, step })
-        trajectory.push(step)
-      }
-    } else if (update.type === 'plan') {
-      trajectory.push({
-        type: 'plan',
-        entries: [],
-        timestamp,
-      })
-    }
-  }
-
-  return trajectory
-}
-
-/**
- * Extract final text output from trajectory.
- *
- * @remarks
- * Concatenates all message step content to produce final output string.
- *
- * @param trajectory - Trajectory steps from capture
- * @returns Concatenated message content
- *
- * @public
- */
-export const extractOutput = (trajectory: TrajectoryStep[]): string => {
-  return trajectory
-    .filter((step): step is TrajectoryStep & { type: 'message' } => step.type === 'message')
-    .map((step) => step.content)
-    .join('\n')
-}
-
-/**
- * Check if any tool calls failed in trajectory.
- *
- * @param trajectory - Trajectory steps from capture
- * @returns True if any tool call has 'failed' status
- *
- * @public
- */
-export const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => {
-  return trajectory.some((step) => step.type === 'tool_call' && step.status === 'failed')
-}
-
-/**
- * Detect trajectory richness level from captured steps.
- *
- * @remarks
- * Different adapters provide varying levels of detail:
- * - `full`: Has thoughts, tool calls, or plans (e.g., Claude Code)
- * - `messages-only`: Only message steps present
- * - `minimal`: Empty or unknown content
- *
- * Uses single-pass iteration with early exit for efficiency.
- *
- * @param trajectory - Trajectory steps from capture
- * @returns Detected richness level
- *
- * @public
- */
-export const detectTrajectoryRichness = (trajectory: TrajectoryStep[]): TrajectoryRichness => {
-  let hasMessages = false
-
-  for (const step of trajectory) {
-    // Early exit: any of these means 'full' richness
-    if (step.type === 'thought' || step.type === 'tool_call' || step.type === 'plan') {
-      return 'full'
-    }
-    if (step.type === 'message') {
-      hasMessages = true
-    }
-  }
-
-  return hasMessages ? 'messages-only' : 'minimal'
-}
-
-/**
- * Extract file path from tool input if present.
- *
- * @param input - Tool call input object
- * @returns File path string or undefined
- *
- * @public
- */
-export const extractFilePath = (input: unknown): string | undefined => {
-  const result = ToolInputSchema.safeParse(input)
-  if (!result.success) return undefined
-  return result.data.file_path ?? result.data.path
-}
-
-/**
- * Extract content from tool input if present.
- *
- * @param input - Tool call input object
- * @returns Content string or undefined
- *
- * @public
- */
-export const extractContent = (input: unknown): string | undefined => {
-  const result = ToolInputSchema.safeParse(input)
-  if (!result.success) return undefined
-  return result.data.content ?? result.data.new_string
-}
diff --git a/src/core/worker-pool.ts b/src/core/worker-pool.ts
deleted file mode 100644
index d851380..0000000
--- a/src/core/worker-pool.ts
+++ /dev/null
@@ -1,220 +0,0 @@
-/**
- * Promise-based worker pool for parallel task execution.
- *
- * @remarks
- * Implements a p-limit style concurrency limiter that:
- * - Processes items with configurable concurrency
- * - Maintains order-independent result collection
- * - Supports progress callbacks
- * - Coordinates file writes via mutex
- *
- * @packageDocumentation
- */
-
-import { mkdir } from 'node:fs/promises'
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/**
- * Progress callback for worker pool.
- *
- * @param completed - Number of completed tasks
- * @param total - Total number of tasks
- * @param result - Result of the just-completed task (if successful)
- * @param error - Error from the just-completed task (if failed)
- */
-export type ProgressCallback<T> = (completed: number, total: number, result?: T, error?: Error) => void
-
-/**
- * Options for worker pool execution.
- */
-export type WorkerPoolOptions<T> = {
-  /** Maximum concurrent workers (default: 1) */
-  concurrency: number
-  /** Progress callback called after each task completes */
-  onProgress?: ProgressCallback<T>
-}
-
-/**
- * Result of worker pool execution.
- */
-export type WorkerPoolResult<T> = {
-  /** Successfully completed results (in completion order, not input order) */
-  results: T[]
-  /** Errors encountered during execution */
-  errors: Array<{ index: number; error: Error }>
-}
-
-// ============================================================================
-// Write Mutex for JSONL Coordination
-// ============================================================================
-
-/**
- * Simple mutex for coordinating file writes.
- *
- * @remarks
- * Uses a promise chain to ensure only one write happens at a time.
- * This prevents data corruption when multiple workers complete simultaneously.
- */
-export type WriteMutex = {
-  /** Acquire lock, execute write, release lock */
-  write: (fn: () => Promise<void>) => Promise<void>
-}
-
-/**
- * Create a write mutex for coordinating file output.
- *
- * @returns WriteMutex instance
- */
-export const createWriteMutex = (): WriteMutex => {
-  let chain = Promise.resolve()
-
-  return {
-    write: (fn: () => Promise<void>): Promise<void> => {
-      // Chain this write after all previous writes
-      chain = chain.then(fn, fn) // Continue even if previous failed
-      return chain
-    },
-  }
-}
-
-// ============================================================================
-// Worker Pool Implementation
-// ============================================================================
-
-/**
- * Execute tasks in parallel with concurrency limit.
- *
- * @remarks
- * Uses a semaphore-style approach where workers grab the next available
- * task from a shared queue. Results are collected as tasks complete
- * (order may differ from input order).
- *
- * @param items - Array of items to process
- * @param worker - Async function to process each item
- * @param options - Pool configuration
- * @returns Results and any errors encountered
- *
- * @public
- */
-export const runWorkerPool = async <TItem, TResult>(
-  items: TItem[],
-  worker: (item: TItem, index: number) => Promise<TResult>,
-  options: WorkerPoolOptions<TResult>,
-): Promise<WorkerPoolResult<TResult>> => {
-  const { concurrency, onProgress } = options
-  const results: TResult[] = []
-  const errors: Array<{ index: number; error: Error }> = []
-
-  // Fast path: if concurrency is 1, process sequentially
-  if (concurrency === 1) {
-    for (let i = 0; i < items.length; i++) {
-      const item = items[i]
-      if (item === undefined) continue
-
-      try {
-        const result = await worker(item, i)
-        results.push(result)
-        onProgress?.(results.length + errors.length, items.length, result)
-      } catch (err) {
-        const error = err instanceof Error ? err : new Error(String(err))
-        errors.push({ index: i, error })
-        onProgress?.(results.length + errors.length, items.length, undefined, error)
-      }
-    }
-    return { results, errors }
-  }
-
-  // Shared state for work distribution
-  let nextIndex = 0
-  let completed = 0
-  const mutex = { lock: Promise.resolve() }
-
-  // Get next work item (thread-safe via single-threaded JS)
-  // Uses iterative loop instead of recursion to avoid stack overflow with sparse arrays
-  const getNextItem = (): { item: TItem; index: number } | undefined => {
-    while (nextIndex < items.length) {
-      const index = nextIndex++
-      const item = items[index]
-      if (item !== undefined) {
-        return { item, index }
-      }
-      // Skip undefined items and continue to next
-    }
-    return undefined
-  }
-
-  // Worker function that processes items until none remain
-  const runWorker = async (): Promise<void> => {
-    let work = getNextItem()
-    while (work) {
-      const { item, index } = work
-      try {
-        const result = await worker(item, index)
-
-        // Coordinate result collection
-        await new Promise<void>((resolve) => {
-          mutex.lock = mutex.lock.then(() => {
-            results.push(result)
-            completed++
-            onProgress?.(completed, items.length, result)
-            resolve()
-          })
-        })
-      } catch (err) {
-        const error = err instanceof Error ? err : new Error(String(err))
-
-        // Coordinate error collection
-        await new Promise<void>((resolve) => {
-          mutex.lock = mutex.lock.then(() => {
-            errors.push({ index, error })
-            completed++
-            onProgress?.(completed, items.length, undefined, error)
-            resolve()
-          })
-        })
-      }
-
-      work = getNextItem()
-    }
-  }
-
-  // Start N workers
-  const workers = Array.from({ length: Math.min(concurrency, items.length) }, () => runWorker())
-  await Promise.all(workers)
-
-  return { results, errors }
-}
-
-// ============================================================================
-// Workspace Directory Management
-// ============================================================================
-
-/**
- * Create a workspace directory for a prompt.
- *
- * @remarks
- * Creates an isolated directory for each prompt execution.
- * Directory is created if it doesn't exist. Directories persist
- * after completion for debugging/inspection - clean up manually
- * or via CI scripts if disk space is a concern.
- *
- * @param baseDir - Base workspace directory
- * @param promptId - Unique prompt identifier
- * @returns Absolute path to the workspace directory
- *
- * @public
- */
-export const createWorkspaceDir = async (baseDir: string, promptId: string): Promise<string> => {
-  // Sanitize promptId for filesystem (replace invalid chars with underscore)
-  const sanitizedId = promptId.replace(/[<>:"/\\|?*]/g, '_')
-  const workspaceDir = `${baseDir}/prompt-${sanitizedId}`
-
-  // Create directory (recursive, no error if exists)
-  // Uses fs.mkdir instead of shell to prevent command injection
-  await mkdir(workspaceDir, { recursive: true })
-
-  return workspaceDir
-}
diff --git a/src/graders.ts b/src/graders.ts
deleted file mode 100644
index f795ceb..0000000
--- a/src/graders.ts
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Built-in comparison graders for the agent eval harness.
- *
- * @remarks
- * Provides built-in strategies for comparing multiple runs:
- *
- * **For CaptureResult (single-run) data:**
- * - **weighted**: Configurable weights for quality, latency, reliability
- * - **statistical**: Bootstrap sampling for confidence intervals
- *
- * **For TrialResult (multi-run reliability) data:**
- * - **trialsWeighted**: Configurable weights for capability, reliability, consistency
- * - **trialsStatistical**: Bootstrap sampling for passAtK confidence intervals
- *
- * @packageDocumentation
- */
-
-// CaptureResult graders
-export { createStatisticalGrader, grade as statisticalGrade } from './graders/compare-statistical.ts'
-export {
-  createWeightedGrader,
-  DEFAULT_WEIGHTS,
-  getWeightsFromEnv,
-  grade as weightedGrade,
-  type Weights,
-} from './graders/compare-weighted.ts'
-
-// TrialResult graders
-export {
-  createTrialsStatisticalGrader,
-  grade as trialsStatisticalGrade,
-} from './graders/trials-compare-statistical.ts'
-export {
-  createTrialsWeightedGrader,
-  DEFAULT_TRIALS_WEIGHTS,
-  getTrialsWeightsFromEnv,
-  grade as trialsWeightedGrade,
-  type TrialsWeights,
-} from './graders/trials-compare-weighted.ts'
diff --git a/src/graders/bootstrap.ts b/src/graders/bootstrap.ts
deleted file mode 100644
index afcdff5..0000000
--- a/src/graders/bootstrap.ts
+++ /dev/null
@@ -1,135 +0,0 @@
-/**
- * Shared bootstrap sampling utilities for confidence interval computation.
- *
- * @remarks
- * Bootstrap resampling provides robust confidence intervals without
- * assuming a specific distribution. For small samples, it's more
- * reliable than parametric methods.
- *
- * Environment variable configuration:
- * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
- *
- * @packageDocumentation
- */
-
-/** Default number of bootstrap iterations */
-export const DEFAULT_ITERATIONS = 1000
-
-/** Default confidence level (95%) */
-export const DEFAULT_CONFIDENCE_LEVEL = 0.95
-
-/**
- * Confidence interval as [lower, upper] bounds.
- */
-export type ConfidenceInterval = [number, number]
-
-/**
- * Bootstrap confidence interval result.
- */
-export type BootstrapResult = {
-  /** Median of bootstrap sample means (50th percentile) */
-  median: number
-  /** Confidence interval [lower, upper] */
-  ci: ConfidenceInterval
-}
-
-/**
- * Configuration for bootstrap sampling.
- */
-export type BootstrapConfig = {
-  /** Number of bootstrap iterations (default: 1000) */
-  iterations?: number
-  /** Confidence level between 0 and 1 (default: 0.95) */
-  confidenceLevel?: number
-}
-
-/**
- * Compute bootstrap confidence interval for sample mean.
- *
- * @remarks
- * Bootstrap resampling provides robust confidence intervals without
- * assuming a specific distribution. For small samples, it's more
- * reliable than parametric methods.
- *
- * @param samples - Array of numeric samples
- * @param config - Optional bootstrap configuration
- * @returns Bootstrap median and confidence interval
- *
- * @public
- */
-export const bootstrap = (samples: number[], config?: BootstrapConfig): BootstrapResult => {
-  const iterations = config?.iterations ?? DEFAULT_ITERATIONS
-  const confidenceLevel = config?.confidenceLevel ?? DEFAULT_CONFIDENCE_LEVEL
-
-  if (samples.length === 0) {
-    return { median: 0, ci: [0, 0] }
-  }
-
-  if (samples.length === 1) {
-    const value = samples[0] ?? 0
-    return { median: value, ci: [value, value] }
-  }
-
-  const means: number[] = []
-
-  for (let i = 0; i < iterations; i++) {
-    // Resample with replacement - we know samples.length > 1 at this point
-    const resampled = Array.from(
-      { length: samples.length },
-      () => samples[Math.floor(Math.random() * samples.length)] as number,
-    )
-
-    // Compute mean of resampled data
-    const sum = resampled.reduce((acc, val) => acc + val, 0)
-    means.push(sum / resampled.length)
-  }
-
-  // Sort means for percentile calculation
-  means.sort((a, b) => a - b)
-
-  // Compute percentile indices based on confidence level
-  // For 95% CI: lower = 2.5th percentile, upper = 97.5th percentile
-  const alpha = (1 - confidenceLevel) / 2
-  const lowerIdx = Math.floor(iterations * alpha)
-  const upperIdx = Math.floor(iterations * (1 - alpha))
-
-  return {
-    median: means[Math.floor(iterations / 2)] ?? 0,
-    ci: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
-  }
-}
-
-/**
- * Format confidence interval as string.
- *
- * @param ci - Confidence interval [lower, upper]
- * @param decimals - Number of decimal places (default: 3)
- * @returns Formatted CI string or empty string if undefined
- *
- * @public
- */
-export const formatCI = (ci: ConfidenceInterval | undefined, decimals: number = 3): string => {
-  if (!ci) return ''
-  return `[${ci[0].toFixed(decimals)}, ${ci[1].toFixed(decimals)}]`
-}
-
-/**
- * Get bootstrap configuration from environment variables.
- *
- * @remarks
- * Reads configuration from:
- * - `COMPARE_BOOTSTRAP_ITERATIONS`: Number of iterations (min: 100)
- *
- * @returns Bootstrap configuration
- *
- * @public
- */
-export const getBootstrapConfigFromEnv = (): BootstrapConfig => {
-  const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
-  if (!envValue) return { iterations: DEFAULT_ITERATIONS }
-
-  const parsed = Number.parseInt(envValue, 10)
-  const iterations = Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
-
-  return { iterations }
-}
diff --git a/src/graders/compare-statistical.ts b/src/graders/compare-statistical.ts
deleted file mode 100644
index 9465277..0000000
--- a/src/graders/compare-statistical.ts
+++ /dev/null
@@ -1,115 +0,0 @@
-/**
- * Built-in statistical significance comparison grader.
- *
- * @remarks
- * Uses bootstrap sampling to compute confidence intervals for score estimates.
- * Flags when the winner is statistically significant (p<0.05, non-overlapping CIs).
- *
- * Bootstrap iterations can be customized via environment variable:
- * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
- *
- * @packageDocumentation
- */
-
-import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
-import { bootstrap, getBootstrapConfigFromEnv } from './bootstrap.ts'
-
-/**
- * Statistical significance comparison grader.
- *
- * @remarks
- * Compares runs using bootstrap sampling to determine if differences
- * are statistically significant. When confidence intervals don't overlap,
- * the difference is flagged as significant (p<0.05).
- *
- * **Single-sample limitation:** When comparing individual prompts, each run
- * provides only one score sample. Bootstrap with a single sample yields a
- * degenerate CI of `[value, value]`. This grader is most useful when:
- * - Aggregating results across multiple prompts
- * - Using with the full comparison report (which combines per-prompt comparisons)
- *
- * For single-prompt comparisons, consider the weighted grader instead.
- *
- * @public
- */
-export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
-  const config = getBootstrapConfigFromEnv()
-
-  // Collect scores for each run
-  const runStats = Object.entries(runs).map(([label, run]) => {
-    // Use grader score if available, otherwise 0
-    const score = run.score?.score ?? 0
-
-    // For single-prompt comparison, we only have one sample
-    // In practice, this grader is most useful when aggregating across prompts
-    const stats = bootstrap([score], config)
-
-    return { label, score, stats }
-  })
-
-  // Sort by bootstrap median descending
-  const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
-
-  // Check if winner is statistically significant
-  // CIs don't overlap = significant difference (approximately p<0.05)
-  let isSignificant = false
-  const first = sorted[0]
-  const second = sorted[1]
-  if (first && second) {
-    // Non-overlapping: first's lower bound > second's upper bound
-    isSignificant = first.stats.ci[0] > second.stats.ci[1]
-  }
-
-  const reasoning = isSignificant
-    ? `Winner "${first?.label}" is statistically significant (p<0.05, non-overlapping 95% CIs)`
-    : 'No statistically significant difference between top runs (overlapping 95% CIs)'
-
-  return {
-    rankings: sorted.map((s, i) => ({
-      run: s.label,
-      rank: i + 1,
-      score: s.stats.median,
-    })),
-    reasoning,
-  }
-}
-
-/**
- * Create a statistical grader with custom iteration count.
- *
- * @param iterations - Number of bootstrap iterations
- * @returns Comparison grader function
- *
- * @public
- */
-export const createStatisticalGrader = (iterations?: number): ComparisonGrader => {
-  const config = iterations ? { iterations } : getBootstrapConfigFromEnv()
-
-  return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
-    const runStats = Object.entries(runs).map(([label, run]) => {
-      const score = run.score?.score ?? 0
-      const stats = bootstrap([score], config)
-      return { label, score, stats }
-    })
-
-    const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
-
-    let isSignificant = false
-    const first = sorted[0]
-    const second = sorted[1]
-    if (first && second) {
-      isSignificant = first.stats.ci[0] > second.stats.ci[1]
-    }
-
-    return {
-      rankings: sorted.map((s, i) => ({
-        run: s.label,
-        rank: i + 1,
-        score: s.stats.median,
-      })),
-      reasoning: isSignificant
-        ? `Winner "${first?.label}" is statistically significant (p<0.05)`
-        : 'No statistically significant difference between top runs',
-    }
-  }
-}
diff --git a/src/graders/compare-weighted.ts b/src/graders/compare-weighted.ts
deleted file mode 100644
index 5b05e1f..0000000
--- a/src/graders/compare-weighted.ts
+++ /dev/null
@@ -1,112 +0,0 @@
-/**
- * Built-in weighted multi-dimensional comparison grader.
- *
- * @remarks
- * Configurable weights for quality, latency, and reliability.
- * Default strategy when no `--grader` is specified for the compare command.
- *
- * Weights can be customized via environment variables:
- * - `COMPARE_QUALITY` (default: 0.5)
- * - `COMPARE_LATENCY` (default: 0.3)
- * - `COMPARE_RELIABILITY` (default: 0.2)
- *
- * @packageDocumentation
- */
-
-import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
-
-/**
- * Weight configuration for comparison dimensions.
- */
-export type Weights = {
-  /** Weight for quality (pass/score) - how much correctness matters */
-  quality: number
-  /** Weight for latency - how much speed matters */
-  latency: number
-  /** Weight for reliability - how much error-free execution matters */
-  reliability: number
-}
-
-/** Default weights: quality=0.5, latency=0.3, reliability=0.2 */
-export const DEFAULT_WEIGHTS: Weights = {
-  quality: 0.5,
-  latency: 0.3,
-  reliability: 0.2,
-}
-
-/**
- * Read weights from environment variables with fallback to defaults.
- *
- * @returns Weights configuration
- */
-export const getWeightsFromEnv = (): Weights => {
-  const quality = Number.parseFloat(process.env.COMPARE_QUALITY ?? String(DEFAULT_WEIGHTS.quality))
-  const latency = Number.parseFloat(process.env.COMPARE_LATENCY ?? String(DEFAULT_WEIGHTS.latency))
-  const reliability = Number.parseFloat(process.env.COMPARE_RELIABILITY ?? String(DEFAULT_WEIGHTS.reliability))
-
-  return {
-    quality: Number.isNaN(quality) ? DEFAULT_WEIGHTS.quality : quality,
-    latency: Number.isNaN(latency) ? DEFAULT_WEIGHTS.latency : latency,
-    reliability: Number.isNaN(reliability) ? DEFAULT_WEIGHTS.reliability : reliability,
-  }
-}
-
-/**
- * Create a weighted comparison grader with custom weights.
- *
- * @param weights - Weight configuration for comparison dimensions
- * @returns Comparison grader function
- *
- * @public
- */
-export const createWeightedGrader = (weights: Weights = DEFAULT_WEIGHTS): ComparisonGrader => {
-  return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
-    const scores = Object.entries(runs).map(([label, run]) => {
-      // Quality score: use grader score if available, otherwise 0
-      // Note: run.score is only present if the result was graded
-      const qualityScore = run.score?.score ?? 0
-
-      // Latency score: inverse relationship (faster = better)
-      // Normalize: 1 / (1 + duration/1000) gives ~0.5 at 1s, ~0.1 at 10s
-      const duration = run.duration ?? 10000
-      const latencyScore = 1 / (1 + duration / 1000)
-
-      // Reliability score: 1 if no errors, 0 if errors
-      const hasErrors = run.toolErrors ?? false
-      const reliabilityScore = hasErrors ? 0 : 1
-
-      // Weighted combination
-      const weighted =
-        qualityScore * weights.quality + latencyScore * weights.latency + reliabilityScore * weights.reliability
-
-      return { label, weighted, qualityScore, latencyScore, reliabilityScore }
-    })
-
-    // Sort by weighted score descending (highest = best)
-    const sorted = scores.sort((a, b) => b.weighted - a.weighted)
-
-    return {
-      rankings: sorted.map((s, i) => ({
-        run: s.label,
-        rank: i + 1,
-        score: s.weighted,
-      })),
-      reasoning: `Weighted: quality=${weights.quality}, latency=${weights.latency}, reliability=${weights.reliability}`,
-    }
-  }
-}
-
-/**
- * Default weighted comparison grader using environment or default weights.
- *
- * @remarks
- * This is the default grader used when `--strategy weighted` is specified
- * or when no strategy is specified for the compare command.
- *
- * @public
- */
-export const grade: ComparisonGrader = async (input: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
-  const weights = getWeightsFromEnv()
-  const grader = createWeightedGrader(weights)
-  return grader(input)
-}
diff --git a/src/graders/tests/bootstrap.spec.ts b/src/graders/tests/bootstrap.spec.ts
deleted file mode 100644
index 83eecec..0000000
--- a/src/graders/tests/bootstrap.spec.ts
+++ /dev/null
@@ -1,169 +0,0 @@
-/**
- * Unit tests for bootstrap sampling utilities.
- */
-
-import { afterEach, describe, expect, test } from 'bun:test'
-import { bootstrap, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from '../bootstrap.ts'
-
-describe('bootstrap', () => {
-  describe('edge cases', () => {
-    test('returns {median: 0, ci: [0, 0]} for empty array', () => {
-      const result = bootstrap([])
-      expect(result.median).toBe(0)
-      expect(result.ci).toEqual([0, 0])
-    })
-
-    test('returns {median: value, ci: [value, value]} for single sample', () => {
-      const result = bootstrap([0.75])
-      expect(result.median).toBe(0.75)
-      expect(result.ci).toEqual([0.75, 0.75])
-    })
-
-    test('handles single sample of 0', () => {
-      const result = bootstrap([0])
-      expect(result.median).toBe(0)
-      expect(result.ci).toEqual([0, 0])
-    })
-
-    test('handles single sample of 1', () => {
-      const result = bootstrap([1])
-      expect(result.median).toBe(1)
-      expect(result.ci).toEqual([1, 1])
-    })
-  })
-
-  describe('confidence interval bounds', () => {
-    test('CI lower bound <= median <= CI upper bound', () => {
-      const samples = [0.5, 0.6, 0.7, 0.8, 0.9]
-      const result = bootstrap(samples, { iterations: 1000 })
-
-      expect(result.ci[0]).toBeLessThanOrEqual(result.median)
-      expect(result.median).toBeLessThanOrEqual(result.ci[1])
-    })
-
-    test('CI contains the true median for uniform samples', () => {
-      // For identical samples, CI should collapse to the value
-      const samples = [0.5, 0.5, 0.5, 0.5, 0.5]
-      const result = bootstrap(samples, { iterations: 1000 })
-
-      expect(result.median).toBeCloseTo(0.5, 2)
-      expect(result.ci[0]).toBeCloseTo(0.5, 2)
-      expect(result.ci[1]).toBeCloseTo(0.5, 2)
-    })
-
-    test('CI widens with more variance in samples', () => {
-      const lowVariance = [0.49, 0.5, 0.51]
-      const highVariance = [0.1, 0.5, 0.9]
-
-      const lowResult = bootstrap(lowVariance, { iterations: 1000 })
-      const highResult = bootstrap(highVariance, { iterations: 1000 })
-
-      const lowWidth = lowResult.ci[1] - lowResult.ci[0]
-      const highWidth = highResult.ci[1] - highResult.ci[0]
-
-      expect(highWidth).toBeGreaterThan(lowWidth)
-    })
-  })
-
-  describe('configuration', () => {
-    test('uses default iterations when not specified', () => {
-      // Just verify it runs without error with defaults
-      const result = bootstrap([0.5, 0.6, 0.7])
-      expect(result.median).toBeGreaterThan(0)
-    })
-
-    test('accepts custom iteration count', () => {
-      const result = bootstrap([0.5, 0.6, 0.7], { iterations: 100 })
-      expect(result.median).toBeGreaterThan(0)
-    })
-
-    test('accepts custom confidence level', () => {
-      const samples = [0.3, 0.4, 0.5, 0.6, 0.7]
-
-      // 90% CI should be narrower than 95% CI
-      const ci90 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.9 })
-      const ci95 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.95 })
-
-      const width90 = ci90.ci[1] - ci90.ci[0]
-      const width95 = ci95.ci[1] - ci95.ci[0]
-
-      // 95% CI should generally be wider than 90% CI
-      // Allow some tolerance due to randomness
-      expect(width95).toBeGreaterThanOrEqual(width90 * 0.8)
-    })
-  })
-
-  describe('statistical properties', () => {
-    test('median is close to sample mean', () => {
-      const samples = [0.2, 0.4, 0.6, 0.8, 1.0]
-      const sampleMean = samples.reduce((a, b) => a + b, 0) / samples.length
-
-      const result = bootstrap(samples, { iterations: 10000 })
-
-      // Bootstrap median should be close to sample mean for symmetric distributions
-      expect(result.median).toBeCloseTo(sampleMean, 1)
-    })
-
-    test('is deterministic-ish for large iteration counts', () => {
-      const samples = [0.3, 0.5, 0.7]
-
-      // With many iterations, results should be similar across runs
-      const result1 = bootstrap(samples, { iterations: 10000 })
-      const result2 = bootstrap(samples, { iterations: 10000 })
-
-      expect(result1.median).toBeCloseTo(result2.median, 1)
-    })
-  })
-})
-
-describe('getBootstrapConfigFromEnv', () => {
-  const originalEnv = process.env.COMPARE_BOOTSTRAP_ITERATIONS
-
-  afterEach(() => {
-    if (originalEnv === undefined) {
-      delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
-    } else {
-      process.env.COMPARE_BOOTSTRAP_ITERATIONS = originalEnv
-    }
-  })
-
-  test('returns default iterations when env var not set', () => {
-    delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
-    const config = getBootstrapConfigFromEnv()
-    expect(config.iterations).toBe(DEFAULT_ITERATIONS)
-  })
-
-  test('parses valid iteration count from env', () => {
-    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'
-    const config = getBootstrapConfigFromEnv()
-    expect(config.iterations).toBe(5000)
-  })
-
-  test('returns default for invalid (non-numeric) env value', () => {
-    process.env.COMPARE_BOOTSTRAP_ITERATIONS = 'invalid'
-    const config = getBootstrapConfigFromEnv()
-    expect(config.iterations).toBe(DEFAULT_ITERATIONS)
-  })
-
-  test('returns default for iteration count below minimum (100)', () => {
-    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '50'
-    const config = getBootstrapConfigFromEnv()
-    expect(config.iterations).toBe(DEFAULT_ITERATIONS)
-  })
-
-  test('accepts iteration count at minimum (100)', () => {
-    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '100'
-    const config = getBootstrapConfigFromEnv()
-    expect(config.iterations).toBe(100)
-  })
-})
-
-describe('constants', () => {
-  test('DEFAULT_ITERATIONS is 1000', () => {
-    expect(DEFAULT_ITERATIONS).toBe(1000)
-  })
-
-  test('DEFAULT_CONFIDENCE_LEVEL is 0.95', () => {
-    expect(DEFAULT_CONFIDENCE_LEVEL).toBe(0.95)
-  })
-})
diff --git a/src/graders/tests/compare-graders.spec.ts b/src/graders/tests/compare-graders.spec.ts
deleted file mode 100644
index 1827420..0000000
--- a/src/graders/tests/compare-graders.spec.ts
+++ /dev/null
@@ -1,293 +0,0 @@
-/**
- * Unit tests for built-in comparison graders.
- *
- * @remarks
- * Tests for:
- * - compare-weighted: Configurable weight grader
- * - compare-statistical: Bootstrap confidence interval grader
- *
- * @packageDocumentation
- */
-
-import { describe, expect, test } from 'bun:test'
-import type { ComparisonGraderInput, ComparisonRunData } from '../../pipeline/pipeline.types.ts'
-import { createStatisticalGrader, grade as statisticalGrade } from '../compare-statistical.ts'
-import { createWeightedGrader, DEFAULT_WEIGHTS, type Weights } from '../compare-weighted.ts'
-
-// ============================================================================
-// Test Fixtures
-// ============================================================================
-
-const createMockRuns = (
-  overrides: Partial<Record<string, Partial<ComparisonRunData>>> = {},
-): Record<string, ComparisonRunData> => ({
-  baseline: {
-    output: 'Result A',
-    score: { pass: true, score: 0.8 },
-    duration: 1000,
-    toolErrors: false,
-    ...overrides.baseline,
-  },
-  variant: {
-    output: 'Result B',
-    score: { pass: true, score: 0.9 },
-    duration: 1500,
-    toolErrors: false,
-    ...overrides.variant,
-  },
-})
-
-const createMockInput = (runs: Record<string, ComparisonRunData>): ComparisonGraderInput => ({
-  id: 'test-001',
-  input: 'Test prompt',
-  hint: 'Expected output',
-  runs,
-})
-
-// ============================================================================
-// Weighted Grader Tests
-// ============================================================================
-
-describe('compare-weighted grader', () => {
-  describe('DEFAULT_WEIGHTS', () => {
-    test('has expected default values', () => {
-      expect(DEFAULT_WEIGHTS.quality).toBe(0.5)
-      expect(DEFAULT_WEIGHTS.latency).toBe(0.3)
-      expect(DEFAULT_WEIGHTS.reliability).toBe(0.2)
-    })
-
-    test('weights sum to 1.0', () => {
-      const sum = DEFAULT_WEIGHTS.quality + DEFAULT_WEIGHTS.latency + DEFAULT_WEIGHTS.reliability
-      expect(sum).toBe(1.0)
-    })
-  })
-
-  describe('createWeightedGrader', () => {
-    test('returns higher rank for better quality score', async () => {
-      const grader = createWeightedGrader({ quality: 1.0, latency: 0.0, reliability: 0.0 })
-      const runs = createMockRuns({
-        baseline: { score: { pass: true, score: 0.7 } },
-        variant: { score: { pass: true, score: 0.9 } },
-      })
-      const input = createMockInput(runs)
-
-      const result = await grader(input)
-
-      expect(result.rankings.length).toBe(2)
-      expect(result.rankings[0]?.run).toBe('variant')
-      expect(result.rankings[0]?.rank).toBe(1)
-      expect(result.rankings[1]?.run).toBe('baseline')
-      expect(result.rankings[1]?.rank).toBe(2)
-    })
-
-    test('returns higher rank for lower latency when latency weight is high', async () => {
-      const grader = createWeightedGrader({ quality: 0.0, latency: 1.0, reliability: 0.0 })
-      const runs = createMockRuns({
-        baseline: { duration: 500, score: { pass: true, score: 0.5 } },
-        variant: { duration: 2000, score: { pass: true, score: 0.9 } },
-      })
-      const input = createMockInput(runs)
-
-      const result = await grader(input)
-
-      // Faster run should win when latency is all that matters
-      expect(result.rankings[0]?.run).toBe('baseline')
-    })
-
-    test('penalizes runs with tool errors when reliability weight is high', async () => {
-      const grader = createWeightedGrader({ quality: 0.0, latency: 0.0, reliability: 1.0 })
-      const runs = createMockRuns({
-        baseline: { toolErrors: false },
-        variant: { toolErrors: true },
-      })
-      const input = createMockInput(runs)
-
-      const result = await grader(input)
-
-      expect(result.rankings[0]?.run).toBe('baseline')
-      expect(result.rankings[1]?.run).toBe('variant')
-    })
-
-    test('includes weights in reasoning', async () => {
-      const weights: Weights = { quality: 0.6, latency: 0.3, reliability: 0.1 }
-      const grader = createWeightedGrader(weights)
-      const input = createMockInput(createMockRuns())
-
-      const result = await grader(input)
-
-      expect(result.reasoning).toContain('quality=0.6')
-      expect(result.reasoning).toContain('latency=0.3')
-      expect(result.reasoning).toContain('reliability=0.1')
-    })
-
-    test('handles missing score gracefully', async () => {
-      const grader = createWeightedGrader()
-      const runs: Record<string, ComparisonRunData> = {
-        baseline: { output: 'A' },
-        variant: { output: 'B', score: { pass: true, score: 0.8 } },
-      }
-      const input = createMockInput(runs)
-
-      const result = await grader(input)
-
-      // Should not throw, variant should rank higher due to having a score
-      expect(result.rankings.length).toBe(2)
-      expect(result.rankings[0]?.run).toBe('variant')
-    })
-
-    test('handles three or more runs', async () => {
-      const grader = createWeightedGrader()
-      const runs: Record<string, ComparisonRunData> = {
-        a: { output: 'A', score: { pass: true, score: 0.9 }, duration: 1000, toolErrors: false },
-        b: { output: 'B', score: { pass: true, score: 0.7 }, duration: 800, toolErrors: false },
-        c: { output: 'C', score: { pass: false, score: 0.5 }, duration: 500, toolErrors: true },
-      }
-      const input = createMockInput(runs)
-
-      const result = await grader(input)
-
-      expect(result.rankings.length).toBe(3)
-      // Ranks should be 1, 2, 3
-      expect(result.rankings.map((r) => r.rank)).toEqual([1, 2, 3])
-    })
-  })
-})
-
-// ============================================================================
-// Statistical Grader Tests
-// ============================================================================
-
-describe('compare-statistical grader', () => {
-  describe('createStatisticalGrader', () => {
-    test('returns rankings based on score means', async () => {
-      const grader = createStatisticalGrader(100)
-      const runs = createMockRuns({
-        baseline: { score: { pass: true, score: 0.6 } },
-        variant: { score: { pass: true, score: 0.9 } },
-      })
-      const input = createMockInput(runs)
-
-      const result = await grader(input)
-
-      expect(result.rankings.length).toBe(2)
-      expect(result.rankings[0]?.run).toBe('variant')
-    })
-
-    test('handles missing scores as zero', async () => {
-      const grader = createStatisticalGrader(100)
-      const runs: Record<string, ComparisonRunData> = {
-        baseline: { output: 'A' },
-        variant: { output: 'B', score: { pass: true, score: 0.8 } },
-      }
-      const input = createMockInput(runs)
-
-      const result = await grader(input)
-
-      expect(result.rankings[0]?.run).toBe('variant')
-    })
-
-    test('indicates significance when scores differ (single samples have no variance)', async () => {
-      const grader = createStatisticalGrader(100)
-      const runs = createMockRuns({
-        baseline: { score: { pass: true, score: 0.8 } },
-        variant: { score: { pass: true, score: 0.81 } },
-      })
-      const input = createMockInput(runs)
-
-      const result = await grader(input)
-
-      // Note: With single samples, bootstrap has no variance.
-      // CIs are [0.8, 0.8] and [0.81, 0.81] - non-overlapping.
-      // Statistical significance test is most meaningful with multiple samples.
-      expect(result.reasoning).toContain('statistically significant')
-    })
-
-    test('indicates non-significance when scores are identical', async () => {
-      const grader = createStatisticalGrader(100)
-      const runs = createMockRuns({
-        baseline: { score: { pass: true, score: 0.8 } },
-        variant: { score: { pass: true, score: 0.8 } },
-      })
-      const input = createMockInput(runs)
-
-      const result = await grader(input)
-
-      // Identical scores = overlapping CIs = not significant
-      expect(result.reasoning).toContain('No statistically significant difference')
-    })
-  })
-
-  describe('grade function', () => {
-    test('works with default iterations', async () => {
-      const runs = createMockRuns()
-      const input = createMockInput(runs)
-
-      const result = await statisticalGrade(input)
-
-      expect(result.rankings).toBeDefined()
-      expect(result.rankings.length).toBe(2)
-    })
-  })
-})
-
-// ============================================================================
-// Edge Case Tests
-// ============================================================================
-
-describe('comparison grader edge cases', () => {
-  test('handles single run gracefully', async () => {
-    const grader = createWeightedGrader()
-    const runs: Record<string, ComparisonRunData> = {
-      only: { output: 'Only run', score: { pass: true, score: 1.0 } },
-    }
-    const input = createMockInput(runs)
-
-    const result = await grader(input)
-
-    expect(result.rankings.length).toBe(1)
-    expect(result.rankings[0]?.rank).toBe(1)
-  })
-
-  test('handles empty trajectory', async () => {
-    const grader = createWeightedGrader()
-    const runs = createMockRuns({
-      baseline: { trajectory: [] },
-      variant: { trajectory: undefined },
-    })
-    const input = createMockInput(runs)
-
-    const result = await grader(input)
-
-    expect(result.rankings.length).toBe(2)
-  })
-
-  test('handles zero duration', async () => {
-    const grader = createWeightedGrader({ quality: 0.0, latency: 1.0, reliability: 0.0 })
-    const runs = createMockRuns({
-      baseline: { duration: 0 },
-      variant: { duration: 1000 },
-    })
-    const input = createMockInput(runs)
-
-    const result = await grader(input)
-
-    // Zero duration should get highest latency score
-    expect(result.rankings[0]?.run).toBe('baseline')
-  })
-
-  test('deterministic ordering for equal scores', async () => {
-    const grader = createWeightedGrader()
-    const runs = createMockRuns({
-      baseline: { score: { pass: true, score: 0.8 }, duration: 1000, toolErrors: false },
-      variant: { score: { pass: true, score: 0.8 }, duration: 1000, toolErrors: false },
-    })
-    const input = createMockInput(runs)
-
-    // Run multiple times to check stability
-    const results = await Promise.all([grader(input), grader(input), grader(input)])
-
-    // All should have same ordering
-    const orders = results.map((r) => r.rankings.map((rank) => rank.run).join(','))
-    expect(new Set(orders).size).toBe(1)
-  })
-})
diff --git a/src/graders/tests/trials-compare-graders.spec.ts b/src/graders/tests/trials-compare-graders.spec.ts
deleted file mode 100644
index 38f096a..0000000
--- a/src/graders/tests/trials-compare-graders.spec.ts
+++ /dev/null
@@ -1,358 +0,0 @@
-/**
- * Unit tests for built-in trials comparison graders.
- *
- * @remarks
- * Tests for:
- * - trials-compare-weighted: Configurable weight grader for trials
- * - trials-compare-statistical: Bootstrap confidence interval grader for trials
- *
- * @packageDocumentation
- */
-
-import { describe, expect, test } from 'bun:test'
-import type { TrialsComparisonGraderInput, TrialsComparisonRunData } from '../../pipeline/pipeline.types.ts'
-import { createTrialsStatisticalGrader, grade as statisticalGrade } from '../trials-compare-statistical.ts'
-import { createTrialsWeightedGrader, DEFAULT_TRIALS_WEIGHTS, type TrialsWeights } from '../trials-compare-weighted.ts'
-
-// ============================================================================
-// Test Fixtures
-// ============================================================================
-
-const createMockTrialRuns = (
-  overrides: Partial<Record<string, Partial<TrialsComparisonRunData>>> = {},
-): Record<string, TrialsComparisonRunData> => ({
-  baseline: {
-    passRate: 0.67,
-    passAtK: 0.9,
-    passExpK: 0.3,
-    k: 3,
-    trials: [
-      { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true, score: 1.0 },
-      { trialNum: 2, output: 'B', trajectory: [], duration: 110, pass: true, score: 0.9 },
-      { trialNum: 3, output: 'C', trajectory: [], duration: 120, pass: false, score: 0.2 },
-    ],
-    ...overrides.baseline,
-  },
-  variant: {
-    passRate: 1.0,
-    passAtK: 1.0,
-    passExpK: 1.0,
-    k: 3,
-    trials: [
-      { trialNum: 1, output: 'X', trajectory: [], duration: 150, pass: true, score: 1.0 },
-      { trialNum: 2, output: 'Y', trajectory: [], duration: 160, pass: true, score: 1.0 },
-      { trialNum: 3, output: 'Z', trajectory: [], duration: 170, pass: true, score: 1.0 },
-    ],
-    ...overrides.variant,
-  },
-})
-
-const createMockTrialInput = (runs: Record<string, TrialsComparisonRunData>): TrialsComparisonGraderInput => ({
-  id: 'test-001',
-  input: 'Test prompt',
-  hint: 'Expected output',
-  runs,
-})
-
-// ============================================================================
-// Weighted Grader Tests
-// ============================================================================
-
-describe('trials-compare-weighted grader', () => {
-  describe('DEFAULT_TRIALS_WEIGHTS', () => {
-    test('has expected default values', () => {
-      expect(DEFAULT_TRIALS_WEIGHTS.capability).toBe(0.4)
-      expect(DEFAULT_TRIALS_WEIGHTS.reliability).toBe(0.4)
-      expect(DEFAULT_TRIALS_WEIGHTS.consistency).toBe(0.2)
-    })
-
-    test('weights sum to 1.0', () => {
-      const sum =
-        DEFAULT_TRIALS_WEIGHTS.capability + DEFAULT_TRIALS_WEIGHTS.reliability + DEFAULT_TRIALS_WEIGHTS.consistency
-      expect(sum).toBe(1.0)
-    })
-  })
-
-  describe('createTrialsWeightedGrader', () => {
-    test('returns higher rank for better passAtK when capability weight is high', async () => {
-      const grader = createTrialsWeightedGrader({ capability: 1.0, reliability: 0.0, consistency: 0.0 })
-      const runs = createMockTrialRuns({
-        baseline: { passAtK: 0.7 },
-        variant: { passAtK: 0.95 },
-      })
-      const input = createMockTrialInput(runs)
-
-      const result = await grader(input)
-
-      expect(result.rankings.length).toBe(2)
-      expect(result.rankings[0]?.run).toBe('variant')
-      expect(result.rankings[0]?.rank).toBe(1)
-    })
-
-    test('returns higher rank for better passExpK when reliability weight is high', async () => {
-      const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 1.0, consistency: 0.0 })
-      const runs = createMockTrialRuns({
-        baseline: { passExpK: 0.9 },
-        variant: { passExpK: 0.3 },
-      })
-      const input = createMockTrialInput(runs)
-
-      const result = await grader(input)
-
-      expect(result.rankings[0]?.run).toBe('baseline')
-    })
-
-    test('penalizes flaky runs when consistency weight is high', async () => {
-      const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
-      const runs = createMockTrialRuns({
-        // baseline: passAtK=0.9, passExpK=0.3, flakiness=0.6
-        baseline: { passAtK: 0.9, passExpK: 0.3 },
-        // variant: passAtK=0.8, passExpK=0.8, flakiness=0.0
-        variant: { passAtK: 0.8, passExpK: 0.8 },
-      })
-      const input = createMockTrialInput(runs)
-
-      const result = await grader(input)
-
-      // Variant should win due to lower flakiness (higher consistency)
-      expect(result.rankings[0]?.run).toBe('variant')
-    })
-
-    test('includes weights in reasoning', async () => {
-      const weights: TrialsWeights = { capability: 0.5, reliability: 0.3, consistency: 0.2 }
-      const grader = createTrialsWeightedGrader(weights)
-      const input = createMockTrialInput(createMockTrialRuns())
-
-      const result = await grader(input)
-
-      expect(result.reasoning).toContain('capability=0.5')
-      expect(result.reasoning).toContain('reliability=0.3')
-      expect(result.reasoning).toContain('consistency=0.2')
-    })
-
-    test('handles missing passAtK gracefully (treats as 0)', async () => {
-      const grader = createTrialsWeightedGrader()
-      const runs: Record<string, TrialsComparisonRunData> = {
-        baseline: {
-          k: 3,
-          trials: [],
-        },
-        variant: {
-          passAtK: 0.8,
-          passExpK: 0.5,
-          k: 3,
-          trials: [],
-        },
-      }
-      const input = createMockTrialInput(runs)
-
-      const result = await grader(input)
-
-      // Should not throw, variant should rank higher
-      expect(result.rankings.length).toBe(2)
-      expect(result.rankings[0]?.run).toBe('variant')
-    })
-
-    test('handles three or more runs', async () => {
-      const grader = createTrialsWeightedGrader()
-      const runs: Record<string, TrialsComparisonRunData> = {
-        a: { passAtK: 0.9, passExpK: 0.8, k: 3, trials: [] },
-        b: { passAtK: 0.7, passExpK: 0.7, k: 3, trials: [] },
-        c: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
-      }
-      const input = createMockTrialInput(runs)
-
-      const result = await grader(input)
-
-      expect(result.rankings.length).toBe(3)
-      // Ranks should be 1, 2, 3
-      expect(result.rankings.map((r) => r.rank)).toEqual([1, 2, 3])
-    })
-  })
-})
-
-// ============================================================================
-// Statistical Grader Tests
-// ============================================================================
-
-describe('trials-compare-statistical grader', () => {
-  describe('createTrialsStatisticalGrader', () => {
-    test('returns rankings based on bootstrapped passAtK', async () => {
-      const grader = createTrialsStatisticalGrader(100)
-      const runs = createMockTrialRuns({
-        baseline: { passAtK: 0.6 },
-        variant: { passAtK: 0.95 },
-      })
-      const input = createMockTrialInput(runs)
-
-      const result = await grader(input)
-
-      expect(result.rankings.length).toBe(2)
-      expect(result.rankings[0]?.run).toBe('variant')
-    })
-
-    test('uses trial outcomes for bootstrap variance estimation', async () => {
-      const grader = createTrialsStatisticalGrader(100)
-      // All trials pass for variant, mixed for baseline
-      const runs: Record<string, TrialsComparisonRunData> = {
-        baseline: {
-          passAtK: 0.9,
-          passExpK: 0.3,
-          k: 5,
-          trials: [
-            { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true },
-            { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: true },
-            { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
-            { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: true },
-            { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
-          ],
-        },
-        variant: {
-          passAtK: 1.0,
-          passExpK: 1.0,
-          k: 5,
-          trials: [
-            { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
-            { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
-            { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
-            { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
-            { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
-          ],
-        },
-      }
-      const input = createMockTrialInput(runs)
-
-      const result = await grader(input)
-
-      // Variant with 100% pass rate should rank higher
-      expect(result.rankings[0]?.run).toBe('variant')
-    })
-
-    test('indicates significance when passAtK differs substantially', async () => {
-      const grader = createTrialsStatisticalGrader(500)
-      // Strong difference: all pass vs all fail
-      const runs: Record<string, TrialsComparisonRunData> = {
-        baseline: {
-          passAtK: 0,
-          k: 5,
-          trials: [
-            { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: false },
-            { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: false },
-            { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
-            { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: false },
-            { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
-          ],
-        },
-        variant: {
-          passAtK: 1.0,
-          k: 5,
-          trials: [
-            { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
-            { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
-            { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
-            { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
-            { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
-          ],
-        },
-      }
-      const input = createMockTrialInput(runs)
-
-      const result = await grader(input)
-
-      expect(result.reasoning).toContain('clear separation')
-    })
-
-    test('handles empty trials array', async () => {
-      const grader = createTrialsStatisticalGrader(100)
-      const runs: Record<string, TrialsComparisonRunData> = {
-        baseline: { k: 3, trials: [] },
-        variant: {
-          k: 3,
-          trials: [{ trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true }],
-        },
-      }
-      const input = createMockTrialInput(runs)
-
-      const result = await grader(input)
-
-      // Should not throw
-      expect(result.rankings.length).toBe(2)
-    })
-  })
-
-  describe('grade function', () => {
-    test('works with default iterations', async () => {
-      const runs = createMockTrialRuns()
-      const input = createMockTrialInput(runs)
-
-      const result = await statisticalGrade(input)
-
-      expect(result.rankings).toBeDefined()
-      expect(result.rankings.length).toBe(2)
-    })
-  })
-})
-
-// ============================================================================
-// Edge Case Tests
-// ============================================================================
-
-describe('trials comparison grader edge cases', () => {
-  test('handles single run gracefully', async () => {
-    const grader = createTrialsWeightedGrader()
-    const runs: Record<string, TrialsComparisonRunData> = {
-      only: { passAtK: 1.0, passExpK: 0.8, k: 3, trials: [] },
-    }
-    const input = createMockTrialInput(runs)
-
-    const result = await grader(input)
-
-    expect(result.rankings.length).toBe(1)
-    expect(result.rankings[0]?.rank).toBe(1)
-  })
-
-  test('handles zero passAtK and passExpK', async () => {
-    const grader = createTrialsWeightedGrader()
-    const runs: Record<string, TrialsComparisonRunData> = {
-      baseline: { passAtK: 0, passExpK: 0, k: 3, trials: [] },
-      variant: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
-    }
-    const input = createMockTrialInput(runs)
-
-    const result = await grader(input)
-
-    expect(result.rankings[0]?.run).toBe('variant')
-  })
-
-  test('deterministic ordering for equal scores', async () => {
-    const grader = createTrialsWeightedGrader()
-    const runs = createMockTrialRuns({
-      baseline: { passAtK: 0.8, passExpK: 0.6 },
-      variant: { passAtK: 0.8, passExpK: 0.6 },
-    })
-    const input = createMockTrialInput(runs)
-
-    // Run multiple times to check stability
-    const results = await Promise.all([grader(input), grader(input), grader(input)])
-
-    // All should have same ordering
-    const orders = results.map((r) => r.rankings.map((rank) => rank.run).join(','))
-    expect(new Set(orders).size).toBe(1)
-  })
-
-  test('flakiness is clamped to non-negative', async () => {
-    // Edge case: passExpK > passAtK shouldn't happen but handle gracefully
-    const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
-    const runs: Record<string, TrialsComparisonRunData> = {
-      baseline: { passAtK: 0.5, passExpK: 0.7, k: 3, trials: [] }, // Invalid but should work
-      variant: { passAtK: 0.8, passExpK: 0.8, k: 3, trials: [] },
-    }
-    const input = createMockTrialInput(runs)
-
-    const result = await grader(input)
-
-    // Both should have flakiness 0, so consistency score should be 1.0 for both
-    // Variant has higher capability/reliability so it wins on tiebreaker
-    expect(result.rankings).toBeDefined()
-  })
-})
diff --git a/src/graders/trials-compare-statistical.ts b/src/graders/trials-compare-statistical.ts
deleted file mode 100644
index c146c39..0000000
--- a/src/graders/trials-compare-statistical.ts
+++ /dev/null
@@ -1,183 +0,0 @@
-/**
- * Built-in statistical significance comparison grader for trials data.
- *
- * @remarks
- * Uses bootstrap sampling to compute confidence intervals for passAtK and passExpK.
- * Flags when the winner is statistically significant (p<0.05, non-overlapping CIs).
- *
- * Unlike the capture statistical grader which only has one score per prompt,
- * trials data has multiple trial results per prompt, enabling proper bootstrap
- * variance estimation.
- *
- * Bootstrap iterations can be customized via environment variable:
- * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
- *
- * @packageDocumentation
- */
-
-import type {
-  ComparisonGraderResult,
-  TrialsComparisonGrader,
-  TrialsComparisonGraderInput,
-} from '../pipeline/pipeline.types.ts'
-import { DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from './bootstrap.ts'
-
-/**
- * Bootstrap confidence interval result.
- */
-type BootstrapResult = {
-  /** Median estimate from bootstrap samples (more robust than mean) */
-  median: number
-  /** 95% confidence interval [lower, upper] */
-  ci95: [number, number]
-}
-
-/**
- * Compute passAtK estimate from trial pass/fail samples via bootstrap.
- *
- * @remarks
- * passAtK = 1 - (1 - p)^k where p is estimated pass rate.
- * We bootstrap the pass rate and compute passAtK from each bootstrap sample.
- *
- * @param trials - Array of 0/1 values (0=fail, 1=pass)
- * @param k - Number of trials
- * @param iterations - Number of bootstrap iterations
- * @returns Bootstrap estimate and CI for passAtK
- */
-const bootstrapPassAtK = (trials: number[], k: number, iterations: number): BootstrapResult => {
-  if (trials.length === 0) {
-    return { median: 0, ci95: [0, 0] }
-  }
-
-  const passAtKValues: number[] = []
-
-  for (let i = 0; i < iterations; i++) {
-    // Resample with replacement
-    const resampled = Array.from(
-      { length: trials.length },
-      () => trials[Math.floor(Math.random() * trials.length)] as number,
-    )
-
-    // Compute pass rate from resample
-    const passRate = resampled.reduce((acc, val) => acc + val, 0) / resampled.length
-
-    // Compute passAtK: probability of at least one pass in k samples
-    // passAtK = 1 - (1 - p)^k
-    const passAtK = 1 - (1 - passRate) ** k
-    passAtKValues.push(passAtK)
-  }
-
-  // Sort for percentile calculation
-  passAtKValues.sort((a, b) => a - b)
-
-  const lowerIdx = Math.floor(iterations * 0.025)
-  const upperIdx = Math.floor(iterations * 0.975)
-
-  return {
-    median: passAtKValues[Math.floor(iterations / 2)] ?? 0,
-    ci95: [passAtKValues[lowerIdx] ?? 0, passAtKValues[upperIdx] ?? 0],
-  }
-}
-
-/**
- * Get bootstrap iterations from environment or use default.
- *
- * @returns Number of bootstrap iterations
- */
-const getIterations = (): number => {
-  const config = getBootstrapConfigFromEnv()
-  return config.iterations ?? DEFAULT_ITERATIONS
-}
-
-/**
- * Statistical significance trials comparison grader.
- *
- * @remarks
- * Compares runs using bootstrap sampling on trial outcomes to determine
- * if differences in passAtK are statistically significant.
- *
- * Unlike single-sample comparisons, trials data provides multiple samples
- * per prompt (k trials), enabling meaningful variance estimation.
- *
- * @public
- */
-export const grade: TrialsComparisonGrader = async ({
-  runs,
-}: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
-  const iterations = getIterations()
-
-  // Collect pass/fail outcomes for each run
-  const runStats = Object.entries(runs).map(([label, run]) => {
-    // Convert trials to 0/1 array
-    const trialOutcomes = run.trials.map((t) => (t.pass ? 1 : 0))
-
-    // Bootstrap passAtK estimate
-    const stats = bootstrapPassAtK(trialOutcomes, run.k, iterations)
-
-    return { label, passAtK: run.passAtK ?? 0, stats }
-  })
-
-  // Sort by bootstrap median passAtK descending
-  const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
-
-  // Check if winner is statistically significant
-  // CIs don't overlap = significant difference (approximately p<0.05)
-  let isSignificant = false
-  const first = sorted[0]
-  const second = sorted[1]
-  if (first && second) {
-    // Non-overlapping: first's lower bound > second's upper bound
-    isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
-  }
-
-  const reasoning = isSignificant
-    ? `Winner "${first?.label}" shows clear separation (non-overlapping 95% CIs for passAtK)`
-    : 'No clear winner - confidence intervals overlap between top runs'
-
-  return {
-    rankings: sorted.map((s, i) => ({
-      run: s.label,
-      rank: i + 1,
-      score: s.stats.median,
-    })),
-    reasoning,
-  }
-}
-
-/**
- * Create a statistical grader with custom iteration count.
- *
- * @param iterations - Number of bootstrap iterations
- * @returns Trials comparison grader function
- *
- * @public
- */
-export const createTrialsStatisticalGrader = (iterations: number = DEFAULT_ITERATIONS): TrialsComparisonGrader => {
-  return async ({ runs }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
-    const runStats = Object.entries(runs).map(([label, run]) => {
-      const trialOutcomes = run.trials.map((t) => (t.pass ? 1 : 0))
-      const stats = bootstrapPassAtK(trialOutcomes, run.k, iterations)
-      return { label, passAtK: run.passAtK ?? 0, stats }
-    })
-
-    const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
-
-    let isSignificant = false
-    const first = sorted[0]
-    const second = sorted[1]
-    if (first && second) {
-      isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
-    }
-
-    return {
-      rankings: sorted.map((s, i) => ({
-        run: s.label,
-        rank: i + 1,
-        score: s.stats.median,
-      })),
-      reasoning: isSignificant
-        ? `Winner "${first?.label}" shows clear separation (non-overlapping 95% CIs)`
-        : 'No clear winner - confidence intervals overlap between top runs',
-    }
-  }
-}
diff --git a/src/graders/trials-compare-weighted.ts b/src/graders/trials-compare-weighted.ts
deleted file mode 100644
index 552957b..0000000
--- a/src/graders/trials-compare-weighted.ts
+++ /dev/null
@@ -1,128 +0,0 @@
-/**
- * Built-in weighted comparison grader for trials data.
- *
- * @remarks
- * Configurable weights for capability (passAtK), reliability (passExpK),
- * and consistency (1 - flakiness) dimensions.
- *
- * Weights can be customized via environment variables:
- * - `COMPARE_CAPABILITY` (default: 0.4)
- * - `COMPARE_RELIABILITY` (default: 0.4)
- * - `COMPARE_CONSISTENCY` (default: 0.2)
- *
- * @packageDocumentation
- */
-
-import type {
-  ComparisonGraderResult,
-  TrialsComparisonGrader,
-  TrialsComparisonGraderInput,
-} from '../pipeline/pipeline.types.ts'
-
-/**
- * Weight configuration for trials comparison dimensions.
- */
-export type TrialsWeights = {
-  /** Weight for capability (passAtK) - can the agent solve this at least once? */
-  capability: number
-  /** Weight for reliability (passExpK) - does the agent solve this consistently? */
-  reliability: number
-  /** Weight for consistency (1 - flakiness) - low gap between capability and reliability */
-  consistency: number
-}
-
-/** Default weights: capability=0.4, reliability=0.4, consistency=0.2 */
-export const DEFAULT_TRIALS_WEIGHTS: TrialsWeights = {
-  capability: 0.4,
-  reliability: 0.4,
-  consistency: 0.2,
-}
-
-/**
- * Read weights from environment variables with fallback to defaults.
- *
- * @remarks
- * Validates that weights are non-negative. Invalid or negative values
- * fall back to defaults.
- *
- * @returns TrialsWeights configuration
- *
- * @public
- */
-export const getTrialsWeightsFromEnv = (): TrialsWeights => {
-  const parseWeight = (envVar: string | undefined, defaultValue: number): number => {
-    if (!envVar) return defaultValue
-    const parsed = Number.parseFloat(envVar)
-    // Must be a valid non-negative number
-    if (Number.isNaN(parsed) || parsed < 0) return defaultValue
-    return parsed
-  }
-
-  return {
-    capability: parseWeight(process.env.COMPARE_CAPABILITY, DEFAULT_TRIALS_WEIGHTS.capability),
-    reliability: parseWeight(process.env.COMPARE_RELIABILITY, DEFAULT_TRIALS_WEIGHTS.reliability),
-    consistency: parseWeight(process.env.COMPARE_CONSISTENCY, DEFAULT_TRIALS_WEIGHTS.consistency),
-  }
-}
-
-/**
- * Create a weighted trials comparison grader with custom weights.
- *
- * @param weights - Weight configuration for comparison dimensions
- * @returns Trials comparison grader function
- *
- * @public
- */
-export const createTrialsWeightedGrader = (weights: TrialsWeights = DEFAULT_TRIALS_WEIGHTS): TrialsComparisonGrader => {
-  return async ({ runs }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
-    const scores = Object.entries(runs).map(([label, run]) => {
-      // Capability score: passAtK (0-1)
-      const capabilityScore = run.passAtK ?? 0
-
-      // Reliability score: passExpK (0-1)
-      const reliabilityScore = run.passExpK ?? 0
-
-      // Consistency score: 1 - flakiness
-      // Flakiness = passAtK - passExpK (how much gap between capability and reliability)
-      const flakiness = Math.max(0, capabilityScore - reliabilityScore)
-      const consistencyScore = 1 - flakiness
-
-      // Weighted combination
-      const weighted =
-        capabilityScore * weights.capability +
-        reliabilityScore * weights.reliability +
-        consistencyScore * weights.consistency
-
-      return { label, weighted, capabilityScore, reliabilityScore, consistencyScore, flakiness }
-    })
-
-    // Sort by weighted score descending (highest = best)
-    const sorted = scores.sort((a, b) => b.weighted - a.weighted)
-
-    return {
-      rankings: sorted.map((s, i) => ({
-        run: s.label,
-        rank: i + 1,
-        score: s.weighted,
-      })),
-      reasoning: `Weighted trials: capability=${weights.capability}, reliability=${weights.reliability}, consistency=${weights.consistency}`,
-    }
-  }
-}
-
-/**
- * Default weighted trials comparison grader using environment or default weights.
- *
- * @remarks
- * This is the default grader used when `--strategy weighted` is specified
- * for trials format comparison.
- *
- * @public
- */
-export const grade: TrialsComparisonGrader = async (
-  input: TrialsComparisonGraderInput,
-): Promise<ComparisonGraderResult> => {
-  const weights = getTrialsWeightsFromEnv()
-  const grader = createTrialsWeightedGrader(weights)
-  return grader(input)
-}
diff --git a/src/harness.ts b/src/harness.ts
deleted file mode 100644
index 042208d..0000000
--- a/src/harness.ts
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Harness commands for agent evaluation.
- *
- * @remarks
- * Re-exports all harness command modules for programmatic use.
- * For CLI usage, run `agent-eval-harness <command> --help`.
- *
- * **Commands:**
- * - `capture` - Core trajectory capture
- * - `trials` - Multi-run pass@k/pass^k analysis
- * - `summarize` - Derive compact views from results
- * - `calibrate` - Sample failures for grader review
- * - `validateRefs` - Check reference solutions
- * - `balance` - Analyze test set coverage
- * - `schemasCli` - Export JSON schemas
- * - `headless` - Schema-driven adapter for headless CLI agents
- *
- * @packageDocumentation
- */
-
-export type { BalanceConfig } from './commands/balance.ts'
-export { balance, runBalance } from './commands/balance.ts'
-export type { CalibrateConfig } from './commands/calibrate.ts'
-export { calibrate, runCalibrate } from './commands/calibrate.ts'
-// Config types
-export type { CaptureConfig } from './commands/capture.ts'
-// Command implementations (for programmatic use)
-export {
-  capture,
-  extractOutput,
-  extractTrajectory,
-  hasToolErrors,
-  loadPrompts,
-  runCapture,
-} from './commands/capture.ts'
-export type { SummarizeConfig } from './commands/summarize.ts'
-export { runSummarize, summarize } from './commands/summarize.ts'
-export type { TrialsConfig } from './commands/trials.ts'
-export { runTrials, trials } from './commands/trials.ts'
-export type { ValidateRefsConfig } from './commands/validate-refs.ts'
-export { runValidateRefs, validateRefs } from './commands/validate-refs.ts'
-export type { HeadlessAdapterConfig } from './headless.ts'
-// Headless adapter factory
-export { headless } from './headless.ts'
-export type { SchemasConfig } from './schemas/schemas-cli.ts'
-export { runSchemas, schemasCli } from './schemas/schemas-cli.ts'
diff --git a/src/headless.ts b/src/headless.ts
deleted file mode 100644
index 02530b5..0000000
--- a/src/headless.ts
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Headless adapter factory - schema-driven adapter for any CLI agent.
- *
- * @remarks
- * Re-exports public API from the headless module. The headless adapter enables
- * capturing trajectories from ANY headless CLI agent by defining a schema
- * that describes how to interact with the CLI.
- *
- * **CLI Usage:**
- * ```bash
- * agent-eval-harness headless --schema ./my-agent.json
- * ```
- *
- * **Programmatic Usage:**
- * ```typescript
- * import { parseHeadlessConfig, createSessionManager } from '@plaited/agent-eval-harness/headless'
- *
- * const schema = parseHeadlessConfig(jsonConfig)
- * const sessions = createSessionManager({ schema })
- * ```
- *
- * @packageDocumentation
- */
-
-// Schema definitions and parsing
-export {
-  HeadlessAdapterSchema,
-  OutputConfigSchema,
-  OutputEventExtractSchema,
-  OutputEventMappingSchema,
-  OutputEventMatchSchema,
-  PromptConfigSchema,
-  parseHeadlessConfig,
-  ResultConfigSchema,
-  ResumeConfigSchema,
-  safeParseHeadlessConfig,
-} from './headless/headless.schemas.ts'
-// Types
-export type {
-  HeadlessAdapterConfig,
-  OutputConfig,
-  OutputEventExtract,
-  OutputEventMapping,
-  OutputEventMatch,
-  PromptConfig,
-  ResultConfig,
-  ResumeConfig,
-} from './headless/headless.types.ts'
-// CLI entry point
-export { headless } from './headless/headless-cli.ts'
-export type { HistoryBuilder, HistoryBuilderConfig, HistoryTurn } from './headless/headless-history-builder.ts'
-// History builder
-export { createHistoryBuilder } from './headless/headless-history-builder.ts'
-export type {
-  OutputParser,
-  ParsedResult,
-  ParsedUpdate,
-  ResultParseResult,
-  SessionUpdateType,
-} from './headless/headless-output-parser.ts'
-// Output parser
-export { createOutputParser, jsonPath, jsonPathString } from './headless/headless-output-parser.ts'
-export type {
-  ProcessExitInfo,
-  PromptResult,
-  Session,
-  SessionManager,
-  SessionManagerConfig,
-  UpdateCallback,
-} from './headless/headless-session-manager.ts'
-// Session manager
-export { createSessionManager } from './headless/headless-session-manager.ts'
diff --git a/src/headless/headless-cli.ts b/src/headless/headless-cli.ts
deleted file mode 100644
index 07ab7d4..0000000
--- a/src/headless/headless-cli.ts
+++ /dev/null
@@ -1,428 +0,0 @@
-#!/usr/bin/env bun
-/**
- * Headless adapter factory CLI entry point.
- *
- * @remarks
- * This module implements a schema-driven adapter that can interact with
- * ANY headless CLI agent. The adapter:
- *
- * 1. Reads a JSON schema defining how to interact with the CLI
- * 2. Spawns the CLI process per schema's command + flags
- * 3. Parses stdout using schema's outputEvents mappings
- * 4. Emits session update notifications
- * 5. Manages session state for multi-turn (stream or iterative mode)
- *
- * @packageDocumentation
- */
-
-import { createInterface } from 'node:readline'
-import { parseArgs } from 'node:util'
-import { PROTOCOL_VERSION } from '../schemas/constants.ts'
-import { type HeadlessAdapterConfig, parseHeadlessConfig } from './headless.schemas.ts'
-import { createSessionManager, type SessionManager } from './headless-session-manager.ts'
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** JSON-RPC 2.0 request */
-type JsonRpcRequest = {
-  jsonrpc: '2.0'
-  id: string | number
-  method: string
-  params?: unknown
-}
-
-/** JSON-RPC 2.0 notification */
-type JsonRpcNotification = {
-  jsonrpc: '2.0'
-  method: string
-  params?: unknown
-}
-
-/** JSON-RPC 2.0 success response */
-type JsonRpcSuccessResponse = {
-  jsonrpc: '2.0'
-  id: string | number
-  result: unknown
-}
-
-/** JSON-RPC 2.0 error response */
-type JsonRpcErrorResponse = {
-  jsonrpc: '2.0'
-  id: string | number | null
-  error: {
-    code: number
-    message: string
-    data?: unknown
-  }
-}
-
-/** JSON-RPC 2.0 response */
-type JsonRpcResponse = JsonRpcSuccessResponse | JsonRpcErrorResponse
-
-/** Content block for prompts */
-type ContentBlock = { type: 'text'; text: string } | { type: 'image'; source: unknown }
-
-// ============================================================================
-// Message Sending
-// ============================================================================
-
-/**
- * Sends a JSON-RPC message to stdout.
- */
-const sendMessage = (message: JsonRpcResponse | JsonRpcNotification): void => {
-  console.log(JSON.stringify(message))
-}
-
-/**
- * Sends a session update notification.
- */
-const sendSessionUpdate = (sessionId: string, update: unknown): void => {
-  sendMessage({
-    jsonrpc: '2.0',
-    method: 'session/update',
-    params: { sessionId, update },
-  })
-}
-
-// ============================================================================
-// Request Handlers
-// ============================================================================
-
-/**
- * Creates request handlers for the headless adapter.
- *
- * @param schema - Headless adapter configuration
- * @param sessions - Session manager instance
- */
-const createHandlers = (schema: HeadlessAdapterConfig, sessions: SessionManager) => {
-  /**
-   * Handle initialize request.
-   */
-  const handleInitialize = async (params: unknown): Promise<unknown> => {
-    const { protocolVersion } = params as { protocolVersion: number }
-
-    if (protocolVersion !== PROTOCOL_VERSION) {
-      throw new Error(`Unsupported protocol version: ${protocolVersion}`)
-    }
-
-    return {
-      protocolVersion: PROTOCOL_VERSION,
-      agentInfo: {
-        name: schema.name,
-        version: '1.0.0',
-      },
-      agentCapabilities: {
-        loadSession: !!schema.resume,
-        promptCapabilities: {
-          image: false,
-        },
-      },
-    }
-  }
-
-  /**
-   * Handle session/new request.
-   */
-  const handleSessionNew = async (params: unknown): Promise<unknown> => {
-    const { cwd } = params as { cwd: string }
-    const session = await sessions.create(cwd)
-    return { sessionId: session.id }
-  }
-
-  /**
-   * Handle session/load request.
-   */
-  const handleSessionLoad = async (params: unknown): Promise<unknown> => {
-    const { sessionId } = params as { sessionId: string }
-    const session = sessions.get(sessionId)
-
-    if (!session) {
-      throw new Error(`Session not found: ${sessionId}`)
-    }
-
-    return { sessionId }
-  }
-
-  /**
-   * Handle session/prompt request.
-   */
-  const handleSessionPrompt = async (params: unknown): Promise<unknown> => {
-    const { sessionId, prompt } = params as { sessionId: string; prompt: ContentBlock[] }
-
-    // Extract text from content blocks
-    const promptText = prompt
-      .filter((block): block is ContentBlock & { type: 'text' } => block.type === 'text')
-      .map((block) => block.text)
-      .join('\n')
-
-    // Execute prompt and stream updates
-    const result = await sessions.prompt(sessionId, promptText, (update) => {
-      // Map parsed update to session update format
-      const sessionUpdate = mapToSessionUpdate(update)
-      sendSessionUpdate(sessionId, sessionUpdate)
-    })
-
-    return {
-      content: [{ type: 'text', text: result.output }],
-    }
-  }
-
-  /**
-   * Handle session/cancel notification.
-   */
-  const handleSessionCancel = async (params: unknown): Promise<void> => {
-    const { sessionId } = params as { sessionId: string }
-    sessions.cancel(sessionId)
-  }
-
-  return {
-    handleInitialize,
-    handleSessionNew,
-    handleSessionLoad,
-    handleSessionPrompt,
-    handleSessionCancel,
-  }
-}
-
-/**
- * Maps a parsed update to session update format.
- */
-const mapToSessionUpdate = (update: { type: string; content?: string; title?: string; status?: string }): unknown => {
-  switch (update.type) {
-    case 'thought':
-      return {
-        sessionUpdate: 'agent_thought_chunk',
-        content: { type: 'text', text: update.content ?? '' },
-      }
-
-    case 'message':
-      return {
-        sessionUpdate: 'agent_message_chunk',
-        content: { type: 'text', text: update.content ?? '' },
-      }
-
-    case 'tool_call':
-      return {
-        sessionUpdate: 'agent_tool_call',
-        toolCall: {
-          name: update.title ?? 'unknown',
-          status: update.status ?? 'pending',
-        },
-      }
-
-    case 'plan':
-      return {
-        sessionUpdate: 'agent_plan',
-        content: { type: 'text', text: update.content ?? '' },
-      }
-
-    default:
-      return {
-        sessionUpdate: 'agent_message_chunk',
-        content: { type: 'text', text: update.content ?? '' },
-      }
-  }
-}
-
-// ============================================================================
-// Main Loop
-// ============================================================================
-
-/**
- * Runs the headless adapter main loop.
- *
- * @param schema - Headless adapter configuration
- * @param verbose - Whether to show debug output
- */
-const runAdapter = async (schema: HeadlessAdapterConfig, verbose = false): Promise<void> => {
-  const sessions = createSessionManager({ schema, verbose })
-  const handlers = createHandlers(schema, sessions)
-
-  // Method handlers (requests expect responses)
-  const methodHandlers: Record<string, (params: unknown) => Promise<unknown>> = {
-    initialize: handlers.handleInitialize,
-    'session/new': handlers.handleSessionNew,
-    'session/load': handlers.handleSessionLoad,
-    'session/prompt': handlers.handleSessionPrompt,
-  }
-
-  // Notification handlers (no response expected)
-  const notificationHandlers: Record<string, (params: unknown) => Promise<void>> = {
-    'session/cancel': handlers.handleSessionCancel,
-  }
-
-  /**
-   * Process incoming JSON-RPC message.
-   */
-  const processMessage = async (line: string): Promise<void> => {
-    let request: JsonRpcRequest | JsonRpcNotification
-
-    try {
-      request = JSON.parse(line)
-    } catch {
-      sendMessage({
-        jsonrpc: '2.0',
-        id: null,
-        error: { code: -32700, message: 'Parse error' },
-      })
-      return
-    }
-
-    // Check if it's a notification (no id)
-    const isNotification = !('id' in request)
-
-    if (isNotification) {
-      const handler = notificationHandlers[request.method]
-      if (handler) {
-        await handler(request.params)
-      }
-      // No response for notifications
-      return
-    }
-
-    // It's a request - send response
-    const reqWithId = request as JsonRpcRequest
-    const handler = methodHandlers[reqWithId.method]
-
-    if (!handler) {
-      sendMessage({
-        jsonrpc: '2.0',
-        id: reqWithId.id,
-        error: { code: -32601, message: `Method not found: ${reqWithId.method}` },
-      })
-      return
-    }
-
-    try {
-      const result = await handler(reqWithId.params)
-      sendMessage({
-        jsonrpc: '2.0',
-        id: reqWithId.id,
-        result,
-      })
-    } catch (error) {
-      sendMessage({
-        jsonrpc: '2.0',
-        id: reqWithId.id,
-        error: {
-          code: -32603,
-          message: error instanceof Error ? error.message : 'Internal error',
-        },
-      })
-    }
-  }
-
-  // Main loop: read lines from stdin
-  const rl = createInterface({
-    input: process.stdin,
-    output: process.stdout,
-    terminal: false,
-  })
-
-  rl.on('line', processMessage)
-
-  // Handle clean shutdown
-  process.on('SIGTERM', () => {
-    rl.close()
-    process.exit(0)
-  })
-
-  process.on('SIGINT', () => {
-    rl.close()
-    process.exit(0)
-  })
-}
-
-// ============================================================================
-// CLI Entry Point
-// ============================================================================
-
-/**
- * Headless adapter CLI entry point.
- *
- * @param args - Command line arguments
- */
-export const headless = async (args: string[]): Promise<void> => {
-  const { values } = parseArgs({
-    args,
-    options: {
-      schema: { type: 'string', short: 's' },
-      verbose: { type: 'boolean', short: 'v' },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: false,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness headless --schema <path> [--verbose]
-
-Arguments:
-  -s, --schema    Path to headless adapter schema (JSON)
-  -v, --verbose   Show constructed commands (for debugging)
-  -h, --help      Show this help message
-
-Description:
-  Schema-driven adapter for ANY headless CLI agent. The adapter reads
-  a JSON schema defining how to interact with the CLI and translates between
-  protocol and CLI stdio.
-
-Schema Format:
-  {
-    "version": 1,
-    "name": "my-agent",
-    "command": ["my-agent-cli"],
-    "sessionMode": "stream" | "iterative",
-    "prompt": { "flag": "-p" },
-    "output": { "flag": "--output-format", "value": "stream-json" },
-    "outputEvents": [...],
-    "result": { "matchPath": "$.type", "matchValue": "result", "contentPath": "$.content" }
-  }
-
-Examples:
-  # Run with Claude headless schema
-  agent-eval-harness headless --schema ./claude-headless.json
-
-  # Use in capture pipeline
-  agent-eval-harness capture prompts.jsonl --schema ./claude-headless.json -o results.jsonl
-`)
-    return
-  }
-
-  if (!values.schema) {
-    console.error('Error: --schema is required')
-    console.error('Example: agent-eval-harness headless --schema ./my-agent.json')
-    process.exit(1)
-  }
-
-  // Load and validate schema
-  const schemaPath = values.schema
-  const schemaFile = Bun.file(schemaPath)
-
-  if (!(await schemaFile.exists())) {
-    console.error(`Error: schema file not found: ${schemaPath}`)
-    process.exit(1)
-  }
-
-  let schema: HeadlessAdapterConfig
-  try {
-    const rawSchema = await schemaFile.json()
-    schema = parseHeadlessConfig(rawSchema)
-  } catch (error) {
-    console.error(`Error: invalid schema: ${error instanceof Error ? error.message : String(error)}`)
-    process.exit(1)
-  }
-
-  // Run the adapter
-  await runAdapter(schema, values.verbose ?? false)
-}
-
-// Allow direct execution
-if (import.meta.main) {
-  headless(Bun.argv.slice(2)).catch((error) => {
-    console.error('Error:', error instanceof Error ? error.message : error)
-    process.exit(1)
-  })
-}
diff --git a/src/headless/headless-history-builder.ts b/src/headless/headless-history-builder.ts
deleted file mode 100644
index 28e04c9..0000000
--- a/src/headless/headless-history-builder.ts
+++ /dev/null
@@ -1,141 +0,0 @@
-/**
- * History builder for iterative mode sessions.
- *
- * @remarks
- * In iterative mode, each prompt spawns a new process. The history builder
- * accumulates conversation context and formats it using the schema's
- * historyTemplate for inclusion in subsequent prompts.
- *
- * @packageDocumentation
- */
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** A single turn in conversation history */
-export type HistoryTurn = {
-  /** User input */
-  input: string
-  /** Agent output */
-  output: string
-}
-
-/** History builder configuration */
-export type HistoryBuilderConfig = {
-  /** Template for formatting history (e.g., "User: {{input}}\nAssistant: {{output}}") */
-  template?: string
-}
-
-// ============================================================================
-// Default Template
-// ============================================================================
-
-const DEFAULT_TEMPLATE = 'User: {{input}}\nAssistant: {{output}}'
-
-// ============================================================================
-// History Builder Factory
-// ============================================================================
-
-/**
- * Creates a history builder for iterative mode sessions.
- *
- * @remarks
- * The history builder:
- * 1. Stores conversation turns
- * 2. Formats history using the template
- * 3. Builds complete prompts with context
- *
- * @param config - History builder configuration
- * @returns History builder with add, format, and build methods
- */
-export const createHistoryBuilder = (config: HistoryBuilderConfig = {}) => {
-  const template = config.template ?? DEFAULT_TEMPLATE
-  const history: HistoryTurn[] = []
-
-  /**
-   * Adds a turn to history.
-   *
-   * @param input - User input
-   * @param output - Agent output
-   */
-  const addTurn = (input: string, output: string): void => {
-    history.push({ input, output })
-  }
-
-  /**
-   * Formats the current history as a string.
-   *
-   * @returns Formatted history string
-   */
-  const formatHistory = (): string => {
-    return history.map((turn) => formatTurn(turn, template)).join('\n\n')
-  }
-
-  /**
-   * Builds a prompt with history context.
-   *
-   * @remarks
-   * For the first turn, returns just the input.
-   * For subsequent turns, prepends formatted history.
-   *
-   * @param newInput - The new user input
-   * @returns Full prompt including history context
-   */
-  const buildPrompt = (newInput: string): string => {
-    if (history.length === 0) {
-      return newInput
-    }
-
-    const formattedHistory = formatHistory()
-    return `${formattedHistory}\n\nUser: ${newInput}`
-  }
-
-  /**
-   * Gets the number of turns in history.
-   */
-  const getLength = (): number => {
-    return history.length
-  }
-
-  /**
-   * Clears all history.
-   */
-  const clear = (): void => {
-    history.length = 0
-  }
-
-  /**
-   * Gets a copy of the history.
-   */
-  const getHistory = (): HistoryTurn[] => {
-    return [...history]
-  }
-
-  return {
-    addTurn,
-    formatHistory,
-    buildPrompt,
-    getLength,
-    clear,
-    getHistory,
-  }
-}
-
-// ============================================================================
-// Helper Functions
-// ============================================================================
-
-/**
- * Formats a single turn using the template.
- *
- * @param turn - History turn
- * @param template - Template string with {{input}} and {{output}} placeholders
- * @returns Formatted turn string
- */
-const formatTurn = (turn: HistoryTurn, template: string): string => {
-  return template.replace('{{input}}', turn.input).replace('{{output}}', turn.output)
-}
-
-/** History builder type */
-export type HistoryBuilder = ReturnType<typeof createHistoryBuilder>
diff --git a/src/headless/headless-output-parser.ts b/src/headless/headless-output-parser.ts
deleted file mode 100644
index cbf374f..0000000
--- a/src/headless/headless-output-parser.ts
+++ /dev/null
@@ -1,388 +0,0 @@
-/**
- * Generic output parser for headless CLI agents.
- *
- * @remarks
- * Uses schema-defined mappings to convert CLI JSON output into session updates.
- * Supports JSONPath-like expressions for matching and extraction.
- *
- * @packageDocumentation
- */
-
-import type { HeadlessAdapterConfig, OutputEventMapping, PassthroughTypeMap } from './headless.schemas.ts'
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** session update types */
-export type SessionUpdateType = 'thought' | 'tool_call' | 'message' | 'plan'
-
-/** Parsed session update from CLI output */
-export type ParsedUpdate = {
-  type: SessionUpdateType
-  content?: string
-  title?: string
-  status?: string
-  input?: unknown
-  output?: unknown
-  timestamp: number
-  raw: unknown
-}
-
-/** Result extraction from CLI output */
-export type ParsedResult = {
-  isResult: true
-  content: string
-  raw: unknown
-}
-
-/** Not a result */
-export type NotResult = {
-  isResult: false
-}
-
-/** Parse result for final output */
-export type ResultParseResult = ParsedResult | NotResult
-
-// ============================================================================
-// JSONPath Implementation
-// ============================================================================
-
-/**
- * Extracts a value from an object using a simple JSONPath expression.
- *
- * @remarks
- * Supports:
- * - `$.field` - Root field access
- * - `$.nested.field` - Nested field access
- * - `$.array[0]` - Array index access
- * - `$.array[*]` - Array wildcard (returns all items)
- * - `$.array[0].field` - Combined array and field access
- * - `'literal'` - Literal string values (single quotes)
- *
- * @param obj - Object to extract from
- * @param path - JSONPath expression
- * @returns Extracted value, array of values (for wildcard), or undefined
- */
-export const jsonPath = (obj: unknown, path: string): unknown => {
-  // Handle literal strings (e.g., "'pending'")
-  if (path.startsWith("'") && path.endsWith("'")) {
-    return path.slice(1, -1)
-  }
-
-  // Handle JSONPath expressions (e.g., "$.type", "$.message.content[0].text")
-  if (!path.startsWith('$.')) {
-    return undefined
-  }
-
-  // Parse path into segments, handling both dot notation and array indices
-  // e.g., "message.content[0].text" -> ["message", "content", 0, "text"]
-  // e.g., "message.content[*].type" -> ["message", "content", "*", "type"]
-  const segments: (string | number | '*')[] = []
-  const pathBody = path.slice(2) // Remove "$."
-
-  // Split by dots first, then handle array indices within each part
-  for (const part of pathBody.split('.')) {
-    if (!part) continue
-
-    // Check for array wildcard: "content[*]"
-    const wildcardMatch = part.match(/^([^[]*)\[\*\]$/)
-    if (wildcardMatch) {
-      const propName = wildcardMatch[1]
-      if (propName) {
-        segments.push(propName)
-      }
-      segments.push('*')
-      continue
-    }
-
-    // Check for array index: "content[0]" or just "[0]"
-    const arrayMatch = part.match(/^([^[]*)\[(\d+)\]$/)
-    if (arrayMatch) {
-      const propName = arrayMatch[1]
-      const indexStr = arrayMatch[2]
-      if (propName) {
-        segments.push(propName)
-      }
-      if (indexStr) {
-        segments.push(parseInt(indexStr, 10))
-      }
-    } else {
-      segments.push(part)
-    }
-  }
-
-  let current: unknown = obj
-
-  for (const segment of segments) {
-    if (current === null || current === undefined) {
-      return undefined
-    }
-
-    if (segment === '*') {
-      // Array wildcard - return array as-is for further processing
-      if (!Array.isArray(current)) {
-        return undefined
-      }
-      return current
-    } else if (typeof segment === 'number') {
-      // Array index access
-      if (!Array.isArray(current)) {
-        return undefined
-      }
-      current = current[segment]
-    } else {
-      // Property access
-      if (typeof current !== 'object') {
-        return undefined
-      }
-      current = (current as Record<string, unknown>)[segment]
-    }
-  }
-
-  return current
-}
-
-/**
- * Extracts a string value from an object using JSONPath.
- *
- * @param obj - Object to extract from
- * @param path - JSONPath expression
- * @returns String value or undefined
- */
-export const jsonPathString = (obj: unknown, path: string): string | undefined => {
-  const value = jsonPath(obj, path)
-  if (value === undefined || value === null) {
-    return undefined
-  }
-  return String(value)
-}
-
-// ============================================================================
-// Output Parser Factory
-// ============================================================================
-
-/**
- * Parse line using passthrough mode.
- *
- * @remarks
- * Passthrough mode directly maps the agent's type field to session update types.
- * Simpler than JSONPath for agents with well-structured output.
- *
- * @param line - JSON string from CLI stdout
- * @param typeMap - Passthrough type mapping configuration
- * @returns Parsed update or null if no mapping matches
- */
-const parsePassthrough = (line: string, typeMap: PassthroughTypeMap): ParsedUpdate | null => {
-  let event: Record<string, unknown>
-  try {
-    event = JSON.parse(line) as Record<string, unknown>
-  } catch {
-    return null
-  }
-
-  const typeField = typeMap.typeField ?? 'type'
-  const eventType = event[typeField]
-
-  if (typeof eventType !== 'string') {
-    return null
-  }
-
-  // Check if this type has a mapping
-  const typeValues = typeMap.typeValues as Record<string, SessionUpdateType> | undefined
-  const mappedType = typeValues?.[eventType]
-  if (!mappedType) {
-    // No explicit mapping - try direct match if it's a valid session type
-    const validTypes = ['thought', 'tool_call', 'message', 'plan'] as const
-    if (!validTypes.includes(eventType as (typeof validTypes)[number])) {
-      return null
-    }
-    // Use the event type directly if it's already a valid session type
-    return {
-      type: eventType as SessionUpdateType,
-      content: typeof event.content === 'string' ? event.content : undefined,
-      title: typeof event.name === 'string' ? event.name : typeof event.title === 'string' ? event.title : undefined,
-      status: typeof event.status === 'string' ? event.status : undefined,
-      input: event.input,
-      output: event.output,
-      timestamp: Date.now(),
-      raw: event,
-    }
-  }
-
-  // Use mapped type
-  return {
-    type: mappedType,
-    content: typeof event.content === 'string' ? event.content : undefined,
-    title: typeof event.name === 'string' ? event.name : typeof event.title === 'string' ? event.title : undefined,
-    status: typeof event.status === 'string' ? event.status : undefined,
-    input: event.input,
-    output: event.output,
-    timestamp: Date.now(),
-    raw: event,
-  }
-}
-
-/**
- * Creates an output parser from adapter configuration.
- *
- * @remarks
- * The parser uses the schema's outputEvents mappings to:
- * 1. Match incoming JSON lines against patterns
- * 2. Extract content using JSONPath expressions
- * 3. Emit session update objects
- *
- * Supports two modes:
- * - 'jsonpath' (default): Uses outputEvents for complex pattern matching
- * - 'passthrough': Direct type mapping for well-structured output
- *
- * @param config - Headless adapter configuration
- * @returns Parser function for individual lines
- */
-export const createOutputParser = (config: HeadlessAdapterConfig) => {
-  const { result, outputMode = 'jsonpath', outputEvents = [], passthroughTypeMap } = config
-
-  /**
-   * Parses a single JSON line from CLI output.
-   *
-   * @param line - JSON string from CLI stdout
-   * @returns Parsed update, array of updates (for wildcard matches), or null if no mapping matches
-   */
-  const parseLine = (line: string): ParsedUpdate | ParsedUpdate[] | null => {
-    // Use passthrough mode if configured
-    if (outputMode === 'passthrough' && passthroughTypeMap) {
-      return parsePassthrough(line, passthroughTypeMap)
-    }
-
-    // JSONPath mode (default)
-    if (!outputEvents || outputEvents.length === 0) {
-      return null
-    }
-
-    let event: unknown
-    try {
-      event = JSON.parse(line)
-    } catch {
-      // Not valid JSON, skip
-      return null
-    }
-
-    // Try each mapping until one matches
-    for (const mapping of outputEvents) {
-      const matchValue = jsonPath(event, mapping.match.path)
-
-      // Handle array results from wildcard paths (e.g., $.message.content[*])
-      if (Array.isArray(matchValue)) {
-        const updates: ParsedUpdate[] = []
-        for (const item of matchValue) {
-          // Check if this array item matches the expected value
-          if (mapping.match.value === '*') {
-            // Wildcard: match any non-null item
-            if (item !== undefined && item !== null) {
-              updates.push(createUpdate(item, mapping))
-            }
-          } else if (typeof item === 'object' && item !== null && 'type' in item) {
-            // For objects with 'type' property, check nested match
-            const itemType = (item as Record<string, unknown>).type
-            if (itemType === mapping.match.value) {
-              updates.push(createUpdate(item, mapping))
-            }
-          } else if (item === mapping.match.value) {
-            // For primitives, direct match
-            updates.push(createUpdate(item, mapping))
-          }
-        }
-        if (updates.length > 0) {
-          return updates
-        }
-      } else {
-        // Single value matching (original behavior)
-        if (mapping.match.value === '*') {
-          if (matchValue !== undefined && matchValue !== null) {
-            return createUpdate(event, mapping)
-          }
-        } else if (matchValue === mapping.match.value) {
-          return createUpdate(event, mapping)
-        }
-      }
-    }
-
-    return null
-  }
-
-  /**
-   * Creates a ParsedUpdate from a matched event.
-   */
-  const createUpdate = (event: unknown, mapping: OutputEventMapping): ParsedUpdate => {
-    const update: ParsedUpdate = {
-      type: mapping.emitAs,
-      timestamp: Date.now(),
-      raw: event,
-    }
-
-    if (mapping.extract) {
-      if (mapping.extract.content) {
-        update.content = jsonPathString(event, mapping.extract.content)
-      }
-      if (mapping.extract.title) {
-        update.title = jsonPathString(event, mapping.extract.title)
-      }
-      if (mapping.extract.status) {
-        update.status = jsonPathString(event, mapping.extract.status)
-      }
-      if (mapping.extract.input) {
-        const value = jsonPath(event, mapping.extract.input)
-        if (value !== undefined) {
-          update.input = value
-        }
-      }
-      if (mapping.extract.output) {
-        const value = jsonPath(event, mapping.extract.output)
-        if (value !== undefined) {
-          update.output = value
-        }
-      }
-    }
-
-    return update
-  }
-
-  /**
-   * Checks if a JSON line represents the final result.
-   *
-   * @param line - JSON string from CLI stdout
-   * @returns Result extraction or indication that it's not a result
-   */
-  const parseResult = (line: string): ResultParseResult => {
-    let event: unknown
-    try {
-      event = JSON.parse(line)
-    } catch {
-      return { isResult: false }
-    }
-
-    const matchValue = jsonPath(event, result.matchPath)
-    // Support wildcard "*" to match any non-null value
-    const matches =
-      result.matchValue === '*' ? matchValue !== undefined && matchValue !== null : matchValue === result.matchValue
-
-    if (matches) {
-      const content = jsonPathString(event, result.contentPath)
-      return {
-        isResult: true,
-        content: content ?? '',
-        raw: event,
-      }
-    }
-
-    return { isResult: false }
-  }
-
-  return {
-    parseLine,
-    parseResult,
-  }
-}
-
-/** Output parser type */
-export type OutputParser = ReturnType<typeof createOutputParser>
diff --git a/src/headless/headless-session-manager.ts b/src/headless/headless-session-manager.ts
deleted file mode 100644
index 73638ad..0000000
--- a/src/headless/headless-session-manager.ts
+++ /dev/null
@@ -1,590 +0,0 @@
-/**
- * Session manager for headless CLI agents.
- *
- * @remarks
- * Manages the lifecycle of CLI agent sessions including:
- * - Process spawning and tracking
- * - Stream mode (persistent process) vs iterative mode (new process per turn)
- * - Output parsing and update emission
- * - Session state management
- *
- * @packageDocumentation
- */
-
-import type { Subprocess } from 'bun'
-import type { HeadlessAdapterConfig } from './headless.schemas.ts'
-import { createHistoryBuilder, type HistoryBuilder } from './headless-history-builder.ts'
-import { createOutputParser, type OutputParser, type ParsedUpdate } from './headless-output-parser.ts'
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** Session state */
-export type Session = {
-  /** Unique session identifier */
-  id: string
-  /** Working directory for this session */
-  cwd: string
-  /** Subprocess (stream mode only) */
-  process?: Subprocess
-  /** History builder (iterative mode only) */
-  history?: HistoryBuilder
-  /** Session ID from CLI (for resume, stream mode) */
-  cliSessionId?: string
-  /** Whether the session is active */
-  active: boolean
-  /** Turn count for this session */
-  turnCount: number
-}
-
-/** Process exit information for debugging */
-export type ProcessExitInfo = {
-  /** Exit code (null if killed by signal or timed out) */
-  exitCode: number | null
-  /** Signal that killed the process (if any) */
-  signal?: string
-  /** Whether the process was killed due to timeout */
-  timedOut: boolean
-}
-
-/** Update callback for emitting session updates */
-export type UpdateCallback = (update: ParsedUpdate) => void
-
-/** Prompt result with final output */
-export type PromptResult = {
-  /** Final output content */
-  output: string
-  /** All updates collected during the prompt */
-  updates: ParsedUpdate[]
-  /** Session ID from CLI (if available) */
-  cliSessionId?: string
-  /** Process exit information */
-  exitInfo?: ProcessExitInfo
-}
-
-/** Session manager configuration */
-export type SessionManagerConfig = {
-  /** Headless adapter configuration */
-  schema: HeadlessAdapterConfig
-  /** Default timeout for operations in ms (overrides schema timeout) */
-  timeout?: number
-  /** Whether to show debug output (constructed commands, raw stdout) */
-  verbose?: boolean
-  /**
-   * Debug mode - shows detailed output for troubleshooting.
-   * When enabled:
-   * - Raw CLI stdout/stderr is logged
-   * - JSONPath match attempts and results are shown
-   * - Process spawn/exit info is displayed
-   * - Timing for each stage is reported
-   */
-  debug?: boolean
-}
-
-// ============================================================================
-// Session Manager Factory
-// ============================================================================
-
-/**
- * Creates a session manager for headless CLI agents.
- *
- * @remarks
- * The session manager is the core orchestrator for CLI agent interaction:
- *
- * **Stream mode:**
- * - Spawns one process per session
- * - Keeps process alive across turns
- * - Uses stdin/stdout for communication
- * - Supports session resume via CLI flags
- *
- * **Iterative mode:**
- * - Spawns a new process per turn
- * - Accumulates history in prompts
- * - No persistent process state
- *
- * @param config - Session manager configuration
- * @returns Session manager with create, prompt, and cancel methods
- */
-export const createSessionManager = (config: SessionManagerConfig) => {
-  const { schema, verbose = false, debug = false } = config
-  // Use schema timeout if available, otherwise default to 60000ms
-  const schemaTimeout = 'timeout' in schema ? (schema.timeout ?? 60000) : 60000
-  const timeout = config.timeout ?? schemaTimeout
-  const sessions = new Map<string, Session>()
-  const outputParser = createOutputParser(schema)
-
-  /**
-   * Debug logging helper - only logs when debug mode is enabled.
-   */
-  const debugLog = (category: string, message: string, data?: unknown): void => {
-    if (debug) {
-      const timestamp = new Date().toISOString()
-      console.error(`[${timestamp}] [${category}] ${message}`)
-      if (data !== undefined) {
-        console.error(JSON.stringify(data, null, 2))
-      }
-    }
-  }
-
-  /**
-   * Creates a new session.
-   *
-   * @param cwd - Working directory for the session
-   * @returns Created session
-   */
-  const create = async (cwd: string): Promise<Session> => {
-    const id = generateSessionId()
-
-    const session: Session = {
-      id,
-      cwd,
-      active: true,
-      turnCount: 0,
-    }
-
-    // Initialize mode-specific state
-    if (schema.sessionMode === 'iterative') {
-      // Normalize historyTemplate: v2 schemas can have object format, convert to string
-      let templateString: string | undefined
-      if (typeof schema.historyTemplate === 'object' && schema.historyTemplate !== null) {
-        // Use turnFormat from object-style template
-        templateString = schema.historyTemplate.turnFormat
-      } else {
-        templateString = schema.historyTemplate
-      }
-      session.history = createHistoryBuilder({
-        template: templateString,
-      })
-    }
-
-    sessions.set(id, session)
-    return session
-  }
-
-  /**
-   * Sends a prompt to a session and collects the response.
-   *
-   * @param sessionId - Session ID
-   * @param promptText - Prompt text to send
-   * @param onUpdate - Callback for streaming updates
-   * @returns Prompt result with output and updates
-   */
-  const prompt = async (sessionId: string, promptText: string, onUpdate?: UpdateCallback): Promise<PromptResult> => {
-    const session = sessions.get(sessionId)
-    if (!session) {
-      throw new Error(`Session not found: ${sessionId}`)
-    }
-
-    if (!session.active) {
-      throw new Error(`Session is not active: ${sessionId}`)
-    }
-
-    session.turnCount++
-
-    if (schema.sessionMode === 'stream') {
-      return promptStream(session, promptText, onUpdate)
-    }
-
-    return promptIterative(session, promptText, onUpdate)
-  }
-
-  /**
-   * Stream mode: send prompt via stdin to persistent process.
-   */
-  const promptStream = async (
-    session: Session,
-    promptText: string,
-    onUpdate?: UpdateCallback,
-  ): Promise<PromptResult> => {
-    // Build command for first turn or if no process exists
-    if (!session.process || session.process.killed) {
-      const args = buildCommand(session, promptText)
-
-      // Choose stdin mode based on schema configuration
-      const stdinMode = schema.prompt.stdin ? 'pipe' : 'ignore'
-
-      session.process = Bun.spawn(args, {
-        cwd: session.cwd,
-        stdin: stdinMode,
-        stdout: 'pipe',
-        stderr: 'inherit',
-      })
-
-      // If using stdin, write the prompt and close stdin
-      // (stream mode spawns new process per turn, so stdin should close after writing)
-      if (schema.prompt.stdin && session.process) {
-        writePromptToStdin(session.process, promptText, true)
-      }
-    } else {
-      // Subsequent turns: spawn new process with resume flag
-      const args = buildCommand(session, promptText)
-      const stdinMode = schema.prompt.stdin ? 'pipe' : 'ignore'
-
-      session.process = Bun.spawn(args, {
-        cwd: session.cwd,
-        stdin: stdinMode,
-        stdout: 'pipe',
-        stderr: 'inherit',
-      })
-
-      // If using stdin, write the prompt and close stdin
-      // (stream mode spawns new process per turn, so stdin should close after writing)
-      if (schema.prompt.stdin && session.process) {
-        writePromptToStdin(session.process, promptText, true)
-      }
-    }
-
-    return collectOutput(session, outputParser, onUpdate, timeout, debugLog)
-  }
-
-  /**
-   * Iterative mode: spawn new process per turn with history context.
-   */
-  const promptIterative = async (
-    session: Session,
-    promptText: string,
-    onUpdate?: UpdateCallback,
-  ): Promise<PromptResult> => {
-    // Build full prompt with history
-    const fullPrompt = session.history?.buildPrompt(promptText) ?? promptText
-
-    // Build and spawn command
-    const args = buildCommand(session, fullPrompt)
-    const stdinMode = schema.prompt.stdin ? 'pipe' : 'ignore'
-
-    session.process = Bun.spawn(args, {
-      cwd: session.cwd,
-      stdin: stdinMode,
-      stdout: 'pipe',
-      stderr: 'inherit',
-    })
-
-    // If using stdin, write the prompt and close stdin
-    // (iterative mode spawns new process per turn, so stdin should close after writing)
-    if (schema.prompt.stdin && session.process) {
-      writePromptToStdin(session.process, fullPrompt, true)
-    }
-
-    const result = await collectOutput(session, outputParser, onUpdate, timeout, debugLog)
-
-    // Store in history for next turn
-    session.history?.addTurn(promptText, result.output)
-
-    // Clean up process
-    session.process = undefined
-
-    return result
-  }
-
-  /**
-   * Builds the command array for spawning the CLI.
-   */
-  const buildCommand = (session: Session, promptText: string): string[] => {
-    const args = [...schema.command]
-
-    // Add output format flags (only if non-empty)
-    if (schema.output.flag) {
-      args.push(schema.output.flag, schema.output.value)
-    }
-
-    // Add auto-approve flags
-    if (schema.autoApprove) {
-      args.push(...schema.autoApprove)
-    }
-
-    // Add cwd flag if specified
-    if (schema.cwdFlag) {
-      args.push(schema.cwdFlag, session.cwd)
-    }
-
-    // Add resume flag if available (stream mode, after first turn)
-    if (schema.sessionMode === 'stream' && schema.resume && session.cliSessionId) {
-      args.push(schema.resume.flag, session.cliSessionId)
-    }
-
-    // Add prompt flag and text (skip if using stdin)
-    if (!schema.prompt.stdin) {
-      if (schema.prompt.flag) {
-        args.push(schema.prompt.flag, promptText)
-      } else {
-        // Positional argument (no flag)
-        args.push(promptText)
-      }
-    }
-
-    // Debug output: show constructed command
-    if (verbose || debug) {
-      const stdinNote = schema.prompt.stdin ? ' (+ stdin)' : ''
-      console.error(`[headless] Command: ${args.join(' ')}${stdinNote}`)
-    }
-
-    return args
-  }
-
-  /**
-   * Cancels an active session.
-   *
-   * @param sessionId - Session ID to cancel
-   */
-  const cancel = (sessionId: string): void => {
-    const session = sessions.get(sessionId)
-    if (!session) return
-
-    session.active = false
-
-    if (session.process && !session.process.killed) {
-      session.process.kill()
-    }
-  }
-
-  /**
-   * Gets a session by ID.
-   *
-   * @param sessionId - Session ID
-   * @returns Session or undefined
-   */
-  const get = (sessionId: string): Session | undefined => {
-    return sessions.get(sessionId)
-  }
-
-  /**
-   * Deletes a session.
-   *
-   * @param sessionId - Session ID
-   */
-  const destroy = (sessionId: string): void => {
-    cancel(sessionId)
-    sessions.delete(sessionId)
-  }
-
-  return {
-    create,
-    prompt,
-    cancel,
-    get,
-    destroy,
-  }
-}
-
-// ============================================================================
-// Helper Functions
-// ============================================================================
-
-/**
- * Generates a unique session ID.
- *
- * @remarks
- * Uses crypto.randomUUID() for secure random generation instead of Math.random().
- */
-const generateSessionId = (): string => {
-  return `sess_${crypto.randomUUID()}`
-}
-
-/**
- * Writes a prompt to a process stdin stream.
- *
- * @remarks
- * Uses Bun's FileSink API to write text to the process stdin.
- * The FileSink type provides `write()` and `flush()` methods for
- * efficient stream writing without async overhead.
- *
- * Type guard ensures stdin is a FileSink (not a file descriptor number)
- * before attempting to write. This handles Bun's subprocess stdin types:
- * - `'pipe'` → FileSink with write/flush methods
- * - `'ignore'` → null (not writable)
- * - number → file descriptor (not a FileSink)
- *
- * **Closing stdin:** When `closeAfterWrite` is true, the stdin stream is
- * closed after writing. This is required for CLIs that read from stdin
- * with `-` and wait for EOF before processing (e.g., Codex). For stream
- * mode sessions where stdin stays open for subsequent prompts, pass false.
- *
- * @param process - Subprocess with stdin stream
- * @param prompt - Prompt text to write
- * @param closeAfterWrite - Whether to close stdin after writing (default: false)
- *
- * @internal
- */
-const writePromptToStdin = (process: Subprocess, prompt: string, closeAfterWrite = false): void => {
-  if (process.stdin && typeof process.stdin !== 'number') {
-    process.stdin.write(`${prompt}\n`)
-    process.stdin.flush()
-    if (closeAfterWrite) {
-      process.stdin.end()
-    }
-  }
-}
-
-/**
- * Collects output from a running process.
- *
- * @param session - Active session
- * @param parser - Output parser
- * @param onUpdate - Update callback
- * @param timeoutMs - Timeout in ms
- * @param logDebug - Debug logging function
- * @returns Collected output and updates
- */
-const collectOutput = async (
-  session: Session,
-  parser: OutputParser,
-  onUpdate: UpdateCallback | undefined,
-  timeoutMs: number,
-  logDebug: (category: string, message: string, data?: unknown) => void,
-): Promise<PromptResult> => {
-  const updates: ParsedUpdate[] = []
-  let output = ''
-  let cliSessionId: string | undefined
-  const accumulatedMessages: string[] = []
-  let timedOut = false
-
-  const stdout = session.process?.stdout
-  if (!stdout || typeof stdout === 'number') {
-    throw new Error('No stdout available')
-  }
-
-  const reader = stdout.getReader()
-  const decoder = new TextDecoder()
-  let buffer = ''
-
-  // Track timeout with a timer ID so we can clear it
-  let timeoutId: Timer | undefined
-
-  const timeoutPromise = new Promise<'timeout'>((resolve) => {
-    timeoutId = setTimeout(() => resolve('timeout'), timeoutMs)
-  })
-
-  logDebug('process', `Starting output collection with ${timeoutMs}ms timeout`)
-
-  try {
-    const readLoop = async (): Promise<'complete'> => {
-      readLines: while (true) {
-        const { done, value } = await reader.read()
-
-        if (done) {
-          logDebug('process', 'Process stdout closed')
-          break
-        }
-
-        const chunk = decoder.decode(value, { stream: true })
-        logDebug('raw', `Received ${chunk.length} bytes`)
-
-        buffer += chunk
-
-        // Process complete lines
-        const lines = buffer.split('\n')
-        buffer = lines.pop() ?? ''
-
-        for (const line of lines) {
-          if (!line.trim()) continue
-
-          logDebug('line', `Processing line: ${line.slice(0, 100)}${line.length > 100 ? '...' : ''}`)
-
-          // Parse as update first (so updates are emitted even for result lines)
-          const update = parser.parseLine(line)
-          if (update !== null) {
-            // Handle both single updates and arrays of updates (from wildcard matches)
-            const updatesToProcess = Array.isArray(update) ? update : [update]
-
-            for (const singleUpdate of updatesToProcess) {
-              logDebug('parse', `Matched event: ${singleUpdate.type}`, {
-                title: singleUpdate.title,
-                status: singleUpdate.status,
-                content: singleUpdate.content?.slice(0, 50),
-              })
-
-              updates.push(singleUpdate)
-              onUpdate?.(singleUpdate)
-
-              // Accumulate message content for fallback
-              if (singleUpdate.type === 'message' && singleUpdate.content) {
-                accumulatedMessages.push(singleUpdate.content)
-              }
-
-              // Extract CLI session ID if available
-              if (!cliSessionId && singleUpdate.raw && typeof singleUpdate.raw === 'object') {
-                const raw = singleUpdate.raw as Record<string, unknown>
-                if (typeof raw.session_id === 'string') {
-                  cliSessionId = raw.session_id
-                  session.cliSessionId = cliSessionId
-                  logDebug('session', `Extracted CLI session ID: ${cliSessionId}`)
-                }
-              }
-            }
-          } else {
-            logDebug('parse', 'No matching event mapping for line')
-          }
-
-          // Check for final result (after emitting update)
-          const resultCheck = parser.parseResult(line)
-          if (resultCheck.isResult) {
-            output = resultCheck.content
-            logDebug('result', `Found result: ${output.slice(0, 100)}${output.length > 100 ? '...' : ''}`)
-            break readLines // Exit both loops immediately on result
-          }
-        }
-      }
-      return 'complete'
-    }
-
-    const raceResult = await Promise.race([readLoop(), timeoutPromise])
-
-    if (raceResult === 'timeout') {
-      timedOut = true
-      logDebug('timeout', `Process timed out after ${timeoutMs}ms`)
-
-      // Kill the process on timeout
-      if (session.process && !session.process.killed) {
-        session.process.kill('SIGTERM')
-        logDebug('process', 'Sent SIGTERM to process')
-      }
-    }
-  } finally {
-    if (timeoutId) {
-      clearTimeout(timeoutId)
-    }
-    reader.releaseLock()
-  }
-
-  // Fallback: if result contentPath didn't yield output, use accumulated messages
-  if (!output && accumulatedMessages.length > 0) {
-    output = accumulatedMessages.join('\n')
-    logDebug('fallback', `Using accumulated messages as output (${accumulatedMessages.length} messages)`)
-  }
-
-  // Get exit info from process
-  let exitInfo: ProcessExitInfo | undefined
-  if (session.process) {
-    try {
-      // Wait for process to exit (with a short timeout to not block)
-      const exitCode = await Promise.race([
-        session.process.exited,
-        new Promise<null>((resolve) => setTimeout(() => resolve(null), 1000)),
-      ])
-
-      exitInfo = {
-        exitCode: exitCode,
-        timedOut,
-        signal: timedOut ? 'SIGTERM' : undefined,
-      }
-
-      logDebug('exit', `Process exit info`, exitInfo)
-    } catch {
-      exitInfo = {
-        exitCode: null,
-        timedOut,
-      }
-    }
-  }
-
-  return {
-    output,
-    updates,
-    cliSessionId,
-    exitInfo,
-  }
-}
-
-/** Session manager type */
-export type SessionManager = ReturnType<typeof createSessionManager>
diff --git a/src/headless/headless.schemas.ts b/src/headless/headless.schemas.ts
deleted file mode 100644
index 18f6145..0000000
--- a/src/headless/headless.schemas.ts
+++ /dev/null
@@ -1,321 +0,0 @@
-/**
- * Zod schemas for headless adapter configuration.
- *
- * @remarks
- * These schemas define how to interact with ANY headless CLI agent via a
- * schema-driven approach. No hardcoded agent-specific logic - the schema
- * defines everything: command, flags, output parsing rules.
- *
- * @packageDocumentation
- */
-
-import { z } from 'zod'
-
-// ============================================================================
-// Output Event Mapping Schema
-// ============================================================================
-
-/**
- * Schema for matching CLI output to session update types.
- *
- * @remarks
- * Uses JSONPath-like patterns to match events in CLI JSON output
- * and map them to session update types.
- */
-export const OutputEventMatchSchema = z.object({
-  /** JSONPath to match event type in CLI output (e.g., "$.type") */
-  path: z.string(),
-  /** Value to match at the path (e.g., "tool_use") */
-  value: z.string(),
-})
-
-/** Output event match type */
-export type OutputEventMatch = z.infer<typeof OutputEventMatchSchema>
-
-/**
- * Schema for extracting content from matched events.
- *
- * @remarks
- * Known fields (`content`, `title`, `status`, `input`, `output`) are used by the
- * output parser to populate `ParsedUpdate` properties. Additional string-valued
- * fields are preserved during validation for forward compatibility but are not
- * consumed by the parser.
- *
- * Paths can be:
- * - JSONPath expressions (e.g., "$.message.text")
- * - Literal strings in single quotes (e.g., "'pending'")
- */
-export const OutputEventExtractSchema = z
-  .object({
-    /** JSONPath to extract main content */
-    content: z.string().optional(),
-    /** JSONPath to extract title (for tool calls) */
-    title: z.string().optional(),
-    /** JSONPath to extract status (or literal like "'pending'") */
-    status: z.string().optional(),
-    /** JSONPath to extract tool input arguments (e.g., "$.input") */
-    input: z.string().optional(),
-    /** JSONPath to extract tool output/result content (e.g., "$.content") */
-    output: z.string().optional(),
-  })
-  .catchall(z.string())
-
-/** Output event extract type */
-export type OutputEventExtract = z.infer<typeof OutputEventExtractSchema>
-
-/**
- * Schema for mapping CLI output events to session update types.
- *
- * @remarks
- * Each mapping specifies:
- * 1. How to match events (match.path + match.value)
- * 2. What session update type to emit (emitAs)
- * 3. What content to extract (extract)
- */
-export const OutputEventMappingSchema = z.object({
-  /** Matching criteria for CLI output */
-  match: OutputEventMatchSchema,
-  /** session update type to emit */
-  emitAs: z.enum(['thought', 'tool_call', 'message', 'plan']),
-  /** Content extraction configuration */
-  extract: OutputEventExtractSchema.optional(),
-})
-
-/** Output event mapping type */
-export type OutputEventMapping = z.infer<typeof OutputEventMappingSchema>
-
-// ============================================================================
-// Prompt Configuration Schema
-// ============================================================================
-
-/**
- * Schema for how to pass prompts to the CLI.
- *
- * @remarks
- * Three modes are supported:
- * 1. **Flag-based**: `flag: "-p"` - Pass prompt via command-line flag
- * 2. **Positional**: `flag: ""` - Pass prompt as positional argument
- * 3. **Stdin**: `stdin: true` - Write prompt to stdin (command should include `-` or equivalent)
- */
-export const PromptConfigSchema = z
-  .object({
-    /** Flag to pass prompt (e.g., "-p", "--prompt"). Empty string for positional. */
-    flag: z.string().optional(),
-    /** Use stdin to pass prompt instead of command args */
-    stdin: z.boolean().optional(),
-    /** Format for stdin input in stream mode */
-    stdinFormat: z.enum(['text', 'json']).optional(),
-  })
-  .refine((data) => !(data.flag && data.stdin), {
-    message: "Cannot specify both 'flag' and 'stdin' modes - use either flag-based or stdin mode, not both",
-  })
-
-/** Prompt configuration type */
-export type PromptConfig = z.infer<typeof PromptConfigSchema>
-
-// ============================================================================
-// Output Configuration Schema
-// ============================================================================
-
-/**
- * Schema for output format configuration.
- */
-export const OutputConfigSchema = z.object({
-  /** Flag for output format (e.g., "--output-format") */
-  flag: z.string(),
-  /** Value for output format (e.g., "stream-json") */
-  value: z.string(),
-})
-
-/** Output configuration type */
-export type OutputConfig = z.infer<typeof OutputConfigSchema>
-
-// ============================================================================
-// Resume Configuration Schema
-// ============================================================================
-
-/**
- * Schema for session resume support (stream mode).
- */
-export const ResumeConfigSchema = z.object({
-  /** Flag to resume session (e.g., "--resume") */
-  flag: z.string(),
-  /** JSONPath to extract session ID from output */
-  sessionIdPath: z.string(),
-})
-
-/** Resume configuration type */
-export type ResumeConfig = z.infer<typeof ResumeConfigSchema>
-
-// ============================================================================
-// Result Configuration Schema
-// ============================================================================
-
-/**
- * Schema for final result extraction.
- */
-export const ResultConfigSchema = z.object({
-  /** JSONPath to match result type (e.g., "$.type") */
-  matchPath: z.string(),
-  /** Value indicating final result (e.g., "result") */
-  matchValue: z.string(),
-  /** JSONPath to extract result content */
-  contentPath: z.string(),
-})
-
-/** Result configuration type */
-export type ResultConfig = z.infer<typeof ResultConfigSchema>
-
-// ============================================================================
-// Passthrough Type Mapping Schema
-// ============================================================================
-
-/**
- * Schema for passthrough type mapping.
- *
- * @remarks
- * Used when outputMode is 'passthrough' to map agent's native type names
- * to standard session update types. Useful for agents with well-structured
- * output that doesn't need complex JSONPath parsing.
- */
-export const PassthroughTypeMapSchema = z.object({
-  /** JSON field that contains the event type (default: "type") */
-  typeField: z.string().default('type'),
-  /** Mapping from agent type values to session update types */
-  typeValues: z.record(z.string(), z.enum(['thought', 'tool_call', 'message', 'plan'])).optional(),
-})
-
-/** Passthrough type mapping type */
-export type PassthroughTypeMap = z.infer<typeof PassthroughTypeMapSchema>
-
-// ============================================================================
-// Main Adapter Schema
-// ============================================================================
-
-/**
- * Schema for headless adapter configuration.
- *
- * @remarks
- * This schema defines everything needed to interact with a headless CLI agent:
- * - Command and flags to spawn
- * - How to pass prompts
- * - How to parse output (jsonpath or passthrough mode)
- * - Session handling mode
- *
- * Supports two output parsing modes:
- * - 'jsonpath': Use outputEvents for complex JSONPath-based parsing (default)
- * - 'passthrough': Direct type mapping for well-structured output
- *
- * Example (Claude):
- * ```json
- * {
- *   "version": 1,
- *   "name": "claude-headless",
- *   "command": ["claude"],
- *   "sessionMode": "stream",
- *   "timeout": 90000,
- *   "prompt": { "flag": "-p" },
- *   "output": { "flag": "--output-format", "value": "stream-json" },
- *   "outputEvents": [...]
- * }
- * ```
- */
-export const HeadlessAdapterSchema = z.object({
-  /** Schema version */
-  version: z.literal(1),
-
-  /** Human-readable adapter name */
-  name: z.string(),
-
-  /** Base command to spawn (e.g., ["claude"], ["gemini"]) */
-  command: z.array(z.string()),
-
-  /**
-   * Session mode determines how multi-turn conversations work:
-   * - 'stream': Keep process alive, multi-turn via stdin
-   * - 'iterative': New process per turn, accumulate context in prompt
-   */
-  sessionMode: z.enum(['stream', 'iterative']),
-
-  /** Default timeout for this agent in milliseconds (can be overridden per-prompt) */
-  timeout: z.number().optional(),
-
-  /** How to pass the prompt */
-  prompt: PromptConfigSchema,
-
-  /** Output format configuration */
-  output: OutputConfigSchema,
-
-  /** Flags for auto-approval in headless mode (e.g., ["--allowedTools", "*"]) */
-  autoApprove: z.array(z.string()).optional(),
-
-  /** Session resume support (stream mode only) */
-  resume: ResumeConfigSchema.optional(),
-
-  /** Working directory flag (if CLI needs explicit --cwd) */
-  cwdFlag: z.string().optional(),
-
-  /**
-   * Output parsing mode:
-   * - 'jsonpath': Use outputEvents for complex JSONPath-based parsing (default)
-   * - 'passthrough': Direct type mapping for well-structured output
-   */
-  outputMode: z.enum(['jsonpath', 'passthrough']).default('jsonpath'),
-
-  /** Output event mappings - how to parse CLI output into updates (jsonpath mode) */
-  outputEvents: z.array(OutputEventMappingSchema).optional(),
-
-  /** Type mapping for passthrough mode */
-  passthroughTypeMap: PassthroughTypeMapSchema.optional(),
-
-  /** Final result extraction configuration */
-  result: ResultConfigSchema,
-
-  /**
-   * Template for formatting conversation history (iterative mode only).
-   *
-   * @remarks
-   * Supports both string format (simple) and object format (advanced):
-   * - String: "User: {{input}}\nAssistant: {{output}}"
-   * - Object: { system: "...", turnFormat: "..." }
-   */
-  historyTemplate: z
-    .union([
-      z.string(),
-      z.object({
-        /** System prefix for accumulated history */
-        system: z.string().optional(),
-        /** Format for each turn: {{input}} and {{output}} placeholders */
-        turnFormat: z.string(),
-      }),
-    ])
-    .optional(),
-})
-
-/** Headless adapter configuration type */
-export type HeadlessAdapterConfig = z.infer<typeof HeadlessAdapterSchema>
-
-// ============================================================================
-// Validation Helpers
-// ============================================================================
-
-/**
- * Validates and parses a headless adapter configuration.
- *
- * @param config - Raw configuration object (e.g., from JSON file)
- * @returns Validated HeadlessAdapterConfig
- * @throws ZodError if validation fails
- */
-export const parseHeadlessConfig = (config: unknown): HeadlessAdapterConfig => {
-  return HeadlessAdapterSchema.parse(config)
-}
-
-/**
- * Safely validates a headless adapter configuration.
- *
- * @param config - Raw configuration object
- * @returns Result with success/failure and data or error
- */
-export const safeParseHeadlessConfig = (config: unknown) => {
-  return HeadlessAdapterSchema.safeParse(config)
-}
diff --git a/src/headless/headless.types.ts b/src/headless/headless.types.ts
deleted file mode 100644
index 95b0a80..0000000
--- a/src/headless/headless.types.ts
+++ /dev/null
@@ -1,19 +0,0 @@
-/**
- * Type exports for headless adapter.
- *
- * @remarks
- * Re-exports all types from the schemas module for external consumers.
- *
- * @packageDocumentation
- */
-
-export type {
-  HeadlessAdapterConfig,
-  OutputConfig,
-  OutputEventExtract,
-  OutputEventMapping,
-  OutputEventMatch,
-  PromptConfig,
-  ResultConfig,
-  ResumeConfig,
-} from './headless.schemas.ts'
diff --git a/src/headless/tests/fixtures/claude-headless.json b/src/headless/tests/fixtures/claude-headless.json
deleted file mode 100644
index 632b72f..0000000
--- a/src/headless/tests/fixtures/claude-headless.json
+++ /dev/null
@@ -1,40 +0,0 @@
-{
-  "version": 1,
-  "name": "claude-headless",
-  "command": ["claude"],
-  "sessionMode": "stream",
-  "prompt": {
-    "flag": "-p"
-  },
-  "output": {
-    "flag": "--output-format",
-    "value": "stream-json"
-  },
-  "autoApprove": ["--dangerously-skip-permissions", "--verbose"],
-  "resume": {
-    "flag": "--resume",
-    "sessionIdPath": "$.session_id"
-  },
-  "outputEvents": [
-    {
-      "match": { "path": "$.type", "value": "assistant" },
-      "emitAs": "message",
-      "extract": { "content": "$.message.content[0].text" }
-    },
-    {
-      "match": { "path": "$.type", "value": "tool_use" },
-      "emitAs": "tool_call",
-      "extract": { "title": "$.name", "status": "'pending'", "input": "$.input" }
-    },
-    {
-      "match": { "path": "$.type", "value": "tool_result" },
-      "emitAs": "tool_call",
-      "extract": { "title": "$.name", "status": "'completed'", "output": "$.content" }
-    }
-  ],
-  "result": {
-    "matchPath": "$.type",
-    "matchValue": "result",
-    "contentPath": "$.result"
-  }
-}
diff --git a/src/headless/tests/fixtures/gemini-headless.json b/src/headless/tests/fixtures/gemini-headless.json
deleted file mode 100644
index bd09dec..0000000
--- a/src/headless/tests/fixtures/gemini-headless.json
+++ /dev/null
@@ -1,37 +0,0 @@
-{
-  "version": 1,
-  "name": "gemini-headless",
-  "command": ["gemini"],
-  "sessionMode": "iterative",
-  "prompt": {
-    "flag": ""
-  },
-  "output": {
-    "flag": "--output-format",
-    "value": "stream-json"
-  },
-  "autoApprove": ["--sandbox", "false"],
-  "outputEvents": [
-    {
-      "match": { "path": "$.type", "value": "message" },
-      "emitAs": "message",
-      "extract": { "content": "$.content" }
-    },
-    {
-      "match": { "path": "$.type", "value": "tool_use" },
-      "emitAs": "tool_call",
-      "extract": { "title": "$.tool_name", "status": "'pending'", "input": "$.args" }
-    },
-    {
-      "match": { "path": "$.type", "value": "tool_result" },
-      "emitAs": "tool_call",
-      "extract": { "title": "$.tool_name", "status": "'completed'", "output": "$.output" }
-    }
-  ],
-  "result": {
-    "matchPath": "$.type",
-    "matchValue": "result",
-    "contentPath": "$.content"
-  },
-  "historyTemplate": "User: {{input}}\nAssistant: {{output}}"
-}
diff --git a/src/headless/tests/headless.spec.ts b/src/headless/tests/headless.spec.ts
deleted file mode 100644
index 168d476..0000000
--- a/src/headless/tests/headless.spec.ts
+++ /dev/null
@@ -1,873 +0,0 @@
-/**
- * Unit tests for headless adapter factory.
- *
- * @remarks
- * Tests cover:
- * - Schema validation with Zod
- * - JSONPath extraction
- * - Output parsing with event mappings
- * - History building for iterative mode
- */
-
-import { describe, expect, test } from 'bun:test'
-import { HeadlessAdapterSchema, parseHeadlessConfig, safeParseHeadlessConfig } from '../headless.schemas.ts'
-import { createHistoryBuilder } from '../headless-history-builder.ts'
-import { createOutputParser, jsonPath, jsonPathString } from '../headless-output-parser.ts'
-
-// ============================================================================
-// Test Fixtures
-// ============================================================================
-
-const validClaudeSchema = {
-  version: 1,
-  name: 'claude-headless',
-  command: ['claude'],
-  sessionMode: 'stream',
-  prompt: { flag: '-p' },
-  output: { flag: '--output-format', value: 'stream-json' },
-  autoApprove: ['--dangerously-skip-permissions'],
-  resume: { flag: '--resume', sessionIdPath: '$.session_id' },
-  outputEvents: [
-    {
-      match: { path: '$.type', value: 'assistant' },
-      emitAs: 'message',
-      extract: { content: '$.message.text' },
-    },
-    {
-      match: { path: '$.type', value: 'tool_use' },
-      emitAs: 'tool_call',
-      extract: { title: '$.name', status: "'pending'", input: '$.input' },
-    },
-    {
-      match: { path: '$.type', value: 'tool_result' },
-      emitAs: 'tool_call',
-      extract: { title: '$.name', status: "'completed'", output: '$.content' },
-    },
-  ],
-  result: {
-    matchPath: '$.type',
-    matchValue: 'result',
-    contentPath: '$.result',
-  },
-}
-
-const validGeminiSchema = {
-  version: 1,
-  name: 'gemini-headless',
-  command: ['gemini'],
-  sessionMode: 'iterative',
-  prompt: { flag: '--prompt' },
-  output: { flag: '--output-format', value: 'json' },
-  outputEvents: [
-    {
-      match: { path: '$.type', value: 'message' },
-      emitAs: 'message',
-      extract: { content: '$.content' },
-    },
-  ],
-  result: {
-    matchPath: '$.type',
-    matchValue: 'result',
-    contentPath: '$.response',
-  },
-  historyTemplate: 'User: {{input}}\nAssistant: {{output}}',
-}
-
-// ============================================================================
-// Schema Validation Tests
-// ============================================================================
-
-describe('HeadlessAdapterSchema', () => {
-  describe('valid schemas', () => {
-    test('validates Claude headless schema', () => {
-      const result = HeadlessAdapterSchema.safeParse(validClaudeSchema)
-      expect(result.success).toBe(true)
-    })
-
-    test('validates Gemini headless schema', () => {
-      const result = HeadlessAdapterSchema.safeParse(validGeminiSchema)
-      expect(result.success).toBe(true)
-    })
-  })
-
-  describe('validates schema files from disk', () => {
-    const fixturesDir = 'src/headless/tests/fixtures'
-
-    test('validates claude-headless.json from disk', async () => {
-      const content = await Bun.file(`${fixturesDir}/claude-headless.json`).json()
-      const result = HeadlessAdapterSchema.safeParse(content)
-      expect(result.success).toBe(true)
-    })
-
-    test('validates gemini-headless.json from disk', async () => {
-      const content = await Bun.file(`${fixturesDir}/gemini-headless.json`).json()
-      const result = HeadlessAdapterSchema.safeParse(content)
-      expect(result.success).toBe(true)
-    })
-  })
-
-  describe('extract input/output fields', () => {
-    test('validates schema with input and output in extract config', () => {
-      const schemaWithIO = {
-        ...validClaudeSchema,
-        outputEvents: [
-          ...validClaudeSchema.outputEvents,
-          {
-            match: { path: '$.type', value: 'custom' },
-            emitAs: 'tool_call',
-            extract: { title: '$.name', input: '$.args', output: '$.result' },
-          },
-        ],
-      }
-      const result = HeadlessAdapterSchema.safeParse(schemaWithIO)
-      expect(result.success).toBe(true)
-    })
-
-    test('preserves extra extract fields via catchall', () => {
-      const schemaWithExtras = {
-        ...validClaudeSchema,
-        outputEvents: [
-          {
-            match: { path: '$.type', value: 'tool_use' },
-            emitAs: 'tool_call',
-            extract: {
-              title: '$.name',
-              status: "'pending'",
-              input: '$.input',
-              toolName: '$.name',
-              mcpServer: '$.server',
-            },
-          },
-        ],
-      }
-      const result = HeadlessAdapterSchema.safeParse(schemaWithExtras)
-      expect(result.success).toBe(true)
-      if (result.success) {
-        const extract = result.data.outputEvents![0]!.extract!
-        expect(extract.title).toBe('$.name')
-        expect(extract.input).toBe('$.input')
-        // Catchall fields aren't in the inferred type — cast needed to access them
-        expect((extract as Record<string, string>).toolName).toBe('$.name')
-        expect((extract as Record<string, string>).mcpServer).toBe('$.server')
-      }
-    })
-
-    test('rejects non-string extra extract fields', () => {
-      const schemaWithBadExtras = {
-        ...validClaudeSchema,
-        outputEvents: [
-          {
-            match: { path: '$.type', value: 'tool_use' },
-            emitAs: 'tool_call',
-            extract: { title: '$.name', badField: 123 },
-          },
-        ],
-      }
-      const result = HeadlessAdapterSchema.safeParse(schemaWithBadExtras)
-      expect(result.success).toBe(false)
-    })
-  })
-
-  describe('minimal valid schema', () => {
-    test('validates minimal required fields', () => {
-      const minimal = {
-        version: 1,
-        name: 'minimal',
-        command: ['agent'],
-        sessionMode: 'iterative',
-        prompt: {},
-        output: { flag: '--format', value: 'json' },
-        outputEvents: [],
-        result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' },
-      }
-      const result = HeadlessAdapterSchema.safeParse(minimal)
-      expect(result.success).toBe(true)
-    })
-  })
-
-  describe('stdin mode configuration', () => {
-    test('validates schema with stdin: true', () => {
-      const stdinSchema = {
-        version: 1,
-        name: 'stdin-agent',
-        command: ['agent', 'exec', '-'],
-        sessionMode: 'stream',
-        prompt: { stdin: true },
-        output: { flag: '--format', value: 'json' },
-        outputEvents: [],
-        result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' },
-      }
-      const result = HeadlessAdapterSchema.safeParse(stdinSchema)
-      expect(result.success).toBe(true)
-    })
-
-    test('validates schema with stdin: false', () => {
-      const stdinSchema = {
-        version: 1,
-        name: 'stdin-agent',
-        command: ['agent'],
-        sessionMode: 'stream',
-        prompt: { stdin: false, flag: '-p' },
-        output: { flag: '--format', value: 'json' },
-        outputEvents: [],
-        result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' },
-      }
-      const result = HeadlessAdapterSchema.safeParse(stdinSchema)
-      expect(result.success).toBe(true)
-    })
-
-    test('validates schema with positional prompt and - in command', () => {
-      const stdinSchema = {
-        version: 1,
-        name: 'codex-like',
-        command: ['codex', 'exec', '--json', '-'],
-        sessionMode: 'iterative',
-        prompt: { stdin: true },
-        output: { flag: '', value: '' },
-        outputEvents: [
-          {
-            match: { path: '$.item.type', value: 'agent_message' },
-            emitAs: 'message',
-            extract: { content: '$.item.text' },
-          },
-        ],
-        result: { matchPath: '$.type', matchValue: 'turn.completed', contentPath: '$.usage.output_tokens' },
-      }
-      const result = HeadlessAdapterSchema.safeParse(stdinSchema)
-      expect(result.success).toBe(true)
-    })
-  })
-
-  describe('invalid schemas', () => {
-    test('rejects missing version', () => {
-      const invalid = { ...validClaudeSchema, version: undefined }
-      const result = HeadlessAdapterSchema.safeParse(invalid)
-      expect(result.success).toBe(false)
-    })
-
-    test('rejects unsupported version', () => {
-      const invalid = { ...validClaudeSchema, version: 2 }
-      const result = HeadlessAdapterSchema.safeParse(invalid)
-      expect(result.success).toBe(false)
-    })
-
-    test('rejects invalid sessionMode', () => {
-      const invalid = { ...validClaudeSchema, sessionMode: 'batch' }
-      const result = HeadlessAdapterSchema.safeParse(invalid)
-      expect(result.success).toBe(false)
-    })
-
-    test('rejects missing command', () => {
-      const invalid = { ...validClaudeSchema, command: undefined }
-      const result = HeadlessAdapterSchema.safeParse(invalid)
-      expect(result.success).toBe(false)
-    })
-
-    test('rejects both flag and stdin specified', () => {
-      const invalid = {
-        ...validClaudeSchema,
-        prompt: {
-          flag: '-p',
-          stdin: true,
-        },
-      }
-      const result = HeadlessAdapterSchema.safeParse(invalid)
-      expect(result.success).toBe(false)
-      // Type assertion after checking success is false
-      const error = (result as { success: false; error: { issues: Array<{ message: string }> } }).error
-      expect(error.issues.length).toBeGreaterThan(0)
-      expect(error.issues[0]!.message).toContain("Cannot specify both 'flag' and 'stdin' modes")
-    })
-
-    test('rejects invalid emitAs type', () => {
-      const invalid = {
-        ...validClaudeSchema,
-        outputEvents: [
-          {
-            match: { path: '$.type', value: 'x' },
-            emitAs: 'invalid_type',
-          },
-        ],
-      }
-      const result = HeadlessAdapterSchema.safeParse(invalid)
-      expect(result.success).toBe(false)
-    })
-  })
-
-  describe('parseHeadlessConfig', () => {
-    test('returns parsed config for valid input', () => {
-      const config = parseHeadlessConfig(validClaudeSchema)
-      expect(config.name).toBe('claude-headless')
-      expect(config.command).toEqual(['claude'])
-      expect(config.sessionMode).toBe('stream')
-    })
-
-    test('throws for invalid input', () => {
-      expect(() => parseHeadlessConfig({ version: 99 })).toThrow()
-    })
-  })
-
-  describe('safeParseHeadlessConfig', () => {
-    test('returns success for valid input', () => {
-      const result = safeParseHeadlessConfig(validClaudeSchema)
-      expect(result.success).toBe(true)
-      if (result.success) {
-        expect(result.data.name).toBe('claude-headless')
-      }
-    })
-
-    test('returns failure for invalid input', () => {
-      const result = safeParseHeadlessConfig({ version: 99 })
-      expect(result.success).toBe(false)
-    })
-  })
-})
-
-// ============================================================================
-// JSONPath Tests
-// ============================================================================
-
-describe('jsonPath', () => {
-  const testObj = {
-    type: 'message',
-    message: {
-      text: 'Hello world',
-      nested: { value: 42 },
-    },
-    array: [1, 2, 3],
-  }
-
-  describe('basic extraction', () => {
-    test('extracts root field', () => {
-      expect(jsonPath(testObj, '$.type')).toBe('message')
-    })
-
-    test('extracts nested field', () => {
-      expect(jsonPath(testObj, '$.message.text')).toBe('Hello world')
-    })
-
-    test('extracts deeply nested field', () => {
-      expect(jsonPath(testObj, '$.message.nested.value')).toBe(42)
-    })
-
-    test('returns undefined for non-existent path', () => {
-      expect(jsonPath(testObj, '$.missing')).toBeUndefined()
-    })
-
-    test('returns undefined for non-existent nested path', () => {
-      expect(jsonPath(testObj, '$.message.missing.deep')).toBeUndefined()
-    })
-  })
-
-  describe('literal strings', () => {
-    test('returns literal string value', () => {
-      expect(jsonPath(testObj, "'pending'")).toBe('pending')
-    })
-
-    test('returns empty literal string', () => {
-      expect(jsonPath(testObj, "''")).toBe('')
-    })
-
-    test('returns literal with spaces', () => {
-      expect(jsonPath(testObj, "'hello world'")).toBe('hello world')
-    })
-  })
-
-  describe('edge cases', () => {
-    test('handles null input', () => {
-      expect(jsonPath(null, '$.type')).toBeUndefined()
-    })
-
-    test('handles undefined input', () => {
-      expect(jsonPath(undefined, '$.type')).toBeUndefined()
-    })
-
-    test('handles non-object input', () => {
-      expect(jsonPath('string', '$.type')).toBeUndefined()
-    })
-
-    test('handles invalid path format', () => {
-      expect(jsonPath(testObj, 'type')).toBeUndefined()
-    })
-  })
-})
-
-describe('jsonPathString', () => {
-  test('extracts string value', () => {
-    expect(jsonPathString({ text: 'hello' }, '$.text')).toBe('hello')
-  })
-
-  test('converts number to string', () => {
-    expect(jsonPathString({ num: 42 }, '$.num')).toBe('42')
-  })
-
-  test('returns undefined for missing path', () => {
-    expect(jsonPathString({ x: 1 }, '$.y')).toBeUndefined()
-  })
-
-  test('returns undefined for null value', () => {
-    expect(jsonPathString({ x: null }, '$.x')).toBeUndefined()
-  })
-})
-
-// ============================================================================
-// Output Parser Tests
-// ============================================================================
-
-describe('createOutputParser', () => {
-  const config = parseHeadlessConfig(validClaudeSchema)
-  const parser = createOutputParser(config)
-
-  describe('parseLine', () => {
-    test('maps assistant type to message', () => {
-      const line = JSON.stringify({ type: 'assistant', message: { text: 'Hello' } })
-      const result = parser.parseLine(line)
-      expect(result).not.toBeNull()
-      // Handle both single result and array of results
-      const singleResult = Array.isArray(result) ? result[0] : result
-      expect(singleResult?.type).toBe('message')
-      expect(singleResult?.content).toBe('Hello')
-    })
-
-    test('maps tool_use type to tool_call', () => {
-      const line = JSON.stringify({ type: 'tool_use', name: 'Read' })
-      const result = parser.parseLine(line)
-      expect(result).not.toBeNull()
-      // Handle both single result and array of results
-      const singleResult = Array.isArray(result) ? result[0] : result
-      expect(singleResult?.type).toBe('tool_call')
-      expect(singleResult?.title).toBe('Read')
-      expect(singleResult?.status).toBe('pending')
-    })
-
-    test('returns null for unmapped event types', () => {
-      const line = JSON.stringify({ type: 'unknown', data: 'test' })
-      const result = parser.parseLine(line)
-      expect(result).toBeNull()
-    })
-
-    test('returns null for invalid JSON', () => {
-      const result = parser.parseLine('not valid json')
-      expect(result).toBeNull()
-    })
-
-    test('returns null for empty line', () => {
-      const result = parser.parseLine('')
-      expect(result).toBeNull()
-    })
-
-    test('preserves raw event in result', () => {
-      const event = { type: 'assistant', message: { text: 'Hi' } }
-      const line = JSON.stringify(event)
-      const result = parser.parseLine(line)
-      // Handle both single result and array of results
-      const singleResult = Array.isArray(result) ? result[0] : result
-      expect(singleResult?.raw).toEqual(event)
-    })
-
-    test('extracts input from tool_use event', () => {
-      const line = JSON.stringify({ type: 'tool_use', name: 'Read', input: { file_path: '/test.ts' } })
-      const result = parser.parseLine(line)
-      const singleResult = Array.isArray(result) ? result[0] : result
-      expect(singleResult?.input).toEqual({ file_path: '/test.ts' })
-    })
-
-    test('extracts output from tool_result event', () => {
-      const line = JSON.stringify({ type: 'tool_result', name: 'Read', content: 'file contents' })
-      const result = parser.parseLine(line)
-      const singleResult = Array.isArray(result) ? result[0] : result
-      expect(singleResult?.output).toBe('file contents')
-    })
-
-    test('sets timestamp on parsed updates', () => {
-      const before = Date.now()
-      const line = JSON.stringify({ type: 'assistant', message: { text: 'Hello' } })
-      const result = parser.parseLine(line)
-      const after = Date.now()
-      const singleResult = Array.isArray(result) ? result[0] : result
-      expect(singleResult?.timestamp).toBeGreaterThanOrEqual(before)
-      expect(singleResult?.timestamp).toBeLessThanOrEqual(after)
-    })
-  })
-
-  describe('parseLine with extra extract fields', () => {
-    test('extra extract fields do not break parser', () => {
-      const configWithExtras = parseHeadlessConfig({
-        version: 1,
-        name: 'extras-test',
-        command: ['test'],
-        sessionMode: 'stream',
-        prompt: { flag: '-p' },
-        output: { flag: '--output', value: 'json' },
-        outputEvents: [
-          {
-            match: { path: '$.type', value: 'tool_use' },
-            emitAs: 'tool_call',
-            extract: {
-              title: '$.name',
-              status: "'pending'",
-              input: '$.input',
-              toolName: '$.name',
-              mcpServer: '$.server',
-            },
-          },
-        ],
-        result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' },
-      })
-      const extrasParser = createOutputParser(configWithExtras)
-      const line = JSON.stringify({
-        type: 'tool_use',
-        name: 'WebSearch',
-        input: { query: 'test' },
-        server: 'mcp-search',
-      })
-      const result = extrasParser.parseLine(line)
-      const singleResult = Array.isArray(result) ? result[0] : result
-      expect(singleResult).not.toBeNull()
-      expect(singleResult?.type).toBe('tool_call')
-      expect(singleResult?.title).toBe('WebSearch')
-      expect(singleResult?.input).toEqual({ query: 'test' })
-    })
-  })
-
-  describe('parseLine with array wildcards', () => {
-    const wildcardConfig = parseHeadlessConfig({
-      version: 1,
-      name: 'wildcard-test',
-      command: ['test'],
-      sessionMode: 'stream',
-      prompt: { flag: '-p' },
-      output: { flag: '--output', value: 'json' },
-      outputEvents: [
-        {
-          match: { path: '$.message.content[*].type', value: 'tool_use' },
-          emitAs: 'tool_call',
-          extract: { title: '$.name', status: "'pending'" },
-        },
-        {
-          match: { path: '$.items[*]', value: '*' },
-          emitAs: 'message',
-          extract: { content: '$.text' },
-        },
-      ],
-      result: {
-        matchPath: '$.type',
-        matchValue: 'result',
-        contentPath: '$.output',
-      },
-    })
-    const wildcardParser = createOutputParser(wildcardConfig)
-
-    test('returns array of updates for matching array items', () => {
-      const line = JSON.stringify({
-        message: {
-          content: [
-            { type: 'tool_use', name: 'Read', input: {} },
-            { type: 'text', value: 'Hello' },
-            { type: 'tool_use', name: 'Write', input: {} },
-          ],
-        },
-      })
-      const result = wildcardParser.parseLine(line)
-      expect(Array.isArray(result)).toBe(true)
-      if (Array.isArray(result)) {
-        expect(result).toHaveLength(2)
-        expect(result[0]!.type).toBe('tool_call')
-        expect(result[0]!.title).toBe('Read')
-        expect(result[0]!.status).toBe('pending')
-        expect(result[1]!.type).toBe('tool_call')
-        expect(result[1]!.title).toBe('Write')
-        expect(result[1]!.status).toBe('pending')
-      }
-    })
-
-    test('handles empty array gracefully', () => {
-      const line = JSON.stringify({
-        message: { content: [] },
-      })
-      const result = wildcardParser.parseLine(line)
-      expect(result).toBeNull()
-    })
-
-    test('handles non-matching array items', () => {
-      const line = JSON.stringify({
-        message: {
-          content: [
-            { type: 'text', value: 'No tool use here' },
-            { type: 'image', data: 'base64...' },
-          ],
-        },
-      })
-      const result = wildcardParser.parseLine(line)
-      expect(result).toBeNull()
-    })
-
-    test('matches wildcard value for all non-null items', () => {
-      const line = JSON.stringify({
-        items: [{ text: 'Item 1' }, { text: 'Item 2' }, { text: 'Item 3' }],
-      })
-      const result = wildcardParser.parseLine(line)
-      expect(Array.isArray(result)).toBe(true)
-      if (Array.isArray(result)) {
-        expect(result).toHaveLength(3)
-        expect(result[0]!.content).toBe('Item 1')
-        expect(result[1]!.content).toBe('Item 2')
-        expect(result[2]!.content).toBe('Item 3')
-      }
-    })
-
-    test('handles mixed array content with type guards', () => {
-      const line = JSON.stringify({
-        message: {
-          content: [
-            { type: 'tool_use', name: 'Valid' },
-            'string-item',
-            { no_type_property: true },
-            null,
-            { type: 'tool_use', name: 'AlsoValid' },
-          ],
-        },
-      })
-      const result = wildcardParser.parseLine(line)
-      expect(Array.isArray(result)).toBe(true)
-      if (Array.isArray(result)) {
-        expect(result).toHaveLength(2)
-        expect(result[0]!.title).toBe('Valid')
-        expect(result[1]!.title).toBe('AlsoValid')
-      }
-    })
-  })
-
-  describe('jsonPath with array wildcard', () => {
-    test('extracts array with [*] wildcard', () => {
-      const obj = { items: [{ id: 1 }, { id: 2 }] }
-      const result = jsonPath(obj, '$.items[*]')
-      expect(Array.isArray(result)).toBe(true)
-      if (Array.isArray(result)) {
-        expect(result).toHaveLength(2)
-      }
-    })
-
-    test('returns undefined for non-array at wildcard position', () => {
-      const obj = { items: 'not-an-array' }
-      const result = jsonPath(obj, '$.items[*]')
-      expect(result).toBeUndefined()
-    })
-
-    test('handles empty array', () => {
-      const obj = { items: [] }
-      const result = jsonPath(obj, '$.items[*]')
-      expect(result).toEqual([])
-    })
-
-    test('handles nested path to array', () => {
-      const obj = { message: { content: [1, 2, 3] } }
-      const result = jsonPath(obj, '$.message.content[*]')
-      expect(result).toEqual([1, 2, 3])
-    })
-
-    test('returns undefined when path before wildcard is invalid', () => {
-      const obj = { items: [1, 2, 3] }
-      const result = jsonPath(obj, '$.missing[*]')
-      expect(result).toBeUndefined()
-    })
-  })
-
-  describe('parseResult', () => {
-    test('detects result event', () => {
-      const line = JSON.stringify({ type: 'result', result: 'Final answer' })
-      const result = parser.parseResult(line)
-      expect(result.isResult).toBe(true)
-      if (result.isResult) {
-        expect(result.content).toBe('Final answer')
-      }
-    })
-
-    test('returns not-result for non-result events', () => {
-      const line = JSON.stringify({ type: 'assistant', message: { text: 'Hi' } })
-      const result = parser.parseResult(line)
-      expect(result.isResult).toBe(false)
-    })
-
-    test('returns not-result for invalid JSON', () => {
-      const result = parser.parseResult('invalid')
-      expect(result.isResult).toBe(false)
-    })
-
-    test('handles missing content path', () => {
-      const line = JSON.stringify({ type: 'result' })
-      const result = parser.parseResult(line)
-      expect(result.isResult).toBe(true)
-      if (result.isResult) {
-        expect(result.content).toBe('')
-      }
-    })
-  })
-})
-
-// ============================================================================
-// Passthrough Mode Tests
-// ============================================================================
-
-describe('passthrough mode', () => {
-  const passthroughConfig = parseHeadlessConfig({
-    version: 1,
-    name: 'passthrough-test',
-    command: ['test-agent'],
-    sessionMode: 'stream',
-    prompt: { flag: '-p' },
-    output: { flag: '--output', value: 'json' },
-    outputMode: 'passthrough',
-    passthroughTypeMap: {
-      typeField: 'type',
-      typeValues: { tool_use: 'tool_call', tool_result: 'tool_call' },
-    },
-    result: { matchPath: '$.type', matchValue: 'result', contentPath: '$.content' },
-  })
-  const passthroughParser = createOutputParser(passthroughConfig)
-
-  test('extracts input from tool_call event', () => {
-    const line = JSON.stringify({ type: 'tool_use', name: 'Read', input: { file_path: '/test.ts' }, status: 'pending' })
-    const result = passthroughParser.parseLine(line)
-    const singleResult = Array.isArray(result) ? result[0] : result
-    expect(singleResult?.type).toBe('tool_call')
-    expect(singleResult?.input).toEqual({ file_path: '/test.ts' })
-  })
-
-  test('extracts output from tool_result event', () => {
-    const line = JSON.stringify({ type: 'tool_result', name: 'Read', output: 'file contents', status: 'completed' })
-    const result = passthroughParser.parseLine(line)
-    const singleResult = Array.isArray(result) ? result[0] : result
-    expect(singleResult?.type).toBe('tool_call')
-    expect(singleResult?.output).toBe('file contents')
-  })
-
-  test('preserves object input type', () => {
-    const line = JSON.stringify({ type: 'tool_use', name: 'Write', input: { path: '/a.ts', content: 'code' } })
-    const result = passthroughParser.parseLine(line)
-    const singleResult = Array.isArray(result) ? result[0] : result
-    expect(singleResult?.input).toEqual({ path: '/a.ts', content: 'code' })
-  })
-
-  test('sets timestamp on passthrough updates', () => {
-    const before = Date.now()
-    const line = JSON.stringify({ type: 'message', content: 'Hello' })
-    const result = passthroughParser.parseLine(line)
-    const after = Date.now()
-    const singleResult = Array.isArray(result) ? result[0] : result
-    expect(singleResult?.timestamp).toBeGreaterThanOrEqual(before)
-    expect(singleResult?.timestamp).toBeLessThanOrEqual(after)
-  })
-
-  test('handles absent input/output fields gracefully', () => {
-    const line = JSON.stringify({ type: 'tool_use', name: 'Bash', status: 'pending' })
-    const result = passthroughParser.parseLine(line)
-    const singleResult = Array.isArray(result) ? result[0] : result
-    expect(singleResult?.type).toBe('tool_call')
-    expect(singleResult?.input).toBeUndefined()
-    expect(singleResult?.output).toBeUndefined()
-  })
-})
-
-// ============================================================================
-// History Builder Tests
-// ============================================================================
-
-describe('createHistoryBuilder', () => {
-  describe('basic operations', () => {
-    test('starts with empty history', () => {
-      const builder = createHistoryBuilder()
-      expect(builder.getLength()).toBe(0)
-      expect(builder.getHistory()).toEqual([])
-    })
-
-    test('adds turns to history', () => {
-      const builder = createHistoryBuilder()
-      builder.addTurn('Hello', 'Hi there')
-      expect(builder.getLength()).toBe(1)
-      expect(builder.getHistory()).toEqual([{ input: 'Hello', output: 'Hi there' }])
-    })
-
-    test('accumulates multiple turns', () => {
-      const builder = createHistoryBuilder()
-      builder.addTurn('Hello', 'Hi')
-      builder.addTurn('How are you?', 'Fine')
-      expect(builder.getLength()).toBe(2)
-    })
-
-    test('clears history', () => {
-      const builder = createHistoryBuilder()
-      builder.addTurn('Hello', 'Hi')
-      builder.clear()
-      expect(builder.getLength()).toBe(0)
-    })
-  })
-
-  describe('formatHistory', () => {
-    test('uses default template', () => {
-      const builder = createHistoryBuilder()
-      builder.addTurn('Hello', 'Hi there')
-      const formatted = builder.formatHistory()
-      expect(formatted).toBe('User: Hello\nAssistant: Hi there')
-    })
-
-    test('uses custom template', () => {
-      const builder = createHistoryBuilder({
-        template: 'Q: {{input}}\nA: {{output}}',
-      })
-      builder.addTurn('Question', 'Answer')
-      const formatted = builder.formatHistory()
-      expect(formatted).toBe('Q: Question\nA: Answer')
-    })
-
-    test('separates multiple turns with double newline', () => {
-      const builder = createHistoryBuilder()
-      builder.addTurn('First', 'One')
-      builder.addTurn('Second', 'Two')
-      const formatted = builder.formatHistory()
-      expect(formatted).toBe('User: First\nAssistant: One\n\nUser: Second\nAssistant: Two')
-    })
-
-    test('returns empty string for no history', () => {
-      const builder = createHistoryBuilder()
-      expect(builder.formatHistory()).toBe('')
-    })
-  })
-
-  describe('buildPrompt', () => {
-    test('returns just input for first turn', () => {
-      const builder = createHistoryBuilder()
-      const prompt = builder.buildPrompt('Hello')
-      expect(prompt).toBe('Hello')
-    })
-
-    test('includes history for subsequent turns', () => {
-      const builder = createHistoryBuilder()
-      builder.addTurn('Hello', 'Hi')
-      const prompt = builder.buildPrompt('Next question')
-      expect(prompt).toContain('User: Hello')
-      expect(prompt).toContain('Assistant: Hi')
-      expect(prompt).toContain('User: Next question')
-    })
-
-    test('builds complete context with multiple turns', () => {
-      const builder = createHistoryBuilder()
-      builder.addTurn('One', 'Reply one')
-      builder.addTurn('Two', 'Reply two')
-      const prompt = builder.buildPrompt('Three')
-      expect(prompt).toContain('User: One')
-      expect(prompt).toContain('User: Two')
-      expect(prompt).toContain('User: Three')
-    })
-  })
-
-  describe('getHistory returns copy', () => {
-    test('modifying returned array does not affect internal state', () => {
-      const builder = createHistoryBuilder()
-      builder.addTurn('Hello', 'Hi')
-      const history = builder.getHistory()
-      history.push({ input: 'Fake', output: 'Fake' })
-      expect(builder.getLength()).toBe(1)
-    })
-  })
-})
diff --git a/src/integration_tests/claude.spec.ts b/src/integration_tests/claude.spec.ts
deleted file mode 100644
index aacb491..0000000
--- a/src/integration_tests/claude.spec.ts
+++ /dev/null
@@ -1,157 +0,0 @@
-/**
- * Integration tests for Claude Code headless adapter.
- *
- * @remarks
- * Tests verify the headless session manager works correctly with Claude Code CLI
- * using the schema-driven headless adapter approach.
- *
- * Run locally with API key:
- * ```bash
- * ANTHROPIC_API_KEY=sk-... bun test ./src/integration_tests/claude.spec.ts
- * ```
- *
- * Prerequisites:
- * 1. Claude CLI installed (`curl -fsSL https://claude.ai/install.sh | bash`)
- * 2. API key: `ANTHROPIC_API_KEY` environment variable
- *
- * These tests make real API calls and consume credits.
- */
-
-import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test'
-import { join } from 'node:path'
-import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
-import { createSessionManager } from '../headless/headless-session-manager.ts'
-
-// Long timeout for real agent interactions (2 minutes)
-setDefaultTimeout(120000)
-
-// Use project root as cwd - agents discover MCP servers from config files
-const PROJECT_ROOT = process.cwd()
-
-// Schema path for Claude headless adapter
-const SCHEMA_PATH = join(PROJECT_ROOT, 'src/headless/tests/fixtures/claude-headless.json')
-
-// Get API key from environment
-const API_KEY = process.env.ANTHROPIC_API_KEY ?? ''
-
-// Skip all tests if no API key is available
-const describeWithApiKey = API_KEY ? describe : describe.skip
-
-describeWithApiKey('Claude Code Integration', () => {
-  let sessionManager: ReturnType<typeof createSessionManager>
-  let schemaConfig: ReturnType<typeof parseHeadlessConfig>
-
-  beforeAll(async () => {
-    // Load JSON from file, then parse with Zod schema
-    const schemaJson = await Bun.file(SCHEMA_PATH).json()
-    schemaConfig = parseHeadlessConfig(schemaJson)
-
-    // Create session manager with the schema
-    sessionManager = createSessionManager({
-      schema: schemaConfig,
-      timeout: 120000,
-      debug: false,
-    })
-  })
-
-  afterAll(async () => {
-    // Cleanup handled automatically by session manager
-  })
-
-  test('creates session successfully', async () => {
-    const session = await sessionManager.create(PROJECT_ROOT)
-
-    expect(session).toBeDefined()
-    expect(session.id).toBeDefined()
-    expect(typeof session.id).toBe('string')
-    expect(session.active).toBe(true)
-    expect(session.cwd).toBe(PROJECT_ROOT)
-  })
-
-  test('sends prompt and receives response', async () => {
-    const session = await sessionManager.create(PROJECT_ROOT)
-
-    // Simple prompt that doesn't require tools
-    const result = await sessionManager.prompt(session.id, 'What is 2 + 2? Reply with just the number.')
-
-    expect(result).toBeDefined()
-    expect(result.output).toBeDefined()
-    expect(result.output.length).toBeGreaterThan(0)
-    expect(result.updates).toBeInstanceOf(Array)
-
-    // Should contain "4" somewhere in the response
-    expect(result.output).toMatch(/4/)
-  })
-
-  test('collects trajectory updates during execution', async () => {
-    const session = await sessionManager.create(PROJECT_ROOT)
-    const collectedUpdates: unknown[] = []
-
-    const result = await sessionManager.prompt(session.id, 'Say "hello" and nothing else.', (update) => {
-      collectedUpdates.push(update)
-    })
-
-    expect(result.updates.length).toBeGreaterThan(0)
-
-    // Should have at least one message update
-    const messageUpdates = result.updates.filter((u) => u.type === 'message')
-    expect(messageUpdates.length).toBeGreaterThan(0)
-  })
-
-  test('uses MCP server from project config', async () => {
-    // This test verifies that Claude discovers MCP servers from .mcp.json
-    // The bun-docs MCP server is configured at project root
-    const session = await sessionManager.create(PROJECT_ROOT)
-
-    // Query the bun-docs MCP server (configured in .mcp.json)
-    const result = await sessionManager.prompt(
-      session.id,
-      'Use the bun-docs MCP server to search for information about Bun.serve(). ' +
-        'What are the key options for creating an HTTP server with Bun?',
-    )
-
-    // Response should contain Bun server-related information
-    expect(result.output.length).toBeGreaterThan(0)
-    // Should mention server/HTTP-related concepts from Bun docs
-    expect(result.output.toLowerCase()).toMatch(/serve|server|http|port|fetch|handler/)
-  })
-
-  test('multi-turn conversation maintains context (stream mode)', async () => {
-    // Multi-turn: multiple prompts to same session
-    const session = await sessionManager.create(PROJECT_ROOT)
-
-    // Turn 1: Establish context
-    const turn1Result = await sessionManager.prompt(session.id, 'Remember this number: 42. Just confirm you have it.')
-    expect(turn1Result.output).toMatch(/42|forty.?two|remember/i)
-
-    // Turn 2: Reference previous context
-    const turn2Result = await sessionManager.prompt(
-      session.id,
-      'What number did I ask you to remember? Reply with just the number.',
-    )
-    expect(turn2Result.output).toMatch(/42/)
-  })
-
-  test('receives valid trajectory updates', async () => {
-    const session = await sessionManager.create(PROJECT_ROOT)
-
-    // Prompt that generates a response with trajectory updates
-    const result = await sessionManager.prompt(
-      session.id,
-      'What programming language is this project written in? Look at the file extensions.',
-    )
-
-    // Result should have output
-    expect(result.output).toBeDefined()
-    expect(result.output.length).toBeGreaterThan(0)
-
-    // Should have collected updates during execution
-    expect(result.updates).toBeInstanceOf(Array)
-    expect(result.updates.length).toBeGreaterThan(0)
-
-    // All updates should have valid types
-    const validTypes = ['thought', 'tool_call', 'message', 'plan']
-    const allValidTypes = result.updates.every((u) => validTypes.includes(u.type))
-    expect(allValidTypes).toBe(true)
-  })
-})
diff --git a/src/integration_tests/gemini.spec.ts b/src/integration_tests/gemini.spec.ts
deleted file mode 100644
index d95216f..0000000
--- a/src/integration_tests/gemini.spec.ts
+++ /dev/null
@@ -1,139 +0,0 @@
-/**
- * Integration tests for Gemini CLI headless adapter.
- *
- * @remarks
- * Tests verify the headless session manager works correctly with Gemini CLI
- * using the schema-driven headless adapter approach.
- *
- * Run locally with API key:
- * ```bash
- * GEMINI_API_KEY=... bun test ./src/integration_tests/gemini.spec.ts
- * ```
- *
- * Prerequisites:
- * 1. Gemini CLI installed (`npm install -g @google/gemini-cli`)
- * 2. API key: `GEMINI_API_KEY` environment variable
- *
- * These tests make real API calls and consume credits.
- */
-
-import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test'
-import { join } from 'node:path'
-import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
-import { createSessionManager } from '../headless/headless-session-manager.ts'
-
-// Long timeout for real agent interactions (2 minutes)
-setDefaultTimeout(120000)
-
-// Use project root as cwd - agents discover MCP servers from config files
-const PROJECT_ROOT = process.cwd()
-
-// Schema path for Gemini headless adapter
-const SCHEMA_PATH = join(PROJECT_ROOT, 'src/headless/tests/fixtures/gemini-headless.json')
-
-// Get API key from environment
-const GEMINI_API_KEY = process.env.GEMINI_API_KEY ?? ''
-
-// Skip all tests if no API key is available
-const describeWithApiKey = GEMINI_API_KEY ? describe : describe.skip
-
-describeWithApiKey('Gemini CLI Integration', () => {
-  let sessionManager: ReturnType<typeof createSessionManager>
-  let schemaConfig: ReturnType<typeof parseHeadlessConfig>
-
-  beforeAll(async () => {
-    // Load JSON from file, then parse with Zod schema
-    const schemaJson = await Bun.file(SCHEMA_PATH).json()
-    schemaConfig = parseHeadlessConfig(schemaJson)
-
-    // Create session manager with the schema
-    sessionManager = createSessionManager({
-      schema: schemaConfig,
-      timeout: 120000,
-      debug: false,
-    })
-  })
-
-  afterAll(async () => {
-    // Cleanup handled automatically by session manager
-  })
-
-  test('creates session successfully', async () => {
-    const session = await sessionManager.create(PROJECT_ROOT)
-
-    expect(session).toBeDefined()
-    expect(session.id).toBeDefined()
-    expect(typeof session.id).toBe('string')
-    expect(session.active).toBe(true)
-    expect(session.cwd).toBe(PROJECT_ROOT)
-  })
-
-  test('sends prompt and receives response', async () => {
-    const session = await sessionManager.create(PROJECT_ROOT)
-
-    // Simple prompt that doesn't require tools
-    const result = await sessionManager.prompt(session.id, 'What is 2 + 2? Reply with just the number.')
-
-    expect(result).toBeDefined()
-    expect(result.output).toBeDefined()
-    expect(result.output.length).toBeGreaterThan(0)
-    expect(result.updates).toBeInstanceOf(Array)
-
-    // Should contain "4" somewhere in the response
-    expect(result.output).toMatch(/4/)
-  })
-
-  test('collects trajectory updates during execution', async () => {
-    const session = await sessionManager.create(PROJECT_ROOT)
-    const collectedUpdates: unknown[] = []
-
-    const result = await sessionManager.prompt(session.id, 'Say "hello" and nothing else.', (update) => {
-      collectedUpdates.push(update)
-    })
-
-    expect(result.updates.length).toBeGreaterThan(0)
-
-    // Should have at least one message update
-    const messageUpdates = result.updates.filter((u) => u.type === 'message')
-    expect(messageUpdates.length).toBeGreaterThan(0)
-  })
-
-  test('multi-turn conversation maintains context (iterative mode)', async () => {
-    // Multi-turn via headless adapter in iterative mode (history accumulation)
-    const session = await sessionManager.create(PROJECT_ROOT)
-
-    // Turn 1: Establish context
-    const turn1Result = await sessionManager.prompt(session.id, 'Remember this number: 42. Just confirm you have it.')
-    expect(turn1Result.output).toMatch(/42|forty.?two|remember/i)
-
-    // Turn 2: Reference previous context
-    const turn2Result = await sessionManager.prompt(
-      session.id,
-      'What number did I ask you to remember? Reply with just the number.',
-    )
-    expect(turn2Result.output).toMatch(/42/)
-  })
-
-  test('handles simple math question correctly', async () => {
-    const session = await sessionManager.create(PROJECT_ROOT)
-
-    const result = await sessionManager.prompt(session.id, 'Calculate 15 * 7. Reply with just the number.')
-
-    // Gemini CLI may include formatting variations (newlines, spaces)
-    // Strip whitespace to verify the correct answer is present
-    expect(result.output.replace(/\s/g, '')).toContain('105')
-  })
-
-  test('processes longer response without timeout', async () => {
-    const session = await sessionManager.create(PROJECT_ROOT)
-
-    const result = await sessionManager.prompt(
-      session.id,
-      'List 5 programming languages and one key feature of each. Be brief.',
-    )
-
-    expect(result.output.length).toBeGreaterThan(50)
-    // Should mention at least some programming languages
-    expect(result.output.toLowerCase()).toMatch(/python|javascript|java|rust|go|typescript|c\+\+|ruby/)
-  })
-})
diff --git a/src/pipeline.ts b/src/pipeline.ts
deleted file mode 100644
index 7c3c1bd..0000000
--- a/src/pipeline.ts
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * Pipeline commands re-export.
- *
- * @remarks
- * Public API for pipeline commands. Import from here for external use.
- *
- * @packageDocumentation
- */
-
-export {
-  // Types
-  type CompareConfig,
-  type ComparisonGrader,
-  type ComparisonGraderInput,
-  type ComparisonGraderResult,
-  type ComparisonRanking,
-  type ComparisonResult,
-  // Commands
-  compare,
-  type ExtractConfig,
-  type ExtractedResult,
-  extract,
-  type FormatConfig,
-  type FormatStyle,
-  format,
-  type GradeConfig,
-  type GradedResult,
-  grade,
-  type LabeledRun,
-  type RawOutput,
-  type RunConfig,
-  type RunMode,
-  run,
-} from './pipeline/pipeline.ts'
diff --git a/src/pipeline/compare-format-detection.ts b/src/pipeline/compare-format-detection.ts
deleted file mode 100644
index edbc0b6..0000000
--- a/src/pipeline/compare-format-detection.ts
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Format detection for compare command.
- *
- * @remarks
- * Auto-detects whether input files contain CaptureResult or TrialResult data
- * by inspecting the first line of the JSONL file.
- *
- * Detection logic:
- * - TrialResult: has `trials` array and `k` number
- * - CaptureResult: has `trajectory` array and `timing` object
- *
- * @packageDocumentation
- */
-
-/** Detected input format for compare command */
-export type CompareInputFormat = 'capture' | 'trials'
-
-/**
- * Detect input format from JSONL file.
- *
- * @remarks
- * Reads the first non-empty line of the file and checks for
- * discriminating fields to determine the format.
- *
- * @param path - Path to JSONL file
- * @returns Detected format ('capture' or 'trials')
- * @throws Error if file is empty or format cannot be detected
- *
- * @public
- */
-export const detectInputFormat = async (path: string): Promise<CompareInputFormat> => {
-  const file = Bun.file(path)
-  const text = await file.text()
-  const firstLine = text.split('\n').find((line) => line.trim())
-
-  if (!firstLine) {
-    throw new Error(`Empty file: ${path}`)
-  }
-
-  let parsed: unknown
-  try {
-    parsed = JSON.parse(firstLine)
-  } catch {
-    throw new Error(`Invalid JSON in first line of: ${path}`)
-  }
-
-  if (typeof parsed !== 'object' || parsed === null) {
-    throw new Error(`Expected object in first line of: ${path}`)
-  }
-
-  const obj = parsed as Record<string, unknown>
-
-  // TrialResult has `trials` array and `k` number
-  if ('trials' in obj && Array.isArray(obj.trials) && 'k' in obj && typeof obj.k === 'number') {
-    return 'trials'
-  }
-
-  // CaptureResult has `trajectory` array and `timing` object
-  if ('trajectory' in obj && Array.isArray(obj.trajectory) && 'timing' in obj && typeof obj.timing === 'object') {
-    return 'capture'
-  }
-
-  throw new Error(
-    `Unable to detect format for: ${path}. ` +
-      `Expected either TrialResult (with trials/k fields) or CaptureResult (with trajectory/timing fields).`,
-  )
-}
-
-/**
- * Validate that all files have the same format.
- *
- * @param paths - Paths to JSONL files
- * @returns Detected format (all files must match)
- * @throws Error if files have different formats
- *
- * @public
- */
-export const detectAndValidateFormat = async (paths: string[]): Promise<CompareInputFormat> => {
-  const firstPath = paths[0]
-  if (!firstPath) {
-    throw new Error('No files provided for format detection')
-  }
-
-  const format = await detectInputFormat(firstPath)
-
-  for (let i = 1; i < paths.length; i++) {
-    const path = paths[i]
-    if (!path) continue
-
-    const otherFormat = await detectInputFormat(path)
-    if (otherFormat !== format) {
-      throw new Error(
-        `Format mismatch: ${firstPath} is ${format}, but ${path} is ${otherFormat}. ` +
-          `All files must have the same format.`,
-      )
-    }
-  }
-
-  return format
-}
diff --git a/src/pipeline/compare-trials.ts b/src/pipeline/compare-trials.ts
deleted file mode 100644
index d476c7e..0000000
--- a/src/pipeline/compare-trials.ts
+++ /dev/null
@@ -1,800 +0,0 @@
-/**
- * Pipeline compare command for trials data.
- *
- * @remarks
- * Compares multiple runs of TrialResult data, analyzing capability (passAtK),
- * reliability (passExpK), and flakiness metrics.
- *
- * Outputs a TrialsComparisonReport JSON (not JSONL) containing aggregate
- * statistics across all dimensions plus head-to-head comparisons.
- *
- * Built-in strategies:
- * - `weighted`: Configurable weights for capability, reliability, consistency (default)
- * - `statistical`: Bootstrap sampling for confidence intervals on passAtK
- *
- * @packageDocumentation
- */
-
-import { logProgress, writeOutput } from '../core.ts'
-import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
-import { grade as statisticalGrade } from '../graders/trials-compare-statistical.ts'
-import { grade as weightedGrade } from '../graders/trials-compare-weighted.ts'
-import type {
-  PairwiseComparison,
-  TrialResult,
-  TrialsCapabilityMetrics,
-  TrialsComparisonMeta,
-  TrialsComparisonReport,
-  TrialsFlakinessMetrics,
-  TrialsPerformanceMetrics,
-  TrialsPromptComparison,
-  TrialsQualityMetrics,
-  TrialsReliabilityMetrics,
-} from '../schemas.ts'
-import { TrialResultSchema } from '../schemas.ts'
-import { computeLatencyStats, percentile } from './compare-utils.ts'
-import type {
-  ComparisonGraderResult,
-  LabeledRun,
-  TrialsComparisonGrader,
-  TrialsComparisonGraderInput,
-  TrialsComparisonRunData,
-} from './pipeline.types.ts'
-
-/** Comparison strategy type for trials */
-export type TrialsCompareStrategy = 'weighted' | 'statistical' | 'custom'
-
-/** Extended compare config for trials */
-export type TrialsCompareConfig = {
-  /** Labeled runs to compare */
-  runs: LabeledRun[]
-  /** Comparison strategy (default: weighted) */
-  strategy?: TrialsCompareStrategy
-  /** Path to custom grader (required if strategy is 'custom') */
-  graderPath?: string
-  /** Output file path */
-  outputPath?: string
-  /** Show progress to stderr */
-  progress?: boolean
-  /** Output format (default: json) */
-  format?: 'json' | 'markdown'
-}
-
-/**
- * Stream trial results from a JSONL file.
- *
- * @param path - Path to the trials.jsonl file
- * @yields Parsed and validated trial results
- */
-async function* streamTrialResults(path: string): AsyncGenerator<TrialResult, void, unknown> {
-  const file = Bun.file(path)
-  const text = await file.text()
-  const lines = text.split('\n')
-
-  for (let i = 0; i < lines.length; i++) {
-    const line = lines[i]?.trim()
-    if (!line) continue
-
-    try {
-      yield TrialResultSchema.parse(JSON.parse(line))
-    } catch (error) {
-      throw new Error(`Invalid trial result at line ${i + 1}: ${error instanceof Error ? error.message : error}`)
-    }
-  }
-}
-
-/**
- * Build an indexed map of trial results by ID.
- *
- * @param path - Path to the trials.jsonl file
- * @returns Map of result ID to TrialResult
- */
-export const buildTrialsIndex = async (path: string): Promise<Map<string, TrialResult>> => {
-  const index = new Map<string, TrialResult>()
-
-  for await (const result of streamTrialResults(path)) {
-    index.set(result.id, result)
-  }
-
-  return index
-}
-
-/**
- * Load trials comparison grader from file.
- *
- * @param path - Path to grader module
- * @returns Loaded trials comparison grader function
- * @throws Error if module cannot be loaded or doesn't export a grader function
- */
-const loadTrialsComparisonGrader = async (path: string): Promise<TrialsComparisonGrader> => {
-  let module: Record<string, unknown>
-  try {
-    module = (await import(path)) as Record<string, unknown>
-  } catch (error) {
-    throw new Error(`Failed to load grader from '${path}': ${error instanceof Error ? error.message : error}`)
-  }
-
-  if (typeof module.grade === 'function') {
-    return module.grade as TrialsComparisonGrader
-  }
-  if (typeof module.default === 'function') {
-    return module.default as TrialsComparisonGrader
-  }
-  if (typeof module.compare === 'function') {
-    return module.compare as TrialsComparisonGrader
-  }
-
-  throw new Error(`Trials comparison grader must export 'grade', 'compare', or 'default' function`)
-}
-
-/**
- * Get grader function based on strategy.
- *
- * @param strategy - Comparison strategy
- * @param graderPath - Path to custom grader (for 'custom' strategy)
- * @returns Trials comparison grader function
- */
-const getTrialsGrader = async (
-  strategy: TrialsCompareStrategy,
-  graderPath?: string,
-): Promise<TrialsComparisonGrader> => {
-  switch (strategy) {
-    case 'weighted':
-      return weightedGrade
-    case 'statistical':
-      return statisticalGrade
-    case 'custom':
-      if (!graderPath) {
-        throw new Error('Custom strategy requires --grader path')
-      }
-      return loadTrialsComparisonGrader(graderPath)
-  }
-}
-
-/**
- * Compute capability metrics from trial results.
- *
- * @param results - Array of trial results
- * @returns Capability metrics (passAtK statistics)
- */
-const computeCapabilityMetrics = (results: TrialResult[]): TrialsCapabilityMetrics => {
-  const passAtKValues = results.map((r) => r.passAtK ?? 0)
-
-  if (passAtKValues.length === 0) {
-    return { avgPassAtK: 0, medianPassAtK: 0, p25PassAtK: 0, p75PassAtK: 0 }
-  }
-
-  const sorted = [...passAtKValues].sort((a, b) => a - b)
-  const sum = passAtKValues.reduce((a, b) => a + b, 0)
-
-  return {
-    avgPassAtK: sum / passAtKValues.length,
-    medianPassAtK: percentile(sorted, 0.5),
-    p25PassAtK: percentile(sorted, 0.25),
-    p75PassAtK: percentile(sorted, 0.75),
-  }
-}
-
-/**
- * Compute reliability metrics from trial results.
- *
- * @param results - Array of trial results
- * @returns Reliability metrics (passExpK statistics)
- */
-const computeReliabilityMetrics = (results: TrialResult[]): TrialsReliabilityMetrics => {
-  const passExpKValues = results.map((r) => r.passExpK ?? 0)
-
-  if (passExpKValues.length === 0) {
-    return { type: 'trial', avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
-  }
-
-  const sorted = [...passExpKValues].sort((a, b) => a - b)
-  const sum = passExpKValues.reduce((a, b) => a + b, 0)
-
-  return {
-    type: 'trial',
-    avgPassExpK: sum / passExpKValues.length,
-    medianPassExpK: percentile(sorted, 0.5),
-    p25PassExpK: percentile(sorted, 0.25),
-    p75PassExpK: percentile(sorted, 0.75),
-  }
-}
-
-/**
- * Compute flakiness metrics from trial results.
- *
- * @param results - Array of trial results
- * @param maxTopFlaky - Maximum number of top flaky prompts to include
- * @returns Flakiness metrics
- */
-const computeFlakinessMetrics = (results: TrialResult[], maxTopFlaky: number = 10): TrialsFlakinessMetrics => {
-  const flakinessData = results.map((r) => ({
-    id: r.id,
-    flakiness: Math.max(0, (r.passAtK ?? 0) - (r.passExpK ?? 0)),
-  }))
-
-  if (flakinessData.length === 0) {
-    return { avgFlakiness: 0, medianFlakiness: 0, flakyPromptCount: 0, topFlakyPrompts: [] }
-  }
-
-  const flakinessValues = flakinessData.map((d) => d.flakiness)
-  const sorted = [...flakinessValues].sort((a, b) => a - b)
-  const sum = flakinessValues.reduce((a, b) => a + b, 0)
-
-  // Sort by flakiness descending to get top flaky prompts
-  const topFlaky = [...flakinessData]
-    .filter((d) => d.flakiness > 0)
-    .sort((a, b) => b.flakiness - a.flakiness)
-    .slice(0, maxTopFlaky)
-
-  return {
-    avgFlakiness: sum / flakinessValues.length,
-    medianFlakiness: percentile(sorted, 0.5),
-    flakyPromptCount: flakinessData.filter((d) => d.flakiness > 0).length,
-    topFlakyPrompts: topFlaky,
-  }
-}
-
-/** Result from quality metrics computation, including raw scores for CI reuse */
-type QualityComputeResult = {
-  metrics: TrialsQualityMetrics
-  rawScores: number[]
-}
-
-/**
- * Compute quality metrics from trial results.
- *
- * @remarks
- * Flattens all trial scores across all prompts into a single distribution.
- * Returns undefined if no scores are present (no grader was used).
- * Returns raw scores alongside metrics to avoid re-traversal for CI computation.
- *
- * @param results - Array of trial results
- * @returns Quality metrics with raw scores, or undefined if no scores
- */
-const computeTrialsQualityMetrics = (results: TrialResult[]): QualityComputeResult | undefined => {
-  const rawScores = results.flatMap((r) => r.trials.filter((t) => t.score !== undefined).map((t) => t.score as number))
-
-  if (rawScores.length === 0) return undefined
-
-  const sorted = [...rawScores].sort((a, b) => a - b)
-  const sum = rawScores.reduce((a, b) => a + b, 0)
-
-  return {
-    metrics: {
-      type: 'trial',
-      avgScore: sum / rawScores.length,
-      medianScore: percentile(sorted, 0.5),
-      p25Score: percentile(sorted, 0.25),
-      p75Score: percentile(sorted, 0.75),
-    },
-    rawScores,
-  }
-}
-
-/** Result from performance metrics computation, including raw durations for CI reuse */
-type PerformanceComputeResult = {
-  metrics: TrialsPerformanceMetrics
-  rawDurations: number[]
-}
-
-/**
- * Compute performance metrics from trial results.
- *
- * @remarks
- * Flattens all trial durations across all prompts into latency statistics.
- * Always returns a value since TrialEntry.duration is required.
- * Returns raw durations alongside metrics to avoid re-traversal for CI computation.
- *
- * @param results - Array of trial results
- * @returns Performance metrics with raw durations
- */
-const computeTrialsPerformanceMetrics = (results: TrialResult[]): PerformanceComputeResult => {
-  const rawDurations = results.flatMap((r) => r.trials.map((t) => t.duration))
-
-  return {
-    metrics: {
-      latency: computeLatencyStats(rawDurations),
-      totalDuration: rawDurations.reduce((a, b) => a + b, 0),
-    },
-    rawDurations,
-  }
-}
-
-/**
- * Execute trials comparison and generate aggregate report.
- *
- * @param config - Trials compare configuration
- * @returns Trials comparison report
- */
-export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<TrialsComparisonReport> => {
-  const { runs, strategy = 'weighted', graderPath, outputPath, progress = false, format = 'json' } = config
-
-  if (runs.length < 2) {
-    throw new Error('At least 2 runs required for comparison')
-  }
-
-  // Get grader based on strategy
-  const grader = await getTrialsGrader(strategy, graderPath)
-
-  const strategyLabel = strategy === 'custom' ? `custom: ${graderPath}` : strategy
-  logProgress(`Comparing ${runs.length} trials runs with strategy: ${strategyLabel}`, progress)
-  for (const run of runs) {
-    logProgress(`  - ${run.label}: ${run.path}`, progress)
-  }
-
-  // Load all runs using indexed streaming
-  const runResults: Record<string, Map<string, TrialResult>> = {}
-  for (const run of runs) {
-    logProgress(`Loading ${run.label}...`, progress)
-    runResults[run.label] = await buildTrialsIndex(run.path)
-  }
-
-  // Build set of all prompt IDs across runs
-  const promptIds = new Set<string>()
-  for (const resultsMap of Object.values(runResults)) {
-    for (const id of resultsMap.keys()) {
-      promptIds.add(id)
-    }
-  }
-
-  logProgress(`Comparing ${promptIds.size} prompts...`, progress)
-
-  // Per-prompt comparison results
-  const promptComparisons: TrialsPromptComparison[] = []
-  const perPromptGraderResults: { id: string; result: ComparisonGraderResult }[] = []
-
-  // Track k value (should be consistent across all results)
-  let trialsPerPrompt = 0
-
-  for (const promptId of promptIds) {
-    logProgress(`  ${promptId}`, progress)
-
-    // Build comparison input
-    const runsData: TrialsComparisonGraderInput['runs'] = {}
-    let input: string | string[] = ''
-    let hint: string | undefined
-
-    for (const [label, resultsMap] of Object.entries(runResults)) {
-      const result = resultsMap.get(promptId)
-      if (result) {
-        const runData: TrialsComparisonRunData = {
-          passRate: result.passRate,
-          passAtK: result.passAtK,
-          passExpK: result.passExpK,
-          k: result.k,
-          trials: result.trials,
-        }
-        runsData[label] = runData
-
-        // Track k value
-        if (trialsPerPrompt === 0) {
-          trialsPerPrompt = result.k
-        }
-
-        // Use first found input/hint as the reference
-        if (!input) {
-          input = result.input
-          hint = result.hint
-        }
-      }
-    }
-
-    // Skip if not present in at least 2 runs
-    if (Object.keys(runsData).length < 2) {
-      logProgress(`    Skipped (only in ${Object.keys(runsData).length} run)`, progress)
-      continue
-    }
-
-    // Apply comparison grader
-    const graderInput: TrialsComparisonGraderInput = {
-      id: promptId,
-      input,
-      hint,
-      runs: runsData,
-    }
-
-    const graderResult = await grader(graderInput)
-    perPromptGraderResults.push({ id: promptId, result: graderResult })
-
-    // Build prompt comparison for head-to-head
-    const passAtK: Record<string, number> = {}
-    const passExpK: Record<string, number> = {}
-    const flakiness: Record<string, number> = {}
-
-    for (const [label, data] of Object.entries(runsData)) {
-      passAtK[label] = data.passAtK ?? 0
-      passExpK[label] = data.passExpK ?? 0
-      flakiness[label] = Math.max(0, (data.passAtK ?? 0) - (data.passExpK ?? 0))
-    }
-
-    // Determine winners
-    const labels = Object.keys(runsData)
-    let capabilityWinner: string | null = null
-    let reliabilityWinner: string | null = null
-
-    // Capability winner: highest passAtK
-    const sortedByCapability = [...labels].sort((a, b) => (passAtK[b] ?? 0) - (passAtK[a] ?? 0))
-    if (sortedByCapability.length >= 2) {
-      const first = sortedByCapability[0]
-      const second = sortedByCapability[1]
-      if (first && second && (passAtK[first] ?? 0) > (passAtK[second] ?? 0)) {
-        capabilityWinner = first
-      }
-    }
-
-    // Reliability winner: highest passExpK
-    const sortedByReliability = [...labels].sort((a, b) => (passExpK[b] ?? 0) - (passExpK[a] ?? 0))
-    if (sortedByReliability.length >= 2) {
-      const first = sortedByReliability[0]
-      const second = sortedByReliability[1]
-      if (first && second && (passExpK[first] ?? 0) > (passExpK[second] ?? 0)) {
-        reliabilityWinner = first
-      }
-    }
-
-    promptComparisons.push({
-      id: promptId,
-      capabilityWinner,
-      reliabilityWinner,
-      passAtK,
-      passExpK,
-      flakiness,
-    })
-
-    // Log winner
-    const winner = graderResult.rankings.find((r) => r.rank === 1)
-    if (winner) {
-      logProgress(`    Overall winner: ${winner.run} (${winner.score.toFixed(3)})`, progress)
-    }
-  }
-
-  // Compute aggregate metrics per run
-  const runLabels = runs.map((r) => r.label)
-
-  const capability: Record<string, TrialsCapabilityMetrics> = {}
-  const reliability: Record<string, TrialsReliabilityMetrics> = {}
-  const flakiness: Record<string, TrialsFlakinessMetrics> = {}
-  const quality: Record<string, TrialsQualityMetrics> = {}
-  const performance: Record<string, TrialsPerformanceMetrics> = {}
-  const rawScoresByRun: Record<string, number[]> = {}
-  const rawDurationsByRun: Record<string, number[]> = {}
-
-  let hasQuality = false
-
-  for (const label of runLabels) {
-    const resultsMap = runResults[label] ?? new Map()
-    const results = [...resultsMap.values()]
-
-    capability[label] = computeCapabilityMetrics(results)
-    reliability[label] = computeReliabilityMetrics(results)
-    flakiness[label] = computeFlakinessMetrics(results)
-
-    const perfResult = computeTrialsPerformanceMetrics(results)
-    performance[label] = perfResult.metrics
-    rawDurationsByRun[label] = perfResult.rawDurations
-
-    const qualityResult = computeTrialsQualityMetrics(results)
-    if (qualityResult) {
-      quality[label] = qualityResult.metrics
-      rawScoresByRun[label] = qualityResult.rawScores
-      hasQuality = true
-    }
-  }
-
-  // Compute confidence intervals when using statistical strategy
-  if (strategy === 'statistical') {
-    const bootstrapConfig = getBootstrapConfigFromEnv()
-
-    for (const label of runLabels) {
-      const resultsMap = runResults[label] ?? new Map()
-      const resultsArr = [...resultsMap.values()]
-      const passAtKValues = resultsArr.map((r) => r.passAtK ?? 0)
-      const passExpKValues = resultsArr.map((r) => r.passExpK ?? 0)
-
-      // Capability CIs
-      const capabilityMetrics = capability[label]
-      if (capabilityMetrics) {
-        capabilityMetrics.confidenceIntervals = {
-          avgPassAtK: bootstrap(passAtKValues, bootstrapConfig).ci,
-        }
-      }
-
-      // Reliability CIs
-      const reliabilityMetrics = reliability[label]
-      if (reliabilityMetrics) {
-        reliabilityMetrics.confidenceIntervals = {
-          avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
-        }
-      }
-
-      // Quality CIs (only when scores present)
-      const qualityMetrics = quality[label]
-      const scores = rawScoresByRun[label]
-      if (qualityMetrics && scores && scores.length > 0) {
-        qualityMetrics.confidenceIntervals = {
-          avgScore: bootstrap(scores, bootstrapConfig).ci,
-        }
-      }
-
-      // Performance CIs
-      const performanceMetrics = performance[label]
-      const durations = rawDurationsByRun[label]
-      if (performanceMetrics && durations && durations.length > 0) {
-        performanceMetrics.confidenceIntervals = {
-          latencyMean: bootstrap(durations, bootstrapConfig).ci,
-        }
-      }
-    }
-  }
-
-  // Compute pairwise comparisons
-  const capabilityPairwise: PairwiseComparison[] = []
-  const reliabilityPairwise: PairwiseComparison[] = []
-  const overallPairwise: PairwiseComparison[] = []
-
-  for (let i = 0; i < runLabels.length; i++) {
-    for (let j = i + 1; j < runLabels.length; j++) {
-      const runA = runLabels[i]
-      const runB = runLabels[j]
-
-      if (!runA || !runB) continue
-
-      // Capability pairwise
-      let capAWins = 0
-      let capBWins = 0
-      let capTies = 0
-
-      // Reliability pairwise
-      let relAWins = 0
-      let relBWins = 0
-      let relTies = 0
-
-      // Overall pairwise (from grader results)
-      let overallAWins = 0
-      let overallBWins = 0
-      let overallTies = 0
-
-      for (const pc of promptComparisons) {
-        // Capability
-        if (pc.capabilityWinner === runA) capAWins++
-        else if (pc.capabilityWinner === runB) capBWins++
-        else capTies++
-
-        // Reliability
-        if (pc.reliabilityWinner === runA) relAWins++
-        else if (pc.reliabilityWinner === runB) relBWins++
-        else relTies++
-      }
-
-      // Overall from grader results
-      for (const { result } of perPromptGraderResults) {
-        const winner = result.rankings.find((r) => r.rank === 1)
-        if (winner?.run === runA) overallAWins++
-        else if (winner?.run === runB) overallBWins++
-        else overallTies++
-      }
-
-      capabilityPairwise.push({ runA, runB, aWins: capAWins, bWins: capBWins, ties: capTies })
-      reliabilityPairwise.push({ runA, runB, aWins: relAWins, bWins: relBWins, ties: relTies })
-      overallPairwise.push({ runA, runB, aWins: overallAWins, bWins: overallBWins, ties: overallTies })
-    }
-  }
-
-  // Build meta
-  const meta: TrialsComparisonMeta = {
-    generatedAt: new Date().toISOString(),
-    runs: runLabels,
-    promptCount: promptIds.size,
-    trialsPerPrompt,
-    inputFormat: 'trials',
-  }
-
-  // Assemble report
-  const report: TrialsComparisonReport = {
-    meta,
-    capability,
-    reliability,
-    flakiness,
-    quality: hasQuality ? quality : undefined,
-    performance,
-    headToHead: {
-      capability: capabilityPairwise,
-      reliability: reliabilityPairwise,
-      overall: overallPairwise,
-    },
-    perPrompt: promptComparisons,
-  }
-
-  // Output
-  if (format === 'markdown') {
-    const markdown = formatTrialsReportAsMarkdown(report)
-    await writeOutput(markdown, outputPath, false)
-  } else {
-    await writeOutput(JSON.stringify(report, null, 2), outputPath, false)
-  }
-
-  // Summary statistics
-  logProgress('', progress)
-  logProgress('=== Summary ===', progress)
-
-  for (const [label, cap] of Object.entries(capability)) {
-    const rel = reliability[label]
-    const flak = flakiness[label]
-    const perf = performance[label]
-    const qual = quality[label]
-    const qualStr = qual ? ` avgScore=${qual.avgScore.toFixed(3)}` : ''
-    const perfStr = perf ? ` latencyP50=${perf.latency.p50.toFixed(0)}ms` : ''
-    logProgress(
-      `  ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}${qualStr}${perfStr}`,
-      progress,
-    )
-  }
-
-  logProgress('', progress)
-  logProgress('Overall wins:', progress)
-  for (const pw of overallPairwise) {
-    logProgress(`  ${pw.runA} vs ${pw.runB}: ${pw.aWins}-${pw.bWins}-${pw.ties}`, progress)
-  }
-
-  logProgress('Done!', progress)
-
-  return report
-}
-
-/**
- * Format trials comparison report as markdown.
- *
- * @param report - Trials comparison report
- * @returns Markdown string
- */
-const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string => {
-  const lines: string[] = []
-
-  lines.push('# Trials Comparison Report')
-  lines.push('')
-  lines.push(`Generated: ${report.meta.generatedAt}`)
-  lines.push(`Runs: ${report.meta.runs.join(', ')}`)
-  lines.push(`Prompts: ${report.meta.promptCount} | Trials per prompt: ${report.meta.trialsPerPrompt}`)
-  lines.push('')
-
-  // Check if any run has confidence intervals (statistical strategy was used)
-  const hasCIs = Object.values(report.capability).some((c) => c.confidenceIntervals)
-
-  // Capability table
-  lines.push('## Capability (passAtK)')
-  lines.push('')
-  if (hasCIs) {
-    lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
-    lines.push('|-----|-----|--------|--------|-----|-----|')
-    for (const [label, c] of Object.entries(report.capability)) {
-      const avgCI = formatCI(c.confidenceIntervals?.avgPassAtK)
-      lines.push(
-        `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${avgCI} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
-      )
-    }
-  } else {
-    lines.push('| Run | Avg | Median | P25 | P75 |')
-    lines.push('|-----|-----|--------|-----|-----|')
-    for (const [label, c] of Object.entries(report.capability)) {
-      lines.push(
-        `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
-      )
-    }
-  }
-  lines.push('')
-
-  // Reliability table
-  lines.push('## Reliability (passExpK)')
-  lines.push('')
-  if (hasCIs) {
-    lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
-    lines.push('|-----|-----|--------|--------|-----|-----|')
-    for (const [label, r] of Object.entries(report.reliability)) {
-      const avgCI = formatCI(r.confidenceIntervals?.avgPassExpK)
-      lines.push(
-        `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${avgCI} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
-      )
-    }
-  } else {
-    lines.push('| Run | Avg | Median | P25 | P75 |')
-    lines.push('|-----|-----|--------|-----|-----|')
-    for (const [label, r] of Object.entries(report.reliability)) {
-      lines.push(
-        `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
-      )
-    }
-  }
-  lines.push('')
-
-  // Flakiness table
-  lines.push('## Flakiness')
-  lines.push('')
-  lines.push('| Run | Avg | Median | Flaky Prompts |')
-  lines.push('|-----|-----|--------|---------------|')
-  for (const [label, f] of Object.entries(report.flakiness)) {
-    lines.push(`| ${label} | ${f.avgFlakiness.toFixed(3)} | ${f.medianFlakiness.toFixed(3)} | ${f.flakyPromptCount} |`)
-  }
-  lines.push('')
-
-  // Quality table (only when scores present)
-  if (report.quality && Object.keys(report.quality).length > 0) {
-    const hasQualityCIs = Object.values(report.quality).some((q) => q.confidenceIntervals)
-
-    lines.push('## Quality (Scores)')
-    lines.push('')
-    if (hasQualityCIs) {
-      lines.push('| Run | Avg Score | 95% CI | Median | P25 | P75 |')
-      lines.push('|-----|-----------|--------|--------|-----|-----|')
-      for (const [label, q] of Object.entries(report.quality)) {
-        const avgCI = formatCI(q.confidenceIntervals?.avgScore)
-        lines.push(
-          `| ${label} | ${q.avgScore.toFixed(3)} | ${avgCI} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`,
-        )
-      }
-    } else {
-      lines.push('| Run | Avg Score | Median | P25 | P75 |')
-      lines.push('|-----|-----------|--------|-----|-----|')
-      for (const [label, q] of Object.entries(report.quality)) {
-        lines.push(
-          `| ${label} | ${q.avgScore.toFixed(3)} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`,
-        )
-      }
-    }
-    lines.push('')
-  }
-
-  // Performance table (always present)
-  const hasPerfCIs = Object.values(report.performance).some((p) => p.confidenceIntervals)
-
-  lines.push('## Performance (Latency)')
-  lines.push('')
-  if (hasPerfCIs) {
-    lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI | Total (ms) |')
-    lines.push('|-----|----------|----------|----------|-----------|--------|------------|')
-    for (const [label, p] of Object.entries(report.performance)) {
-      const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0)
-      lines.push(
-        `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} | ${p.totalDuration.toFixed(0)} |`,
-      )
-    }
-  } else {
-    lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | Total (ms) |')
-    lines.push('|-----|----------|----------|----------|-----------|------------|')
-    for (const [label, p] of Object.entries(report.performance)) {
-      lines.push(
-        `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${p.totalDuration.toFixed(0)} |`,
-      )
-    }
-  }
-  lines.push('')
-
-  // Head-to-head
-  lines.push('## Head-to-Head')
-  lines.push('')
-  lines.push('### By Capability')
-  lines.push('| Matchup | A Wins | B Wins | Ties |')
-  lines.push('|---------|--------|--------|------|')
-  for (const p of report.headToHead.capability) {
-    lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
-  }
-  lines.push('')
-
-  lines.push('### By Reliability')
-  lines.push('| Matchup | A Wins | B Wins | Ties |')
-  lines.push('|---------|--------|--------|------|')
-  for (const p of report.headToHead.reliability) {
-    lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
-  }
-  lines.push('')
-
-  lines.push('### Overall (Weighted)')
-  lines.push('| Matchup | A Wins | B Wins | Ties |')
-  lines.push('|---------|--------|--------|------|')
-  for (const p of report.headToHead.overall) {
-    lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
-  }
-  lines.push('')
-
-  return lines.join('\n')
-}
diff --git a/src/pipeline/compare-utils.ts b/src/pipeline/compare-utils.ts
deleted file mode 100644
index 81bea33..0000000
--- a/src/pipeline/compare-utils.ts
+++ /dev/null
@@ -1,85 +0,0 @@
-/**
- * Shared utility functions for comparison modules.
- *
- * @remarks
- * Extracted from compare.ts and compare-trials.ts to avoid duplication.
- * Contains statistical helpers used by both CaptureResult and TrialResult comparisons.
- *
- * @packageDocumentation
- */
-
-import type { LatencyStats, ScoreDistribution } from '../schemas.ts'
-
-/**
- * Compute percentile from sorted array using nearest rank method.
- *
- * @remarks
- * Uses floor indexing (nearest rank method). For an array of length N,
- * returns the element at index `floor(N * p)`, clamped to the last element.
- * This does not interpolate between ranks.
- *
- * @param sorted - Sorted array of numbers
- * @param p - Percentile (0-1)
- * @returns Value at percentile
- *
- * @public
- */
-export const percentile = (sorted: number[], p: number): number => {
-  if (sorted.length === 0) return 0
-  const idx = Math.floor(sorted.length * p)
-  return sorted[Math.min(idx, sorted.length - 1)] ?? 0
-}
-
-/**
- * Compute latency statistics from array of durations.
- *
- * @param durations - Array of durations in milliseconds
- * @returns Latency statistics
- *
- * @public
- */
-export const computeLatencyStats = (durations: number[]): LatencyStats => {
-  if (durations.length === 0) {
-    return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
-  }
-
-  const sorted = [...durations].sort((a, b) => a - b)
-  const sum = sorted.reduce((a, b) => a + b, 0)
-
-  return {
-    p50: percentile(sorted, 0.5),
-    p90: percentile(sorted, 0.9),
-    p99: percentile(sorted, 0.99),
-    mean: sum / sorted.length,
-    min: sorted[0] ?? 0,
-    max: sorted[sorted.length - 1] ?? 0,
-  }
-}
-
-/**
- * Compute score distribution histogram.
- *
- * @param scores - Array of scores (0-1)
- * @returns Score distribution histogram
- *
- * @public
- */
-export const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
-  const dist: ScoreDistribution = {
-    '0.0-0.2': 0,
-    '0.2-0.4': 0,
-    '0.4-0.6': 0,
-    '0.6-0.8': 0,
-    '0.8-1.0': 0,
-  }
-
-  for (const score of scores) {
-    if (score < 0.2) dist['0.0-0.2']++
-    else if (score < 0.4) dist['0.2-0.4']++
-    else if (score < 0.6) dist['0.4-0.6']++
-    else if (score < 0.8) dist['0.6-0.8']++
-    else dist['0.8-1.0']++
-  }
-
-  return dist
-}
diff --git a/src/pipeline/compare.ts b/src/pipeline/compare.ts
deleted file mode 100644
index 0d8c1f2..0000000
--- a/src/pipeline/compare.ts
+++ /dev/null
@@ -1,818 +0,0 @@
-/**
- * Pipeline compare command - compare multiple runs of the same prompts.
- *
- * @remarks
- * Compares results from different configurations (agents, MCP servers, models)
- * using either built-in strategies or a user-provided comparison grader.
- *
- * Outputs a holistic ComparisonReport JSON (not JSONL) containing aggregate
- * statistics across quality, performance, reliability, and head-to-head metrics.
- *
- * Terminology: "runs" (not "agents") because comparisons can be:
- * - Same agent, different MCP servers
- * - Same agent, different skills enabled
- * - Same agent, different system prompts
- * - Same agent, different model versions
- * - Different agents entirely
- *
- * Built-in strategies:
- * - `weighted`: Configurable weights for quality, latency, reliability (default)
- * - `statistical`: Bootstrap sampling for confidence intervals
- *
- * @packageDocumentation
- */
-
-import { basename, extname } from 'node:path'
-import { parseArgs } from 'node:util'
-import { buildResultsIndex, logProgress, writeOutput } from '../core.ts'
-import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
-import { grade as statisticalGrade } from '../graders/compare-statistical.ts'
-import { grade as weightedGrade } from '../graders/compare-weighted.ts'
-import type {
-  CaptureResult,
-  ComparisonMeta,
-  ComparisonReport,
-  HeadToHead,
-  PairwiseComparison,
-  PerformanceMetrics,
-  PromptComparison,
-  QualityMetrics,
-  ReliabilityMetrics,
-  TrajectoryInfo,
-  TrajectoryRichness,
-} from '../schemas.ts'
-import { type CompareInputFormat, detectAndValidateFormat } from './compare-format-detection.ts'
-import { runTrialsCompare } from './compare-trials.ts'
-import { computeLatencyStats, computeScoreDistribution } from './compare-utils.ts'
-import type {
-  CompareConfig,
-  ComparisonGrader,
-  ComparisonGraderInput,
-  ComparisonResult,
-  LabeledRun,
-} from './pipeline.types.ts'
-
-/** Comparison strategy type */
-export type CompareStrategy = 'weighted' | 'statistical' | 'custom'
-
-/** Extended compare config with strategy support */
-export type ExtendedCompareConfig = Omit<CompareConfig, 'graderPath'> & {
-  /** Comparison strategy (default: weighted) */
-  strategy?: CompareStrategy
-  /** Path to custom grader (required if strategy is 'custom') */
-  graderPath?: string
-  /** Output format (default: json) */
-  format?: 'json' | 'markdown'
-}
-
-/**
- * Load comparison grader from file.
- *
- * @remarks
- * Similar to loadGrader but expects ComparisonGrader interface.
- *
- * @param path - Path to grader module
- * @returns Loaded comparison grader function
- */
-const loadComparisonGrader = async (path: string): Promise<ComparisonGrader> => {
-  const module = await import(path)
-
-  if (typeof module.grade === 'function') {
-    return module.grade as ComparisonGrader
-  }
-  if (typeof module.default === 'function') {
-    return module.default as ComparisonGrader
-  }
-  if (typeof module.compare === 'function') {
-    return module.compare as ComparisonGrader
-  }
-
-  throw new Error(`Comparison grader must export 'grade', 'compare', or 'default' function`)
-}
-
-/**
- * Derive label from file path.
- *
- * @param path - File path
- * @returns Label derived from filename without extension
- */
-const labelFromPath = (path: string): string => {
-  const base = basename(path)
-  const ext = extname(base)
-  return base.slice(0, -ext.length)
-}
-
-/**
- * Parse labeled run argument.
- *
- * @remarks
- * Supports formats:
- * - "path.jsonl" - label derived from filename
- * - "label:path.jsonl" - explicit label
- *
- * @param arg - Run argument string
- * @returns Labeled run object
- */
-const parseLabeledRun = (arg: string): LabeledRun => {
-  const colonIndex = arg.indexOf(':')
-
-  // Check if this looks like a label:path format (not a Windows drive letter)
-  if (colonIndex > 0 && colonIndex !== 1) {
-    return {
-      label: arg.slice(0, colonIndex),
-      path: arg.slice(colonIndex + 1),
-    }
-  }
-
-  return {
-    label: labelFromPath(arg),
-    path: arg,
-  }
-}
-
-/**
- * Validate that all run files exist.
- *
- * @param runs - Labeled runs to validate
- * @throws Error if any file doesn't exist
- */
-const validateRunFiles = async (runs: LabeledRun[]): Promise<void> => {
-  const missing: string[] = []
-
-  for (const run of runs) {
-    const exists = await Bun.file(run.path).exists()
-    if (!exists) {
-      missing.push(`${run.label}: ${run.path}`)
-    }
-  }
-
-  if (missing.length > 0) {
-    throw new Error(`Result file(s) not found:\n  ${missing.join('\n  ')}`)
-  }
-}
-
-/**
- * Infer output format from file extension.
- *
- * @param outputPath - Output file path
- * @param explicitFormat - Explicitly provided format (takes precedence)
- * @returns Inferred format
- */
-const inferFormat = (outputPath: string | undefined, explicitFormat: string | undefined): 'json' | 'markdown' => {
-  // Explicit format takes precedence
-  if (explicitFormat === 'json' || explicitFormat === 'markdown') {
-    return explicitFormat
-  }
-
-  // Infer from file extension
-  if (outputPath) {
-    const ext = extname(outputPath).toLowerCase()
-    if (ext === '.md' || ext === '.markdown') {
-      return 'markdown'
-    }
-  }
-
-  return 'json'
-}
-
-/**
- * Get grader function based on strategy.
- *
- * @param strategy - Comparison strategy
- * @param graderPath - Path to custom grader (for 'custom' strategy)
- * @returns Comparison grader function
- */
-const getGrader = async (strategy: CompareStrategy, graderPath?: string): Promise<ComparisonGrader> => {
-  switch (strategy) {
-    case 'weighted':
-      return weightedGrade
-    case 'statistical':
-      return statisticalGrade
-    case 'custom':
-      if (!graderPath) {
-        throw new Error('Custom strategy requires --grader path')
-      }
-      return loadComparisonGrader(graderPath)
-  }
-}
-
-/**
- * Detect trajectory richness from capture results.
- *
- * @param results - Array of capture results
- * @returns Most common trajectory richness level
- */
-const detectTrajectoryRichness = (results: CaptureResult[]): TrajectoryRichness => {
-  // Check metadata first
-  for (const r of results) {
-    const richness = r.metadata?.trajectoryRichness
-    if (richness === 'full' || richness === 'minimal' || richness === 'messages-only') {
-      return richness as TrajectoryRichness
-    }
-  }
-
-  // Infer from trajectory content
-  for (const r of results) {
-    const hasThought = r.trajectory.some((s) => s.type === 'thought')
-    const hasToolCall = r.trajectory.some((s) => s.type === 'tool_call')
-    if (hasThought || hasToolCall) return 'full'
-  }
-
-  // Check if we have any trajectory at all
-  const hasTrajectory = results.some((r) => r.trajectory.length > 0)
-  return hasTrajectory ? 'messages-only' : 'minimal'
-}
-
-/**
- * Execute pipeline compare and generate aggregate report.
- *
- * @param config - Extended compare configuration
- * @returns Comparison report
- */
-export const runCompare = async (config: ExtendedCompareConfig): Promise<ComparisonReport> => {
-  const { runs, strategy = 'weighted', graderPath, outputPath, progress = false, format = 'json' } = config
-
-  if (runs.length < 2) {
-    throw new Error('At least 2 runs required for comparison')
-  }
-
-  // Get grader based on strategy
-  const grader = await getGrader(strategy, graderPath)
-
-  const strategyLabel = strategy === 'custom' ? `custom: ${graderPath}` : strategy
-  logProgress(`Comparing ${runs.length} runs with strategy: ${strategyLabel}`, progress)
-  for (const run of runs) {
-    logProgress(`  - ${run.label}: ${run.path}`, progress)
-  }
-
-  // Load all runs using indexed streaming (memory-efficient for large files)
-  // Uses Map<id, result> instead of arrays for O(1) lookups
-  const runResults: Record<string, Map<string, CaptureResult>> = {}
-  for (const run of runs) {
-    logProgress(`Loading ${run.label}...`, progress)
-    runResults[run.label] = await buildResultsIndex(run.path)
-  }
-
-  // Build set of all prompt IDs across runs
-  const promptIds = new Set<string>()
-  for (const resultsMap of Object.values(runResults)) {
-    for (const id of resultsMap.keys()) {
-      promptIds.add(id)
-    }
-  }
-
-  logProgress(`Comparing ${promptIds.size} prompts...`, progress)
-
-  // Per-prompt comparison results
-  const perPromptResults: ComparisonResult[] = []
-  const promptComparisons: PromptComparison[] = []
-
-  for (const promptId of promptIds) {
-    logProgress(`  ${promptId}`, progress)
-
-    // Build comparison input
-    const runsData: ComparisonGraderInput['runs'] = {}
-    let input: string | string[] = ''
-    let hint: string | undefined
-    let metadata: Record<string, unknown> | undefined
-
-    for (const [label, resultsMap] of Object.entries(runResults)) {
-      const result = resultsMap.get(promptId)
-      if (result) {
-        runsData[label] = {
-          output: result.output,
-          trajectory: result.trajectory,
-          // Include additional fields for graders that need them
-          ...(result.score && { score: result.score }),
-          ...(result.timing && { duration: result.timing.total }),
-          ...(result.toolErrors !== undefined && { toolErrors: result.toolErrors }),
-        }
-        // Use first found input/hint/metadata as the reference
-        if (!input) {
-          input = result.input
-          hint = result.hint
-          metadata = result.metadata
-        }
-      }
-    }
-
-    // Skip if not present in at least 2 runs
-    if (Object.keys(runsData).length < 2) {
-      logProgress(`    Skipped (only in ${Object.keys(runsData).length} run)`, progress)
-      continue
-    }
-
-    // Apply comparison grader
-    const graderInput: ComparisonGraderInput = {
-      id: promptId,
-      input,
-      hint,
-      metadata,
-      runs: runsData,
-    }
-
-    const graderResult = await grader(graderInput)
-
-    const comparisonResult: ComparisonResult = {
-      id: promptId,
-      input,
-      hint,
-      rankings: graderResult.rankings,
-      reasoning: graderResult.reasoning,
-    }
-
-    perPromptResults.push(comparisonResult)
-
-    // Build prompt comparison for head-to-head
-    const winner = graderResult.rankings.find((r) => r.rank === 1)
-    const scores: Record<string, number> = {}
-    const latencies: Record<string, number> = {}
-    const hadErrors: Record<string, boolean> = {}
-
-    for (const ranking of graderResult.rankings) {
-      scores[ranking.run] = ranking.score
-    }
-
-    for (const [label, data] of Object.entries(runsData)) {
-      latencies[label] = data.duration ?? 0
-      hadErrors[label] = data.toolErrors ?? false
-    }
-
-    promptComparisons.push({
-      id: promptId,
-      winner: winner?.run ?? null,
-      scores,
-      latencies,
-      hadErrors,
-    })
-
-    // Log winner
-    if (winner) {
-      logProgress(`    Winner: ${winner.run} (${winner.score.toFixed(2)})`, progress)
-    }
-  }
-
-  // Compute aggregate metrics
-  const runLabels = runs.map((r) => r.label)
-
-  // Quality metrics (iterate over Map values)
-  const quality: Record<string, QualityMetrics> = {}
-  for (const label of runLabels) {
-    const resultsMap = runResults[label] ?? new Map()
-    const results = [...resultsMap.values()]
-    const scores = results.map((r) => r.score?.score ?? 0)
-    const passes = results.filter((r) => r.score?.pass === true).length
-    const fails = results.length - passes
-
-    quality[label] = {
-      type: 'run',
-      avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
-      passRate: results.length > 0 ? passes / results.length : 0,
-      passCount: passes,
-      failCount: fails,
-      scoreDistribution: computeScoreDistribution(scores),
-    }
-  }
-
-  // Performance metrics
-  const performance: Record<string, PerformanceMetrics> = {}
-  for (const label of runLabels) {
-    const resultsMap = runResults[label] ?? new Map()
-    const results = [...resultsMap.values()]
-    const durations = results.map((r) => r.timing?.total ?? 0)
-    const firstResponses = results.map((r) => r.timing?.firstResponse).filter((v): v is number => v !== undefined)
-
-    performance[label] = {
-      latency: computeLatencyStats(durations),
-      firstResponse: firstResponses.length > 0 ? computeLatencyStats(firstResponses) : undefined,
-      totalDuration: durations.reduce((a, b) => a + b, 0),
-    }
-  }
-
-  // Reliability metrics
-  const reliability: Record<string, ReliabilityMetrics> = {}
-  for (const label of runLabels) {
-    const resultsMap = runResults[label] ?? new Map()
-    const results = [...resultsMap.values()]
-    const toolErrorCount = results.filter((r) => r.toolErrors === true).length
-    const timeoutCount = results.filter((r) =>
-      r.errors?.some((e: string) => e.toLowerCase().includes('timeout')),
-    ).length
-    const completedCount = results.filter((r) => r.output && !r.errors?.length).length
-
-    reliability[label] = {
-      type: 'run',
-      toolErrors: toolErrorCount,
-      toolErrorRate: results.length > 0 ? toolErrorCount / results.length : 0,
-      timeouts: timeoutCount,
-      timeoutRate: results.length > 0 ? timeoutCount / results.length : 0,
-      completionRate: results.length > 0 ? completedCount / results.length : 1,
-    }
-  }
-
-  // Compute confidence intervals when using statistical strategy
-  if (strategy === 'statistical') {
-    const bootstrapConfig = getBootstrapConfigFromEnv()
-
-    for (const label of runLabels) {
-      const resultsMap = runResults[label] ?? new Map()
-      const results = [...resultsMap.values()]
-      const scores = results.map((r) => r.score?.score ?? 0)
-      const passes = results.map((r) => (r.score?.pass === true ? 1 : 0))
-      const latencies = results.map((r) => r.timing?.total ?? 0)
-
-      // Quality CIs
-      const qualityMetrics = quality[label]
-      if (qualityMetrics) {
-        qualityMetrics.confidenceIntervals = {
-          avgScore: bootstrap(scores, bootstrapConfig).ci,
-          passRate: bootstrap(passes, bootstrapConfig).ci,
-        }
-      }
-
-      // Performance CIs
-      const performanceMetrics = performance[label]
-      if (performanceMetrics) {
-        performanceMetrics.confidenceIntervals = {
-          latencyMean: bootstrap(latencies, bootstrapConfig).ci,
-        }
-      }
-    }
-  }
-
-  // Trajectory info
-  const trajectoryInfo: Record<string, TrajectoryInfo> = {}
-  for (const label of runLabels) {
-    const resultsMap = runResults[label] ?? new Map()
-    const results = [...resultsMap.values()]
-    const stepCounts = results.map((r) => r.trajectory?.length ?? 0)
-    const avgStepCount = stepCounts.length > 0 ? stepCounts.reduce((a, b) => a + b, 0) / stepCounts.length : 0
-
-    trajectoryInfo[label] = {
-      richness: detectTrajectoryRichness(results),
-      avgStepCount,
-    }
-  }
-
-  // Pairwise comparisons
-  const pairwise: PairwiseComparison[] = []
-  for (let i = 0; i < runLabels.length; i++) {
-    for (let j = i + 1; j < runLabels.length; j++) {
-      const runA = runLabels[i]
-      const runB = runLabels[j]
-
-      // Skip if labels are undefined (shouldn't happen but TypeScript requires check)
-      if (!runA || !runB) continue
-
-      let aWins = 0
-      let bWins = 0
-      let ties = 0
-
-      for (const pc of promptComparisons) {
-        if (pc.winner === runA) aWins++
-        else if (pc.winner === runB) bWins++
-        else ties++
-      }
-
-      pairwise.push({ runA, runB, aWins, bWins, ties })
-    }
-  }
-
-  // Head-to-head
-  const headToHead: HeadToHead = {
-    prompts: promptComparisons,
-    pairwise,
-  }
-
-  // Count prompts where all runs are present
-  const promptsWithAllRuns = promptComparisons.filter((pc) => Object.keys(pc.scores).length === runLabels.length).length
-
-  // Build meta
-  const meta: ComparisonMeta = {
-    generatedAt: new Date().toISOString(),
-    runs: runLabels,
-    promptCount: promptIds.size,
-    promptsWithAllRuns,
-  }
-
-  // Assemble report
-  const report: ComparisonReport = {
-    meta,
-    quality,
-    performance,
-    reliability,
-    trajectoryInfo,
-    headToHead,
-  }
-
-  // Output
-  if (format === 'markdown') {
-    const markdown = formatReportAsMarkdown(report)
-    await writeOutput(markdown, outputPath, false)
-  } else {
-    await writeOutput(JSON.stringify(report, null, 2), outputPath, false)
-  }
-
-  // Summary statistics
-  logProgress('', progress)
-  logProgress('=== Summary ===', progress)
-
-  const winCounts: Record<string, number> = {}
-  for (const label of runLabels) {
-    winCounts[label] = 0
-  }
-
-  for (const pc of promptComparisons) {
-    if (pc.winner && pc.winner in winCounts) {
-      const current = winCounts[pc.winner] ?? 0
-      winCounts[pc.winner] = current + 1
-    }
-  }
-
-  for (const [label, wins] of Object.entries(winCounts)) {
-    const pct = promptComparisons.length > 0 ? ((wins / promptComparisons.length) * 100).toFixed(1) : '0.0'
-    logProgress(`  ${label}: ${wins} wins (${pct}%)`, progress)
-  }
-
-  logProgress('Done!', progress)
-
-  return report
-}
-
-/**
- * Format comparison report as markdown.
- *
- * @param report - Comparison report
- * @returns Markdown string
- */
-const formatReportAsMarkdown = (report: ComparisonReport): string => {
-  const lines: string[] = []
-
-  lines.push('# Comparison Report')
-  lines.push('')
-  lines.push(`Generated: ${report.meta.generatedAt}`)
-  lines.push(`Runs: ${report.meta.runs.join(', ')}`)
-  lines.push(`Prompts: ${report.meta.promptCount} total, ${report.meta.promptsWithAllRuns} with all runs`)
-  lines.push('')
-
-  // Check if any run has confidence intervals (statistical strategy was used)
-  const hasCIs = Object.values(report.quality).some((q) => q.confidenceIntervals)
-
-  // Quality table
-  lines.push('## Quality')
-  lines.push('')
-  if (hasCIs) {
-    lines.push('| Run | Avg Score | 95% CI | Pass Rate | 95% CI | Pass | Fail |')
-    lines.push('|-----|-----------|--------|-----------|--------|------|------|')
-    for (const [label, q] of Object.entries(report.quality)) {
-      const avgScoreCI = formatCI(q.confidenceIntervals?.avgScore)
-      const passRateCI = formatCI(q.confidenceIntervals?.passRate)
-      lines.push(
-        `| ${label} | ${q.avgScore.toFixed(3)} | ${avgScoreCI} | ${(q.passRate * 100).toFixed(1)}% | ${passRateCI} | ${q.passCount} | ${q.failCount} |`,
-      )
-    }
-  } else {
-    lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |')
-    lines.push('|-----|-----------|-----------|------|------|')
-    for (const [label, q] of Object.entries(report.quality)) {
-      lines.push(
-        `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`,
-      )
-    }
-  }
-  lines.push('')
-
-  // Performance table
-  lines.push('## Performance')
-  lines.push('')
-  if (hasCIs) {
-    lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI |')
-    lines.push('|-----|----------|----------|----------|-----------|--------|')
-    for (const [label, p] of Object.entries(report.performance)) {
-      const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0)
-      lines.push(
-        `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} |`,
-      )
-    }
-  } else {
-    lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |')
-    lines.push('|-----|----------|----------|----------|-----------|')
-    for (const [label, p] of Object.entries(report.performance)) {
-      lines.push(
-        `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`,
-      )
-    }
-  }
-  lines.push('')
-
-  // Reliability table
-  lines.push('## Reliability')
-  lines.push('')
-  lines.push('| Run | Tool Errors | Error Rate | Completion Rate |')
-  lines.push('|-----|-------------|------------|-----------------|')
-  for (const [label, r] of Object.entries(report.reliability)) {
-    lines.push(
-      `| ${label} | ${r.toolErrors} | ${(r.toolErrorRate * 100).toFixed(1)}% | ${(r.completionRate * 100).toFixed(1)}% |`,
-    )
-  }
-  lines.push('')
-
-  // Pairwise wins
-  lines.push('## Head-to-Head')
-  lines.push('')
-  lines.push('| Matchup | Wins | Wins | Ties |')
-  lines.push('|---------|------|------|------|')
-  for (const p of report.headToHead.pairwise) {
-    lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
-  }
-  lines.push('')
-
-  return lines.join('\n')
-}
-
-/**
- * Pipeline compare command CLI handler.
- *
- * @param args - Command line arguments (after 'compare')
- */
-export const compare = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      run: { type: 'string', multiple: true },
-      grader: { type: 'string', short: 'g' },
-      strategy: { type: 'string', short: 's' },
-      output: { type: 'string', short: 'o' },
-      format: { type: 'string', short: 'f' },
-      'input-format': { type: 'string' },
-      progress: { type: 'boolean', default: false },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness compare [files...] [options]
-
-Compare multiple runs of the same prompts and generate aggregate report.
-Supports both CaptureResult (single-run) and TrialResult (multi-run reliability) formats.
-
-Arguments:
-  files...          Result files to compare (positional, unlimited)
-
-Options:
-  --run             Labeled run format: "label:path.jsonl" (alternative to positional)
-  -s, --strategy    Comparison strategy: weighted (default), statistical, or custom
-  -g, --grader      Path to custom grader (required if strategy=custom)
-  -o, --output      Output file (default: stdout)
-  -f, --format      Output format: json (default) or markdown
-  --input-format    Input format: auto (default), capture, or trials
-  --progress        Show progress to stderr
-  -h, --help        Show this help message
-
-Input Formats:
-  auto        Auto-detect from file content (default)
-  capture     CaptureResult format (trajectory/timing fields)
-  trials      TrialResult format (trials/k fields) for pass@k analysis
-
-Built-in Strategies:
-  For CaptureResult (capture format):
-    weighted      Configurable weights for quality, latency, reliability
-                  Env vars: COMPARE_QUALITY, COMPARE_LATENCY, COMPARE_RELIABILITY
-    statistical   Bootstrap sampling for confidence intervals
-                  Env var: COMPARE_BOOTSTRAP_ITERATIONS
-
-  For TrialResult (trials format):
-    weighted      Configurable weights for capability, reliability, consistency
-                  Env vars: COMPARE_CAPABILITY, COMPARE_RELIABILITY, COMPARE_CONSISTENCY
-    statistical   Bootstrap sampling for passAtK confidence intervals
-                  Env var: COMPARE_BOOTSTRAP_ITERATIONS
-
-Custom Grader:
-  Must export 'grade' or 'compare' function with signature:
-    CaptureResult: (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>
-    TrialResult:   (params: TrialsComparisonGraderInput) => Promise<ComparisonGraderResult>
-
-Examples:
-  # Default: auto-detect format, weighted strategy, JSON output
-  agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
-
-  # Explicit trials format for pass@k comparison
-  agent-eval-harness compare trials1.jsonl trials2.jsonl --input-format trials -o comparison.json
-
-  # Trials comparison with custom weights
-  COMPARE_CAPABILITY=0.5 COMPARE_RELIABILITY=0.3 COMPARE_CONSISTENCY=0.2 \\
-    agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
-
-  # Statistical significance strategy
-  agent-eval-harness compare run1.jsonl run2.jsonl --strategy statistical -o comparison.json
-
-  # Markdown report
-  agent-eval-harness compare run1.jsonl run2.jsonl --format markdown -o report.md
-
-  # Custom grader
-  agent-eval-harness compare run1.jsonl run2.jsonl \\
-    --strategy custom --grader ./my-llm-judge.ts -o comparison.json
-
-  # With explicit labels
-  agent-eval-harness compare \\
-    --run "with-bun-mcp:results-bun.jsonl" \\
-    --run "vanilla:results-vanilla.jsonl" \\
-    -o comparison.json
-`)
-    return
-  }
-
-  // Collect runs from positional args and --run flags
-  const runs: LabeledRun[] = []
-
-  // Positional arguments (file paths)
-  for (const arg of positionals) {
-    runs.push(parseLabeledRun(arg))
-  }
-
-  // --run flags
-  if (values.run) {
-    for (const arg of values.run) {
-      runs.push(parseLabeledRun(arg))
-    }
-  }
-
-  if (runs.length < 2) {
-    console.error('Error: At least 2 result files required for comparison')
-    console.error('Example: agent-eval-harness compare run1.jsonl run2.jsonl')
-    process.exit(1)
-  }
-
-  // Validate that all run files exist (early error for better UX)
-  try {
-    await validateRunFiles(runs)
-  } catch (error) {
-    console.error(`Error: ${error instanceof Error ? error.message : error}`)
-    process.exit(1)
-  }
-
-  // Validate strategy
-  const strategy = (values.strategy as CompareStrategy) ?? 'weighted'
-  if (!['weighted', 'statistical', 'custom'].includes(strategy)) {
-    console.error(`Error: Invalid strategy '${strategy}'. Use: weighted, statistical, or custom`)
-    process.exit(1)
-  }
-
-  if (strategy === 'custom' && !values.grader) {
-    console.error('Error: --grader is required when using --strategy custom')
-    process.exit(1)
-  }
-
-  // Validate output format (explicit format takes precedence, otherwise infer from extension)
-  const format = inferFormat(values.output, values.format)
-  if (values.format && !['json', 'markdown'].includes(values.format)) {
-    console.error(`Error: Invalid format '${values.format}'. Use: json or markdown`)
-    process.exit(1)
-  }
-
-  // Validate input format
-  const inputFormatArg = values['input-format']
-  if (inputFormatArg && !['auto', 'capture', 'trials'].includes(inputFormatArg)) {
-    console.error(`Error: Invalid input-format '${inputFormatArg}'. Use: auto, capture, or trials`)
-    process.exit(1)
-  }
-
-  // Detect or use specified input format
-  let inputFormat: CompareInputFormat
-  try {
-    if (inputFormatArg === 'capture') {
-      inputFormat = 'capture'
-    } else if (inputFormatArg === 'trials') {
-      inputFormat = 'trials'
-    } else {
-      // Auto-detect from file content
-      inputFormat = await detectAndValidateFormat(runs.map((r) => r.path))
-    }
-  } catch (error) {
-    console.error(`Error: ${error instanceof Error ? error.message : error}`)
-    process.exit(1)
-  }
-
-  // Route to appropriate comparison function based on input format
-  if (inputFormat === 'trials') {
-    await runTrialsCompare({
-      runs,
-      strategy,
-      graderPath: values.grader,
-      outputPath: values.output,
-      progress: values.progress,
-      format,
-    })
-  } else {
-    await runCompare({
-      runs,
-      strategy,
-      graderPath: values.grader,
-      outputPath: values.output,
-      progress: values.progress,
-      format,
-    })
-  }
-}
diff --git a/src/pipeline/extract.ts b/src/pipeline/extract.ts
deleted file mode 100644
index 69f7464..0000000
--- a/src/pipeline/extract.ts
+++ /dev/null
@@ -1,241 +0,0 @@
-/**
- * Pipeline extract command - parse raw output into trajectories.
- *
- * @remarks
- * Converts RawOutput from `run` command into ExtractedResult with
- * parsed trajectory and final output. Uses the same schema-driven
- * parsing as the capture command.
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import { loadJsonl, logProgress, writeOutput } from '../core.ts'
-import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
-import { createOutputParser } from '../headless/headless-output-parser.ts'
-import type { TrajectoryStep } from '../schemas.ts'
-import type { ExtractedResult, RawOutput } from './pipeline.types.ts'
-
-/**
- * Extract trajectory from raw output using schema parser.
- *
- * @param rawOutput - Raw output from run command
- * @param parser - Output parser created from schema
- * @returns Extracted result with trajectory
- */
-const extractFromRaw = (rawOutput: RawOutput, parser: ReturnType<typeof createOutputParser>): ExtractedResult => {
-  const trajectory: TrajectoryStep[] = []
-  let finalOutput = ''
-  let toolErrors = false
-
-  // Parse each raw line
-  for (const line of rawOutput.rawLines) {
-    // Try to parse as trajectory update
-    const parsed = parser.parseLine(line)
-    if (parsed) {
-      const updates = Array.isArray(parsed) ? parsed : [parsed]
-      for (const update of updates) {
-        const timestamp = Date.now() - rawOutput.timing.start
-
-        if (update.type === 'thought') {
-          trajectory.push({
-            type: 'thought',
-            content: update.content ?? '',
-            timestamp,
-          })
-        } else if (update.type === 'message') {
-          trajectory.push({
-            type: 'message',
-            content: update.content ?? '',
-            timestamp,
-          })
-        } else if (update.type === 'tool_call') {
-          trajectory.push({
-            type: 'tool_call',
-            name: update.title ?? 'unknown',
-            status: update.status ?? 'pending',
-            timestamp,
-          })
-          if (update.status === 'failed') {
-            toolErrors = true
-          }
-        } else if (update.type === 'plan') {
-          trajectory.push({
-            type: 'plan',
-            entries: [],
-            timestamp,
-          })
-        }
-      }
-    }
-
-    // Try to parse as result
-    const result = parser.parseResult(line)
-    if (result.isResult) {
-      finalOutput = result.content
-    }
-  }
-
-  // If no explicit result, extract from messages
-  if (!finalOutput) {
-    finalOutput = trajectory
-      .filter((step): step is TrajectoryStep & { type: 'message' } => step.type === 'message')
-      .map((step) => step.content)
-      .join('\n')
-  }
-
-  return {
-    id: rawOutput.id,
-    input: rawOutput.input,
-    hint: rawOutput.hint,
-    output: finalOutput,
-    trajectory,
-    toolErrors: toolErrors || !!rawOutput.error,
-    metadata: rawOutput.metadata,
-    timing: rawOutput.timing,
-    ...(rawOutput.error && { error: rawOutput.error }),
-  }
-}
-
-/**
- * Execute pipeline extract with configuration.
- *
- * @param schemaPath - Path to headless adapter schema
- * @param rawOutputs - Raw outputs from run command
- * @param outputPath - Optional output file path
- * @param progress - Show progress to stderr
- */
-export const runExtract = async (
-  schemaPath: string,
-  rawOutputs: RawOutput[],
-  outputPath?: string,
-  progress = false,
-): Promise<void> => {
-  // Load and validate schema
-  const schemaFile = Bun.file(schemaPath)
-  if (!(await schemaFile.exists())) {
-    throw new Error(`Schema file not found: ${schemaPath}`)
-  }
-
-  const rawSchema = await schemaFile.json()
-  const schema = parseHeadlessConfig(rawSchema)
-  const parser = createOutputParser(schema)
-
-  logProgress(`Extracting with schema: ${schema.name}`, progress)
-
-  let isFirstOutput = true
-
-  // Clear output file if specified
-  if (outputPath) {
-    await Bun.write(outputPath, '')
-  }
-
-  for (let i = 0; i < rawOutputs.length; i++) {
-    const rawOutput = rawOutputs[i]
-    if (!rawOutput) continue
-
-    logProgress(`[${i + 1}/${rawOutputs.length}] ${rawOutput.id}`, progress)
-
-    const extracted = extractFromRaw(rawOutput, parser)
-
-    await writeOutput(JSON.stringify(extracted), outputPath, !isFirstOutput)
-    isFirstOutput = false
-  }
-
-  logProgress('Done!', progress)
-}
-
-/**
- * Read raw outputs from stdin.
- *
- * @returns Array of parsed raw outputs or null if stdin is empty
- */
-const readStdinRawOutputs = async (): Promise<RawOutput[] | null> => {
-  if (process.stdin.isTTY) {
-    return null
-  }
-
-  const chunks: Buffer[] = []
-  for await (const chunk of process.stdin) {
-    chunks.push(chunk)
-  }
-
-  const content = Buffer.concat(chunks).toString('utf-8').trim()
-  if (!content) return null
-
-  return content
-    .split('\n')
-    .filter(Boolean)
-    .map((line) => JSON.parse(line) as RawOutput)
-}
-
-/**
- * Pipeline extract command CLI handler.
- *
- * @param args - Command line arguments (after 'extract')
- */
-export const extract = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      schema: { type: 'string', short: 's' },
-      output: { type: 'string', short: 'o' },
-      progress: { type: 'boolean', default: false },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness extract [raw.jsonl] --schema <schema.json> [options]
-
-Parse raw output into trajectories and final output.
-
-Arguments:
-  raw.jsonl         Input file from 'run' command (or pipe from stdin)
-
-Options:
-  -s, --schema      Path to headless adapter schema (required)
-  -o, --output      Output file (default: stdout)
-  --progress        Show progress to stderr
-  -h, --help        Show this help message
-
-Examples:
-  # From file
-  agent-eval-harness extract raw.jsonl --schema claude.json -o extracted.jsonl
-
-  # Piped from run
-  agent-eval-harness run prompts.jsonl -s claude.json | agent-eval-harness extract -s claude.json
-
-  # Full pipeline
-  cat prompts.jsonl | \\
-    agent-eval-harness run -s claude.json | \\
-    agent-eval-harness extract -s claude.json | \\
-    agent-eval-harness grade --grader ./grader.ts
-`)
-    return
-  }
-
-  if (!values.schema) {
-    console.error('Error: --schema is required')
-    process.exit(1)
-  }
-
-  // Load raw outputs from file or stdin
-  const inputPath = positionals[0]
-  let rawOutputs: RawOutput[]
-
-  if (inputPath) {
-    rawOutputs = await loadJsonl<RawOutput>(inputPath)
-  } else {
-    const stdinOutputs = await readStdinRawOutputs()
-    if (!stdinOutputs || stdinOutputs.length === 0) {
-      console.error('Error: No raw output provided (use file argument or pipe to stdin)')
-      process.exit(1)
-    }
-    rawOutputs = stdinOutputs
-  }
-
-  await runExtract(values.schema, rawOutputs, values.output, values.progress)
-}
diff --git a/src/pipeline/format.ts b/src/pipeline/format.ts
deleted file mode 100644
index 54cdb6c..0000000
--- a/src/pipeline/format.ts
+++ /dev/null
@@ -1,291 +0,0 @@
-/**
- * Pipeline format command - convert results to different output formats.
- *
- * @remarks
- * Transforms graded or extracted results into various formats:
- * - jsonl: Pass-through JSONL (default)
- * - markdown: Human-readable report
- * - csv: Comma-separated values for spreadsheets
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import { loadJsonl, logProgress, writeOutput } from '../core.ts'
-import type { CaptureResult } from '../schemas.ts'
-import type { ExtractedResult, FormatStyle, GradedResult } from './pipeline.types.ts'
-
-/** Union of all formattable result types */
-type FormattableResult = ExtractedResult | GradedResult | CaptureResult
-
-/**
- * Check if result has a score (graded).
- */
-const isGraded = (
-  result: FormattableResult,
-): result is GradedResult | (CaptureResult & { score: NonNullable<CaptureResult['score']> }) => {
-  return 'score' in result && result.score !== undefined
-}
-
-/**
- * Format results as markdown report.
- *
- * @param results - Results to format
- * @returns Markdown string
- */
-const formatMarkdown = (results: FormattableResult[]): string => {
-  const lines: string[] = [
-    '# Evaluation Results',
-    '',
-    `Generated: ${new Date().toISOString()}`,
-    `Total: ${results.length} test cases`,
-    '',
-  ]
-
-  // Summary statistics if graded
-  const gradedResults = results.filter(isGraded)
-  if (gradedResults.length > 0) {
-    const passed = gradedResults.filter((r) => r.score.pass).length
-    const avgScore = gradedResults.reduce((sum, r) => sum + r.score.score, 0) / gradedResults.length
-
-    lines.push('## Summary')
-    lines.push('')
-    lines.push(
-      `- **Pass rate**: ${passed}/${gradedResults.length} (${((passed / gradedResults.length) * 100).toFixed(1)}%)`,
-    )
-    lines.push(`- **Average score**: ${avgScore.toFixed(3)}`)
-    lines.push('')
-  }
-
-  lines.push('## Results')
-  lines.push('')
-
-  for (const result of results) {
-    const input = Array.isArray(result.input) ? result.input.join(' → ') : result.input
-    const inputPreview = input.length > 100 ? `${input.slice(0, 100)}...` : input
-
-    lines.push(`### ${result.id}`)
-    lines.push('')
-    lines.push(`**Input**: ${inputPreview}`)
-    lines.push('')
-
-    if (result.hint) {
-      lines.push(`**Hint**: ${result.hint}`)
-      lines.push('')
-    }
-
-    const outputPreview = result.output.length > 500 ? `${result.output.slice(0, 500)}...` : result.output
-    lines.push(`**Output**:`)
-    lines.push('```')
-    lines.push(outputPreview)
-    lines.push('```')
-    lines.push('')
-
-    if (isGraded(result)) {
-      const icon = result.score.pass ? '✅' : '❌'
-      lines.push(`**Score**: ${icon} ${result.score.score.toFixed(3)} (${result.score.pass ? 'PASS' : 'FAIL'})`)
-      if (result.score.reasoning) {
-        lines.push(`**Reasoning**: ${result.score.reasoning}`)
-      }
-      lines.push('')
-    }
-
-    if (result.toolErrors) {
-      lines.push('⚠️ **Tool errors detected**')
-      lines.push('')
-    }
-
-    if ('error' in result && result.error) {
-      lines.push(`❌ **Error**: ${result.error}`)
-      lines.push('')
-    }
-
-    lines.push('---')
-    lines.push('')
-  }
-
-  return lines.join('\n')
-}
-
-/**
- * Format results as CSV.
- *
- * @param results - Results to format
- * @returns CSV string
- */
-const formatCsv = (results: FormattableResult[]): string => {
-  const lines: string[] = []
-
-  // Header
-  const hasScores = results.some(isGraded)
-  const headers = ['id', 'input', 'hint', 'output', 'tool_errors', 'duration_ms']
-  if (hasScores) {
-    headers.push('pass', 'score', 'reasoning')
-  }
-  lines.push(headers.join(','))
-
-  // Data rows
-  for (const result of results) {
-    const input = Array.isArray(result.input) ? result.input.join(' | ') : result.input
-    const escapeCsv = (str: string) => `"${str.replace(/"/g, '""').replace(/\n/g, '\\n')}"`
-
-    const row = [
-      escapeCsv(result.id),
-      escapeCsv(input),
-      escapeCsv(result.hint ?? ''),
-      escapeCsv(result.output),
-      result.toolErrors ? 'true' : 'false',
-      String(result.timing.total),
-    ]
-
-    if (hasScores) {
-      if (isGraded(result)) {
-        row.push(
-          result.score.pass ? 'true' : 'false',
-          result.score.score.toFixed(3),
-          escapeCsv(result.score.reasoning ?? ''),
-        )
-      } else {
-        row.push('', '', '')
-      }
-    }
-
-    lines.push(row.join(','))
-  }
-
-  return lines.join('\n')
-}
-
-/**
- * Execute pipeline format with configuration.
- *
- * @param style - Output format style
- * @param results - Results to format
- * @param outputPath - Optional output file path
- * @param progress - Show progress to stderr
- */
-export const runFormat = async (
-  style: FormatStyle,
-  results: FormattableResult[],
-  outputPath?: string,
-  progress = false,
-): Promise<void> => {
-  logProgress(`Formatting ${results.length} results as ${style}`, progress)
-
-  let output: string
-
-  switch (style) {
-    case 'jsonl':
-      // Pass-through as JSONL
-      output = results.map((r) => JSON.stringify(r)).join('\n')
-      break
-
-    case 'markdown':
-      output = formatMarkdown(results)
-      break
-
-    case 'csv':
-      output = formatCsv(results)
-      break
-  }
-
-  await writeOutput(output, outputPath, false)
-  logProgress('Done!', progress)
-}
-
-/**
- * Read results from stdin.
- *
- * @returns Array of parsed results or null if stdin is empty
- */
-const readStdinResults = async (): Promise<FormattableResult[] | null> => {
-  if (process.stdin.isTTY) {
-    return null
-  }
-
-  const chunks: Buffer[] = []
-  for await (const chunk of process.stdin) {
-    chunks.push(chunk)
-  }
-
-  const content = Buffer.concat(chunks).toString('utf-8').trim()
-  if (!content) return null
-
-  return content
-    .split('\n')
-    .filter(Boolean)
-    .map((line) => JSON.parse(line) as FormattableResult)
-}
-
-/**
- * Pipeline format command CLI handler.
- *
- * @param args - Command line arguments (after 'format')
- */
-export const format = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      style: { type: 'string', short: 'f', default: 'jsonl' },
-      output: { type: 'string', short: 'o' },
-      progress: { type: 'boolean', default: false },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness format [results.jsonl] [options]
-
-Convert results to different output formats.
-
-Arguments:
-  results.jsonl     Input file (or pipe from stdin)
-
-Options:
-  -f, --style       Output format: jsonl, markdown, csv (default: jsonl)
-  -o, --output      Output file (default: stdout)
-  --progress        Show progress to stderr
-  -h, --help        Show this help message
-
-Examples:
-  # Convert to markdown report
-  agent-eval-harness format graded.jsonl --style markdown -o report.md
-
-  # Piped from grade
-  agent-eval-harness grade extracted.jsonl -g ./grader.ts | agent-eval-harness format -f csv
-
-  # Full pipeline to markdown
-  cat prompts.jsonl | \\
-    agent-eval-harness run -s claude.json | \\
-    agent-eval-harness extract -s claude.json | \\
-    agent-eval-harness grade -g ./grader.ts | \\
-    agent-eval-harness format -f markdown > report.md
-`)
-    return
-  }
-
-  const style = values.style as FormatStyle
-  if (!['jsonl', 'markdown', 'csv'].includes(style)) {
-    console.error(`Error: Invalid format style '${style}'. Must be: jsonl, markdown, csv`)
-    process.exit(1)
-  }
-
-  // Load results from file or stdin
-  const inputPath = positionals[0]
-  let results: FormattableResult[]
-
-  if (inputPath) {
-    results = await loadJsonl<FormattableResult>(inputPath)
-  } else {
-    const stdinResults = await readStdinResults()
-    if (!stdinResults || stdinResults.length === 0) {
-      console.error('Error: No results provided (use file argument or pipe to stdin)')
-      process.exit(1)
-    }
-    results = stdinResults
-  }
-
-  await runFormat(style, results, values.output, values.progress)
-}
diff --git a/src/pipeline/grade.ts b/src/pipeline/grade.ts
deleted file mode 100644
index 16ad736..0000000
--- a/src/pipeline/grade.ts
+++ /dev/null
@@ -1,175 +0,0 @@
-/**
- * Pipeline grade command - apply grader to extracted results.
- *
- * @remarks
- * Takes ExtractedResult from `extract` command and adds grader scores.
- * Uses the same grader loading mechanism as the capture command.
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import { loadJsonl, logProgress, writeOutput } from '../core.ts'
-import { loadGrader } from '../schemas/grader-loader.ts'
-import type { ExtractedResult, GradedResult } from './pipeline.types.ts'
-
-/**
- * Execute pipeline grade with configuration.
- *
- * @param graderPath - Path to grader module or executable
- * @param extractedResults - Extracted results from extract command
- * @param outputPath - Optional output file path
- * @param progress - Show progress to stderr
- */
-export const runGrade = async (
-  graderPath: string,
-  extractedResults: ExtractedResult[],
-  outputPath?: string,
-  progress = false,
-): Promise<void> => {
-  // Load grader
-  const grader = await loadGrader(graderPath)
-
-  logProgress(`Grading with: ${graderPath}`, progress)
-
-  let isFirstOutput = true
-
-  // Clear output file if specified
-  if (outputPath) {
-    await Bun.write(outputPath, '')
-  }
-
-  for (let i = 0; i < extractedResults.length; i++) {
-    const extracted = extractedResults[i]
-    if (!extracted) continue
-
-    logProgress(`[${i + 1}/${extractedResults.length}] ${extracted.id}`, progress)
-
-    // Apply grader
-    const score = await grader({
-      input: extracted.input,
-      output: extracted.output,
-      hint: extracted.hint,
-      trajectory: extracted.trajectory,
-      metadata: extracted.metadata,
-      cwd: extracted.cwd,
-    })
-
-    const graded: GradedResult = {
-      ...extracted,
-      score,
-    }
-
-    // Merge outcome from grader if present
-    if (score.outcome) {
-      graded.outcome = score.outcome
-    }
-
-    const icon = score.pass ? '✓' : '✗'
-    logProgress(`  ${icon} score=${score.score.toFixed(2)}`, progress)
-
-    await writeOutput(JSON.stringify(graded), outputPath, !isFirstOutput)
-    isFirstOutput = false
-  }
-
-  logProgress('Done!', progress)
-}
-
-/**
- * Read extracted results from stdin.
- *
- * @returns Array of parsed extracted results or null if stdin is empty
- */
-const readStdinExtracted = async (): Promise<ExtractedResult[] | null> => {
-  if (process.stdin.isTTY) {
-    return null
-  }
-
-  const chunks: Buffer[] = []
-  for await (const chunk of process.stdin) {
-    chunks.push(chunk)
-  }
-
-  const content = Buffer.concat(chunks).toString('utf-8').trim()
-  if (!content) return null
-
-  return content
-    .split('\n')
-    .filter(Boolean)
-    .map((line) => JSON.parse(line) as ExtractedResult)
-}
-
-/**
- * Pipeline grade command CLI handler.
- *
- * @param args - Command line arguments (after 'grade')
- */
-export const grade = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      grader: { type: 'string', short: 'g' },
-      output: { type: 'string', short: 'o' },
-      progress: { type: 'boolean', default: false },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness grade [extracted.jsonl] --grader <grader> [options]
-
-Apply grader to extracted results.
-
-Arguments:
-  extracted.jsonl   Input file from 'extract' command (or pipe from stdin)
-
-Options:
-  -g, --grader      Path to grader (.ts/.js module or executable script) (required)
-  -o, --output      Output file (default: stdout)
-  --progress        Show progress to stderr
-  -h, --help        Show this help message
-
-Graders:
-  TS/JS modules must export a 'grade' function.
-  Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
-
-Examples:
-  # From file
-  agent-eval-harness grade extracted.jsonl --grader ./grader.ts -o graded.jsonl
-
-  # Piped from extract
-  agent-eval-harness extract raw.jsonl -s claude.json | agent-eval-harness grade -g ./grader.ts
-
-  # Full pipeline
-  cat prompts.jsonl | \\
-    agent-eval-harness run -s claude.json | \\
-    agent-eval-harness extract -s claude.json | \\
-    agent-eval-harness grade -g ./grader.ts > results.jsonl
-`)
-    return
-  }
-
-  if (!values.grader) {
-    console.error('Error: --grader is required')
-    process.exit(1)
-  }
-
-  // Load extracted results from file or stdin
-  const inputPath = positionals[0]
-  let extractedResults: ExtractedResult[]
-
-  if (inputPath) {
-    extractedResults = await loadJsonl<ExtractedResult>(inputPath)
-  } else {
-    const stdinResults = await readStdinExtracted()
-    if (!stdinResults || stdinResults.length === 0) {
-      console.error('Error: No extracted results provided (use file argument or pipe to stdin)')
-      process.exit(1)
-    }
-    extractedResults = stdinResults
-  }
-
-  await runGrade(values.grader, extractedResults, values.output, values.progress)
-}
diff --git a/src/pipeline/pipeline.ts b/src/pipeline/pipeline.ts
deleted file mode 100644
index 6d607a8..0000000
--- a/src/pipeline/pipeline.ts
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Pipeline commands for Unix-style composable evaluation.
- *
- * @remarks
- * Re-exports pipeline commands and types.
- *
- * Commands:
- * - run: Execute prompts and output raw results
- * - extract: Parse raw output into trajectories
- * - grade: Apply grader to extracted results
- * - format: Convert results to different output formats
- * - compare: Compare multiple runs of the same prompts
- *
- * @packageDocumentation
- */
-
-// Commands
-export { type CompareStrategy, compare, type ExtendedCompareConfig, runCompare } from './compare.ts'
-export { extract } from './extract.ts'
-export { format } from './format.ts'
-export { grade } from './grade.ts'
-// Types
-export type {
-  CompareConfig,
-  ComparisonGrader,
-  ComparisonGraderInput,
-  ComparisonGraderResult,
-  ComparisonRanking,
-  ComparisonResult,
-  ComparisonRunData,
-  ExtractConfig,
-  ExtractedResult,
-  FormatConfig,
-  FormatStyle,
-  GradeConfig,
-  GradedResult,
-  LabeledRun,
-  RawOutput,
-  RunConfig,
-  RunMode,
-} from './pipeline.types.ts'
-export { run } from './run.ts'
diff --git a/src/pipeline/pipeline.types.ts b/src/pipeline/pipeline.types.ts
deleted file mode 100644
index d56a824..0000000
--- a/src/pipeline/pipeline.types.ts
+++ /dev/null
@@ -1,325 +0,0 @@
-/**
- * Type definitions for pipeline commands.
- *
- * @remarks
- * These types define the data flow between pipeline stages:
- * run → extract → grade → format
- *
- * Each stage transforms the data, enabling Unix-style piping.
- *
- * @packageDocumentation
- */
-
-import type { GraderResult, TrajectoryStep, TrialEntry } from '../schemas.ts'
-
-/**
- * Raw output from the `run` command.
- *
- * @remarks
- * Captures the raw agent output before trajectory extraction.
- * Used when piping `run` output to `extract`.
- */
-export type RawOutput = {
-  /** Test case identifier */
-  id: string
-  /** Original prompt input (string for single turn, array for multi-turn) */
-  input: string | string[]
-  /** Grader context hint */
-  hint?: string
-  /** Optional metadata from original prompt */
-  metadata?: Record<string, unknown>
-  /** Raw output lines from the agent (JSON strings) */
-  rawLines: string[]
-  /** Timing metadata */
-  timing: {
-    start: number
-    end: number
-    total: number
-  }
-  /** Error message if execution failed */
-  error?: string
-}
-
-/**
- * Extracted result from the `extract` command.
- *
- * @remarks
- * Converts raw output lines into structured trajectory and output.
- * Ready for grading or formatting.
- */
-export type ExtractedResult = {
-  /** Test case identifier */
-  id: string
-  /** Original prompt input */
-  input: string | string[]
-  /** Grader context hint */
-  hint?: string
-  /** Final agent output (extracted from trajectory) */
-  output: string
-  /** Parsed trajectory steps */
-  trajectory: TrajectoryStep[]
-  /** Whether tool errors were detected */
-  toolErrors: boolean
-  /** Optional metadata from original prompt */
-  metadata?: Record<string, unknown>
-  /** Working directory path (optional, for git-based grading) */
-  cwd?: string
-  /** Timing metadata */
-  timing: {
-    start: number
-    end: number
-    total: number
-  }
-  /** Error message if extraction failed */
-  error?: string
-}
-
-/**
- * Graded result from the `grade` command.
- *
- * @remarks
- * Adds grader score to extracted result.
- * Outcome field is merged from grader result if present.
- */
-export type GradedResult = ExtractedResult & {
-  /** Grader score */
-  score: GraderResult
-  /** Outcome data from grader (if grader returned outcome) */
-  outcome?: Record<string, unknown>
-}
-
-/**
- * Run mode for the pipeline run command.
- *
- * @remarks
- * - `schema`: Use headless adapter with schema file
- * - `simple`: Use Bun shell with placeholder substitution
- * - `shell`: Use Bun shell with PROMPT env variable
- */
-export type RunMode = 'schema' | 'simple' | 'shell'
-
-/**
- * Configuration for pipeline run command.
- */
-export type RunConfig = {
-  /** Run mode */
-  mode: RunMode
-  /** Path to schema file (for 'schema' mode) */
-  schemaPath?: string
-  /** Command template (for 'simple' mode) - {} is replaced with prompt */
-  simpleCommand?: string
-  /** Shell template (for 'shell' mode) - $PROMPT env var is available */
-  shellTemplate?: string
-  /** Working directory */
-  cwd?: string
-  /** Timeout per prompt in milliseconds */
-  timeout?: number
-  /** Show progress to stderr */
-  progress?: boolean
-}
-
-/**
- * Configuration for pipeline extract command.
- */
-export type ExtractConfig = {
-  /** Path to schema file for output parsing */
-  schemaPath: string
-  /** Show progress to stderr */
-  progress?: boolean
-}
-
-/**
- * Configuration for pipeline grade command.
- */
-export type GradeConfig = {
-  /** Path to grader module or executable */
-  graderPath: string
-  /** Show progress to stderr */
-  progress?: boolean
-}
-
-/**
- * Output format for pipeline format command.
- */
-export type FormatStyle = 'jsonl' | 'markdown' | 'csv'
-
-/**
- * Configuration for pipeline format command.
- */
-export type FormatConfig = {
-  /** Output format style */
-  style: FormatStyle
-  /** Show progress to stderr */
-  progress?: boolean
-}
-
-/**
- * Labeled run for comparison.
- *
- * @remarks
- * Associates a results file with a human-readable label
- * for the compare command output.
- */
-export type LabeledRun = {
-  /** Human-readable label (derived from filename or explicit) */
-  label: string
-  /** Path to results JSONL file */
-  path: string
-}
-
-/**
- * Run data provided to comparison graders.
- *
- * @remarks
- * Extended run data includes optional fields that built-in graders use:
- * - `score`: Grader result if the run was previously graded
- * - `duration`: Total duration from timing
- * - `toolErrors`: Whether tool errors occurred
- */
-export type ComparisonRunData = {
-  /** Final agent output */
-  output: string
-  /** Execution trajectory (optional, varies by adapter) */
-  trajectory?: TrajectoryStep[]
-  /** Grader score (if run was graded) */
-  score?: GraderResult
-  /** Total duration in milliseconds */
-  duration?: number
-  /** Whether tool errors occurred */
-  toolErrors?: boolean
-}
-
-/**
- * Input to comparison grader function.
- *
- * @remarks
- * Provides all runs' results for a single prompt ID
- * so the grader can compare and rank them.
- */
-export type ComparisonGraderInput = {
-  /** Test case identifier */
-  id: string
-  /** Original prompt input */
-  input: string | string[]
-  /** Grader context hint */
-  hint?: string
-  /** Optional metadata from original prompt */
-  metadata?: Record<string, unknown>
-  /** Results keyed by run label */
-  runs: Record<string, ComparisonRunData>
-}
-
-/**
- * Single ranking entry in comparison result.
- */
-export type ComparisonRanking = {
-  /** Run label */
-  run: string
-  /** Rank position (1 = best) */
-  rank: number
-  /** Numeric score */
-  score: number
-}
-
-/**
- * Result from comparison grader function.
- *
- * @remarks
- * Rankings should be ordered from best to worst.
- */
-export type ComparisonGraderResult = {
-  /** Rankings from best to worst */
-  rankings: ComparisonRanking[]
-  /** Optional reasoning for the rankings */
-  reasoning?: string
-}
-
-/**
- * Comparison grader function type.
- *
- * @remarks
- * User-provided graders implement this interface to compare
- * multiple runs of the same prompt.
- */
-export type ComparisonGrader = (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>
-
-/**
- * Configuration for pipeline compare command.
- */
-export type CompareConfig = {
-  /** Labeled runs to compare */
-  runs: LabeledRun[]
-  /** Path to comparison grader */
-  graderPath: string
-  /** Output file path */
-  outputPath?: string
-  /** Show progress to stderr */
-  progress?: boolean
-}
-
-/**
- * Comparison result for a single prompt.
- */
-export type ComparisonResult = {
-  /** Test case identifier */
-  id: string
-  /** Original prompt input */
-  input: string | string[]
-  /** Grader context hint */
-  hint?: string
-  /** Rankings from comparison grader */
-  rankings: ComparisonRanking[]
-  /** Optional reasoning */
-  reasoning?: string
-}
-
-// ============================================================================
-// Trials Comparison Types
-// ============================================================================
-
-/**
- * Run data for trials comparison.
- *
- * @remarks
- * Contains the trials-specific metrics (passAtK, passExpK) plus
- * the individual trial entries for deeper analysis.
- */
-export type TrialsComparisonRunData = {
-  /** Simple pass rate: passes / k */
-  passRate?: number
-  /** pass@k: probability of at least one pass in k samples */
-  passAtK?: number
-  /** pass^k: probability of all k samples passing */
-  passExpK?: number
-  /** Number of trials (k) */
-  k: number
-  /** Individual trial results */
-  trials: TrialEntry[]
-}
-
-/**
- * Input to trials comparison grader function.
- *
- * @remarks
- * Provides all runs' trial results for a single prompt ID
- * so the grader can compare capability and reliability.
- */
-export type TrialsComparisonGraderInput = {
-  /** Test case identifier */
-  id: string
-  /** Original prompt input */
-  input: string | string[]
-  /** Grader context hint */
-  hint?: string
-  /** Results keyed by run label */
-  runs: Record<string, TrialsComparisonRunData>
-}
-
-/**
- * Trials comparison grader function type.
- *
- * @remarks
- * User-provided graders implement this interface to compare
- * multiple runs of the same prompt using trials data.
- */
-export type TrialsComparisonGrader = (params: TrialsComparisonGraderInput) => Promise<ComparisonGraderResult>
diff --git a/src/pipeline/run.ts b/src/pipeline/run.ts
deleted file mode 100644
index 7b5a84a..0000000
--- a/src/pipeline/run.ts
+++ /dev/null
@@ -1,414 +0,0 @@
-/**
- * Pipeline run command - execute prompts and output raw results.
- *
- * @remarks
- * Supports three modes:
- * - `schema`: Use headless adapter with schema file (full trajectory capture)
- * - `simple`: Use Bun shell with `{}` placeholder for prompt
- * - `shell`: Use Bun shell with `$PROMPT` environment variable
- *
- * Output is RawOutput JSONL suitable for piping to `extract`.
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import { loadPrompts, logProgress, writeOutput } from '../core.ts'
-import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
-import { createSessionManager } from '../headless/headless-session-manager.ts'
-import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
-import type { RawOutput, RunConfig } from './pipeline.types.ts'
-
-/**
- * Execute a single prompt in simple mode.
- *
- * @remarks
- * Replaces `{}` placeholder in command with the prompt text.
- * Uses Bun shell for execution.
- *
- * @param prompt - Prompt text to execute
- * @param command - Command template with `{}` placeholder
- * @param timeout - Execution timeout in milliseconds
- * @returns Object with output lines and optional stderr error
- */
-const runSimple = async (
-  prompt: string,
-  command: string,
-  timeout: number,
-): Promise<{ lines: string[]; error?: string }> => {
-  const escapedPrompt = prompt.replace(/'/g, "'\\''")
-  const finalCmd = command.replace('{}', `'${escapedPrompt}'`)
-
-  const proc = Bun.spawn(['sh', '-c', finalCmd], {
-    stdout: 'pipe',
-    stderr: 'pipe',
-  })
-
-  const timeoutId = setTimeout(() => proc.kill(), timeout)
-
-  try {
-    const [stdout, stderr] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text()])
-    clearTimeout(timeoutId)
-    const lines = stdout.trim().split('\n').filter(Boolean)
-    return stderr.trim() ? { lines, error: stderr.trim() } : { lines }
-  } catch (err) {
-    clearTimeout(timeoutId)
-    return { lines: [], error: err instanceof Error ? err.message : String(err) }
-  }
-}
-
-/**
- * Execute a single prompt in shell mode.
- *
- * @remarks
- * Sets PROMPT environment variable and executes shell template.
- *
- * @param prompt - Prompt text to execute
- * @param template - Shell command template
- * @param timeout - Execution timeout in milliseconds
- * @returns Object with output lines and optional stderr error
- */
-const runShell = async (
-  prompt: string,
-  template: string,
-  timeout: number,
-): Promise<{ lines: string[]; error?: string }> => {
-  const proc = Bun.spawn(['sh', '-c', template], {
-    stdout: 'pipe',
-    stderr: 'pipe',
-    env: { ...process.env, PROMPT: prompt },
-  })
-
-  const timeoutId = setTimeout(() => proc.kill(), timeout)
-
-  try {
-    const [stdout, stderr] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text()])
-    clearTimeout(timeoutId)
-    const lines = stdout.trim().split('\n').filter(Boolean)
-    return stderr.trim() ? { lines, error: stderr.trim() } : { lines }
-  } catch (err) {
-    clearTimeout(timeoutId)
-    return { lines: [], error: err instanceof Error ? err.message : String(err) }
-  }
-}
-
-/**
- * Execute pipeline run with configuration object.
- *
- * @remarks
- * Processes prompts from stdin (if available) or from a file,
- * executing each and outputting RawOutput JSONL.
- *
- * @param config - Run configuration
- * @param prompts - Array of prompts to execute
- * @param outputPath - Optional output file path
- */
-export const runPipeline = async (
-  config: RunConfig,
-  prompts: Array<{ id: string; input: string | string[]; hint?: string; metadata?: Record<string, unknown> }>,
-  outputPath?: string,
-): Promise<void> => {
-  const {
-    mode,
-    schemaPath,
-    simpleCommand,
-    shellTemplate,
-    cwd,
-    timeout = DEFAULT_HARNESS_TIMEOUT,
-    progress = false,
-  } = config
-
-  const workingDir = cwd ?? process.cwd()
-  let isFirstOutput = true
-
-  // Clear output file if specified
-  if (outputPath) {
-    await Bun.write(outputPath, '')
-  }
-
-  if (mode === 'schema') {
-    // Schema mode: use headless adapter
-    if (!schemaPath) {
-      throw new Error('Schema path required for schema mode')
-    }
-
-    const schemaFile = Bun.file(schemaPath)
-    if (!(await schemaFile.exists())) {
-      throw new Error(`Schema file not found: ${schemaPath}`)
-    }
-
-    const rawSchema = await schemaFile.json()
-    const schema = parseHeadlessConfig(rawSchema)
-
-    const sessions = createSessionManager({
-      schema,
-      timeout,
-      verbose: progress,
-    })
-
-    logProgress(`Schema mode: ${schema.name}`, progress)
-
-    for (let i = 0; i < prompts.length; i++) {
-      const promptCase = prompts[i]
-      if (!promptCase) continue
-
-      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}`, progress)
-
-      const startTime = Date.now()
-      const rawLines: string[] = []
-      let error: string | undefined
-
-      try {
-        const session = await sessions.create(workingDir)
-        const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
-
-        for (const turnInput of inputs) {
-          const result = await sessions.prompt(session.id, turnInput)
-          // Collect raw JSON lines from updates
-          for (const update of result.updates) {
-            rawLines.push(JSON.stringify(update.raw))
-          }
-        }
-
-        sessions.destroy(session.id)
-      } catch (err) {
-        error = err instanceof Error ? err.message : String(err)
-      }
-
-      const endTime = Date.now()
-
-      const output: RawOutput = {
-        id: promptCase.id,
-        input: promptCase.input,
-        hint: promptCase.hint,
-        metadata: promptCase.metadata,
-        rawLines,
-        timing: {
-          start: startTime,
-          end: endTime,
-          total: endTime - startTime,
-        },
-        ...(error && { error }),
-      }
-
-      await writeOutput(JSON.stringify(output), outputPath, !isFirstOutput)
-      isFirstOutput = false
-    }
-  } else if (mode === 'simple') {
-    // Simple mode: placeholder substitution
-    if (!simpleCommand) {
-      throw new Error('Command required for simple mode')
-    }
-
-    logProgress(`Simple mode: ${simpleCommand}`, progress)
-
-    for (let i = 0; i < prompts.length; i++) {
-      const promptCase = prompts[i]
-      if (!promptCase) continue
-
-      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}`, progress)
-
-      const startTime = Date.now()
-      const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
-      const allLines: string[] = []
-      const errors: string[] = []
-
-      for (const input of inputs) {
-        const result = await runSimple(input, simpleCommand, timeout)
-        allLines.push(...result.lines)
-        if (result.error) errors.push(result.error)
-      }
-
-      const endTime = Date.now()
-
-      const output: RawOutput = {
-        id: promptCase.id,
-        input: promptCase.input,
-        hint: promptCase.hint,
-        metadata: promptCase.metadata,
-        rawLines: allLines,
-        timing: {
-          start: startTime,
-          end: endTime,
-          total: endTime - startTime,
-        },
-        ...(errors.length > 0 && { error: errors.join('\n') }),
-      }
-
-      await writeOutput(JSON.stringify(output), outputPath, !isFirstOutput)
-      isFirstOutput = false
-    }
-  } else if (mode === 'shell') {
-    // Shell mode: PROMPT env variable
-    if (!shellTemplate) {
-      throw new Error('Shell template required for shell mode')
-    }
-
-    logProgress(`Shell mode: ${shellTemplate}`, progress)
-
-    for (let i = 0; i < prompts.length; i++) {
-      const promptCase = prompts[i]
-      if (!promptCase) continue
-
-      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}`, progress)
-
-      const startTime = Date.now()
-      const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
-      const allLines: string[] = []
-      const errors: string[] = []
-
-      for (const input of inputs) {
-        const result = await runShell(input, shellTemplate, timeout)
-        allLines.push(...result.lines)
-        if (result.error) errors.push(result.error)
-      }
-
-      const endTime = Date.now()
-
-      const output: RawOutput = {
-        id: promptCase.id,
-        input: promptCase.input,
-        hint: promptCase.hint,
-        metadata: promptCase.metadata,
-        rawLines: allLines,
-        timing: {
-          start: startTime,
-          end: endTime,
-          total: endTime - startTime,
-        },
-        ...(errors.length > 0 && { error: errors.join('\n') }),
-      }
-
-      await writeOutput(JSON.stringify(output), outputPath, !isFirstOutput)
-      isFirstOutput = false
-    }
-  }
-
-  logProgress('Done!', progress)
-}
-
-/**
- * Read prompts from stdin if available.
- *
- * @returns Array of parsed prompts or null if stdin is empty
- */
-const readStdinPrompts = async (): Promise<Array<{ id: string; input: string | string[]; hint?: string }> | null> => {
-  // Check if stdin has data (not a TTY)
-  if (process.stdin.isTTY) {
-    return null
-  }
-
-  const chunks: Buffer[] = []
-  for await (const chunk of process.stdin) {
-    chunks.push(chunk)
-  }
-
-  const content = Buffer.concat(chunks).toString('utf-8').trim()
-  if (!content) return null
-
-  return content
-    .split('\n')
-    .filter(Boolean)
-    .map((line) => JSON.parse(line))
-}
-
-/**
- * Pipeline run command CLI handler.
- *
- * @param args - Command line arguments (after 'run')
- */
-export const run = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      schema: { type: 'string', short: 's' },
-      simple: { type: 'string' },
-      shell: { type: 'string' },
-      output: { type: 'string', short: 'o' },
-      cwd: { type: 'string', short: 'c' },
-      timeout: { type: 'string', short: 't' },
-      progress: { type: 'boolean', default: false },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness run [prompts.jsonl] [options]
-
-Execute prompts and output raw results for pipeline processing.
-
-Arguments:
-  prompts.jsonl     Input file (or pipe from stdin)
-
-Modes (choose one):
-  -s, --schema      Path to headless adapter schema (recommended)
-  --simple          Command template with {} placeholder
-  --shell           Shell template with $PROMPT env variable
-
-Options:
-  -o, --output      Output file (default: stdout)
-  -c, --cwd         Working directory for agent
-  -t, --timeout     Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
-  --progress        Show progress to stderr
-  -h, --help        Show this help message
-
-Examples:
-  # Schema mode (recommended)
-  agent-eval-harness run prompts.jsonl --schema claude.json | agent-eval-harness extract
-
-  # Simple mode with placeholder
-  agent-eval-harness run prompts.jsonl --simple "claude -p {} --output-format stream-json"
-
-  # Shell mode with env variable
-  agent-eval-harness run prompts.jsonl --shell 'claude -p "$PROMPT" --output-format stream-json'
-
-  # Pipe from stdin
-  cat prompts.jsonl | agent-eval-harness run --schema claude.json
-`)
-    return
-  }
-
-  // Determine mode
-  let mode: 'schema' | 'simple' | 'shell'
-  if (values.schema) {
-    mode = 'schema'
-  } else if (values.simple) {
-    mode = 'simple'
-  } else if (values.shell) {
-    mode = 'shell'
-  } else {
-    console.error('Error: Must specify --schema, --simple, or --shell mode')
-    process.exit(1)
-  }
-
-  // Load prompts from file or stdin
-  const promptsPath = positionals[0]
-  let prompts: Array<{ id: string; input: string | string[]; hint?: string; metadata?: Record<string, unknown> }>
-
-  if (promptsPath) {
-    prompts = await loadPrompts(promptsPath)
-  } else {
-    const stdinPrompts = await readStdinPrompts()
-    if (!stdinPrompts || stdinPrompts.length === 0) {
-      console.error('Error: No prompts provided (use file argument or pipe to stdin)')
-      process.exit(1)
-    }
-    prompts = stdinPrompts
-  }
-
-  await runPipeline(
-    {
-      mode,
-      schemaPath: values.schema,
-      simpleCommand: values.simple,
-      shellTemplate: values.shell,
-      cwd: values.cwd,
-      timeout: values.timeout ? Number.parseInt(values.timeout, 10) : undefined,
-      progress: values.progress,
-    },
-    prompts,
-    values.output,
-  )
-}
diff --git a/src/pipeline/tests/compare-format-detection.spec.ts b/src/pipeline/tests/compare-format-detection.spec.ts
deleted file mode 100644
index 4a958aa..0000000
--- a/src/pipeline/tests/compare-format-detection.spec.ts
+++ /dev/null
@@ -1,142 +0,0 @@
-/**
- * Unit tests for compare format detection.
- *
- * @remarks
- * Tests for auto-detecting CaptureResult vs TrialResult format.
- *
- * @packageDocumentation
- */
-
-import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
-import { detectAndValidateFormat, detectInputFormat } from '../compare-format-detection.ts'
-
-// ============================================================================
-// Test Fixtures
-// ============================================================================
-
-const CAPTURE_RESULT = JSON.stringify({
-  id: 'test-001',
-  input: 'Hello',
-  output: 'Hi there',
-  trajectory: [{ type: 'message', content: 'Hi', timestamp: 1234567890 }],
-  timing: { start: 1234567890, end: 1234567891, total: 1, sessionCreation: 0 },
-  metadata: {},
-  toolErrors: false,
-})
-
-const TRIAL_RESULT = JSON.stringify({
-  id: 'test-001',
-  input: 'Hello',
-  k: 3,
-  passRate: 0.67,
-  passAtK: 0.9,
-  passExpK: 0.3,
-  trials: [
-    { trialNum: 1, output: 'Hi', trajectory: [], duration: 100, pass: true, score: 1.0 },
-    { trialNum: 2, output: 'Hello', trajectory: [], duration: 120, pass: true, score: 0.8 },
-    { trialNum: 3, output: 'Error', trajectory: [], duration: 150, pass: false, score: 0.2 },
-  ],
-})
-
-const tempDir = `${import.meta.dir}/.test-tmp/format-detection`
-
-beforeAll(async () => {
-  await Bun.$`mkdir -p ${tempDir}`
-})
-
-afterAll(async () => {
-  await Bun.$`rm -rf ${tempDir}`
-})
-
-// ============================================================================
-// detectInputFormat Tests
-// ============================================================================
-
-describe('detectInputFormat', () => {
-  test('detects CaptureResult format', async () => {
-    const path = `${tempDir}/capture.jsonl`
-    await Bun.write(path, `${CAPTURE_RESULT}\n`)
-
-    const format = await detectInputFormat(path)
-
-    expect(format).toBe('capture')
-  })
-
-  test('detects TrialResult format', async () => {
-    const path = `${tempDir}/trial.jsonl`
-    await Bun.write(path, `${TRIAL_RESULT}\n`)
-
-    const format = await detectInputFormat(path)
-
-    expect(format).toBe('trials')
-  })
-
-  test('throws on empty file', async () => {
-    const path = `${tempDir}/empty.jsonl`
-    await Bun.write(path, '')
-
-    await expect(detectInputFormat(path)).rejects.toThrow('Empty file')
-  })
-
-  test('throws on invalid JSON', async () => {
-    const path = `${tempDir}/invalid.jsonl`
-    await Bun.write(path, 'not json\n')
-
-    await expect(detectInputFormat(path)).rejects.toThrow('Invalid JSON')
-  })
-
-  test('throws on unrecognized format', async () => {
-    const path = `${tempDir}/unknown.jsonl`
-    await Bun.write(path, `${JSON.stringify({ id: 'test', foo: 'bar' })}\n`)
-
-    await expect(detectInputFormat(path)).rejects.toThrow('Unable to detect format')
-  })
-
-  test('ignores empty lines and uses first non-empty line', async () => {
-    const path = `${tempDir}/with-empty.jsonl`
-    await Bun.write(path, `\n\n${CAPTURE_RESULT}\n`)
-
-    const format = await detectInputFormat(path)
-
-    expect(format).toBe('capture')
-  })
-})
-
-// ============================================================================
-// detectAndValidateFormat Tests
-// ============================================================================
-
-describe('detectAndValidateFormat', () => {
-  test('validates all files have same format', async () => {
-    const path1 = `${tempDir}/capture1.jsonl`
-    const path2 = `${tempDir}/capture2.jsonl`
-    await Bun.write(path1, `${CAPTURE_RESULT}\n`)
-    await Bun.write(path2, `${CAPTURE_RESULT}\n`)
-
-    const format = await detectAndValidateFormat([path1, path2])
-
-    expect(format).toBe('capture')
-  })
-
-  test('throws on format mismatch', async () => {
-    const capturePath = `${tempDir}/capture-mixed.jsonl`
-    const trialPath = `${tempDir}/trial-mixed.jsonl`
-    await Bun.write(capturePath, `${CAPTURE_RESULT}\n`)
-    await Bun.write(trialPath, `${TRIAL_RESULT}\n`)
-
-    await expect(detectAndValidateFormat([capturePath, trialPath])).rejects.toThrow('Format mismatch')
-  })
-
-  test('throws on empty file list', async () => {
-    await expect(detectAndValidateFormat([])).rejects.toThrow('No files provided')
-  })
-
-  test('works with single file', async () => {
-    const path = `${tempDir}/single-trial.jsonl`
-    await Bun.write(path, `${TRIAL_RESULT}\n`)
-
-    const format = await detectAndValidateFormat([path])
-
-    expect(format).toBe('trials')
-  })
-})
diff --git a/src/pipeline/tests/compare-statistical.spec.ts b/src/pipeline/tests/compare-statistical.spec.ts
deleted file mode 100644
index 8daf9a8..0000000
--- a/src/pipeline/tests/compare-statistical.spec.ts
+++ /dev/null
@@ -1,289 +0,0 @@
-/**
- * Integration tests for compare command statistical strategy.
- *
- * @remarks
- * Tests verify confidence interval computation for the statistical strategy
- * in the compare command with CaptureResult format.
- *
- * @packageDocumentation
- */
-
-import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
-import type { CaptureResult } from '../../schemas.ts'
-import { runCompare } from '../compare.ts'
-
-// ============================================================================
-// Test Fixtures
-// ============================================================================
-
-const createCaptureResult = (id: string, score: number, pass: boolean, duration: number = 1000): CaptureResult => ({
-  id,
-  input: `Prompt for ${id}`,
-  output: `Output for ${id}`,
-  trajectory: [{ type: 'message', content: `Output for ${id}`, timestamp: Date.now() }],
-  metadata: {},
-  timing: {
-    start: Date.now(),
-    end: Date.now() + duration,
-    sessionCreation: 100,
-    total: duration,
-  },
-  toolErrors: false,
-  score: {
-    pass,
-    score,
-    reasoning: pass ? 'Passed' : 'Failed',
-  },
-})
-
-const tempDir = `${import.meta.dir}/.test-tmp/compare-statistical`
-
-beforeAll(async () => {
-  await Bun.$`mkdir -p ${tempDir}`
-})
-
-afterAll(async () => {
-  await Bun.$`rm -rf ${tempDir}`
-})
-
-// ============================================================================
-// Statistical Strategy CI Tests
-// ============================================================================
-
-describe('runCompare statistical strategy', () => {
-  test('computes confidence intervals for quality metrics', async () => {
-    const run1Path = `${tempDir}/ci-qual-run1.jsonl`
-    const run2Path = `${tempDir}/ci-qual-run2.jsonl`
-
-    // Create multiple prompts with varying scores for meaningful CI computation
-    const results1 = [
-      createCaptureResult('p1', 0.9, true, 1000),
-      createCaptureResult('p2', 0.85, true, 1100),
-      createCaptureResult('p3', 0.95, true, 900),
-      createCaptureResult('p4', 0.8, true, 1200),
-    ]
-    const results2 = [
-      createCaptureResult('p1', 0.6, false, 2000),
-      createCaptureResult('p2', 0.5, false, 2100),
-      createCaptureResult('p3', 0.7, true, 1900),
-      createCaptureResult('p4', 0.55, false, 2200),
-    ]
-
-    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
-    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
-
-    const report = await runCompare({
-      runs: [
-        { label: 'high', path: run1Path },
-        { label: 'low', path: run2Path },
-      ],
-      strategy: 'statistical',
-      progress: false,
-    })
-
-    // Verify confidence intervals are computed for quality
-    const highQuality = report.quality.high
-    expect(highQuality).toBeDefined()
-    expect(highQuality?.confidenceIntervals).toBeDefined()
-    expect(highQuality?.confidenceIntervals?.avgScore).toBeDefined()
-    expect(highQuality?.confidenceIntervals?.passRate).toBeDefined()
-
-    // avgScore CI should be a tuple [lower, upper]
-    const avgScoreCI = highQuality?.confidenceIntervals?.avgScore
-    expect(avgScoreCI).toHaveLength(2)
-    expect(avgScoreCI?.[0]).toBeLessThanOrEqual(avgScoreCI?.[1] ?? 0)
-
-    // CI should contain the average (within reasonable bounds)
-    expect(avgScoreCI?.[0]).toBeLessThanOrEqual(highQuality?.avgScore ?? 0)
-    expect(avgScoreCI?.[1]).toBeGreaterThanOrEqual(highQuality?.avgScore ?? 1)
-
-    // passRate CI should also be valid
-    const passRateCI = highQuality?.confidenceIntervals?.passRate
-    expect(passRateCI).toHaveLength(2)
-    expect(passRateCI?.[0]).toBeLessThanOrEqual(passRateCI?.[1] ?? 0)
-
-    // Verify reliability metrics include type discriminator
-    expect(report.reliability.high?.type).toBe('run')
-    expect(report.reliability.low?.type).toBe('run')
-
-    // Verify quality metrics include type discriminator
-    expect(report.quality.high?.type).toBe('run')
-    expect(report.quality.low?.type).toBe('run')
-  })
-
-  test('computes confidence intervals for performance metrics', async () => {
-    const run1Path = `${tempDir}/ci-perf-run1.jsonl`
-    const run2Path = `${tempDir}/ci-perf-run2.jsonl`
-
-    // Create results with varying latencies
-    const results1 = [
-      createCaptureResult('p1', 0.9, true, 1000),
-      createCaptureResult('p2', 0.85, true, 1100),
-      createCaptureResult('p3', 0.95, true, 900),
-      createCaptureResult('p4', 0.8, true, 1050),
-    ]
-    const results2 = [
-      createCaptureResult('p1', 0.7, true, 2000),
-      createCaptureResult('p2', 0.65, true, 2200),
-      createCaptureResult('p3', 0.75, true, 1800),
-      createCaptureResult('p4', 0.6, true, 2100),
-    ]
-
-    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
-    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
-
-    const report = await runCompare({
-      runs: [
-        { label: 'fast', path: run1Path },
-        { label: 'slow', path: run2Path },
-      ],
-      strategy: 'statistical',
-      progress: false,
-    })
-
-    // Verify confidence intervals are computed for performance
-    const fastPerf = report.performance.fast
-    expect(fastPerf).toBeDefined()
-    expect(fastPerf?.confidenceIntervals).toBeDefined()
-    expect(fastPerf?.confidenceIntervals?.latencyMean).toBeDefined()
-
-    // latencyMean CI should be a tuple [lower, upper]
-    const latencyCI = fastPerf?.confidenceIntervals?.latencyMean
-    expect(latencyCI).toHaveLength(2)
-    expect(latencyCI?.[0]).toBeLessThanOrEqual(latencyCI?.[1] ?? 0)
-
-    // Fast run should have lower latency CI than slow run
-    const slowPerf = report.performance.slow
-    const slowLatencyCI = slowPerf?.confidenceIntervals?.latencyMean
-    expect(latencyCI?.[1]).toBeLessThan(slowLatencyCI?.[0] ?? 0)
-  })
-
-  test('weighted strategy does not compute confidence intervals', async () => {
-    const run1Path = `${tempDir}/no-ci-run1.jsonl`
-    const run2Path = `${tempDir}/no-ci-run2.jsonl`
-
-    const results1 = [createCaptureResult('p1', 0.9, true), createCaptureResult('p2', 0.85, true)]
-    const results2 = [createCaptureResult('p1', 0.6, false), createCaptureResult('p2', 0.5, false)]
-
-    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
-    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
-
-    const report = await runCompare({
-      runs: [
-        { label: 'run1', path: run1Path },
-        { label: 'run2', path: run2Path },
-      ],
-      strategy: 'weighted', // Default strategy
-      progress: false,
-    })
-
-    // Confidence intervals should NOT be present for weighted strategy
-    const quality = report.quality.run1
-    expect(quality?.confidenceIntervals).toBeUndefined()
-
-    const perf = report.performance.run1
-    expect(perf?.confidenceIntervals).toBeUndefined()
-  })
-
-  test('statistical strategy includes CIs in markdown output', async () => {
-    const run1Path = `${tempDir}/ci-md-run1.jsonl`
-    const run2Path = `${tempDir}/ci-md-run2.jsonl`
-    const outputPath = `${tempDir}/ci-report.md`
-
-    const results1 = [createCaptureResult('p1', 0.9, true, 1000), createCaptureResult('p2', 0.85, true, 1100)]
-    const results2 = [createCaptureResult('p1', 0.6, false, 2000), createCaptureResult('p2', 0.5, false, 2100)]
-
-    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
-    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
-
-    await runCompare({
-      runs: [
-        { label: 'agent1', path: run1Path },
-        { label: 'agent2', path: run2Path },
-      ],
-      strategy: 'statistical',
-      outputPath,
-      format: 'markdown',
-      progress: false,
-    })
-
-    const content = await Bun.file(outputPath).text()
-
-    // Markdown should include 95% CI column headers
-    expect(content).toContain('95% CI')
-    // Should contain CI values in bracket format [lower, upper]
-    expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/)
-  })
-
-  test('handles single sample gracefully with degenerate CI', async () => {
-    const run1Path = `${tempDir}/single-run1.jsonl`
-    const run2Path = `${tempDir}/single-run2.jsonl`
-
-    // Single sample per run
-    const result1 = createCaptureResult('p1', 0.9, true)
-    const result2 = createCaptureResult('p1', 0.5, false)
-
-    await Bun.write(run1Path, JSON.stringify(result1))
-    await Bun.write(run2Path, JSON.stringify(result2))
-
-    const report = await runCompare({
-      runs: [
-        { label: 'single1', path: run1Path },
-        { label: 'single2', path: run2Path },
-      ],
-      strategy: 'statistical',
-      progress: false,
-    })
-
-    // Should still compute CIs (they will be degenerate for single sample)
-    const quality = report.quality.single1
-    expect(quality?.confidenceIntervals).toBeDefined()
-    expect(quality?.confidenceIntervals?.avgScore).toBeDefined()
-
-    // For single sample, CI should collapse to the value
-    const ci = quality?.confidenceIntervals?.avgScore
-    expect(ci?.[0]).toBeCloseTo(ci?.[1] ?? 0, 2)
-    expect(ci?.[0]).toBeCloseTo(quality?.avgScore ?? 0, 2)
-  })
-
-  test('JSON output includes confidence intervals structure', async () => {
-    const run1Path = `${tempDir}/json-ci-run1.jsonl`
-    const run2Path = `${tempDir}/json-ci-run2.jsonl`
-    const outputPath = `${tempDir}/ci-report.json`
-
-    const results1 = [
-      createCaptureResult('p1', 0.9, true),
-      createCaptureResult('p2', 0.85, true),
-      createCaptureResult('p3', 0.95, true),
-    ]
-    const results2 = [
-      createCaptureResult('p1', 0.6, false),
-      createCaptureResult('p2', 0.5, false),
-      createCaptureResult('p3', 0.7, true),
-    ]
-
-    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
-    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
-
-    await runCompare({
-      runs: [
-        { label: 'high', path: run1Path },
-        { label: 'low', path: run2Path },
-      ],
-      strategy: 'statistical',
-      outputPath,
-      format: 'json',
-      progress: false,
-    })
-
-    const content = await Bun.file(outputPath).text()
-    const parsed = JSON.parse(content)
-
-    // Verify JSON structure includes confidenceIntervals
-    expect(parsed.quality.high.confidenceIntervals).toBeDefined()
-    expect(parsed.quality.high.confidenceIntervals.avgScore).toBeInstanceOf(Array)
-    expect(parsed.quality.high.confidenceIntervals.avgScore.length).toBe(2)
-    expect(parsed.performance.high.confidenceIntervals).toBeDefined()
-    expect(parsed.performance.high.confidenceIntervals.latencyMean).toBeInstanceOf(Array)
-  })
-})
diff --git a/src/pipeline/tests/compare-trials.spec.ts b/src/pipeline/tests/compare-trials.spec.ts
deleted file mode 100644
index 9064344..0000000
--- a/src/pipeline/tests/compare-trials.spec.ts
+++ /dev/null
@@ -1,592 +0,0 @@
-/**
- * Unit tests for trials comparison module.
- *
- * @remarks
- * Tests for runTrialsCompare and supporting functions.
- *
- * @packageDocumentation
- */
-
-import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
-import { buildTrialsIndex, runTrialsCompare } from '../compare-trials.ts'
-
-// ============================================================================
-// Test Fixtures
-// ============================================================================
-
-const createTrialResult = (
-  id: string,
-  passAtK: number,
-  passExpK: number,
-  k: number = 3,
-  includeScores: boolean = true,
-) => ({
-  id,
-  input: `Prompt for ${id}`,
-  k,
-  ...(includeScores && { passRate: passAtK, passAtK, passExpK }),
-  trials: Array.from({ length: k }, (_, i) => ({
-    trialNum: i + 1,
-    output: `Output ${i + 1}`,
-    trajectory: [],
-    duration: 100 + i * 10,
-    ...(includeScores && { pass: Math.random() < passAtK, score: passAtK }),
-  })),
-})
-
-const tempDir = `${import.meta.dir}/.test-tmp/compare-trials`
-
-beforeAll(async () => {
-  await Bun.$`mkdir -p ${tempDir}`
-})
-
-afterAll(async () => {
-  await Bun.$`rm -rf ${tempDir}`
-})
-
-// ============================================================================
-// buildTrialsIndex Tests
-// ============================================================================
-
-describe('buildTrialsIndex', () => {
-  test('builds index from JSONL file', async () => {
-    const path = `${tempDir}/trials-index.jsonl`
-    const trial1 = createTrialResult('test-001', 0.9, 0.3)
-    const trial2 = createTrialResult('test-002', 0.8, 0.6)
-    await Bun.write(path, [JSON.stringify(trial1), JSON.stringify(trial2)].join('\n'))
-
-    const index = await buildTrialsIndex(path)
-
-    expect(index.size).toBe(2)
-    expect(index.get('test-001')?.passAtK).toBe(0.9)
-    expect(index.get('test-002')?.passExpK).toBe(0.6)
-  })
-
-  test('handles empty file', async () => {
-    const path = `${tempDir}/empty-trials.jsonl`
-    await Bun.write(path, '')
-
-    const index = await buildTrialsIndex(path)
-
-    expect(index.size).toBe(0)
-  })
-
-  test('throws on invalid JSON', async () => {
-    const path = `${tempDir}/invalid-trials.jsonl`
-    await Bun.write(path, 'not json\n')
-
-    await expect(buildTrialsIndex(path)).rejects.toThrow()
-  })
-})
-
-// ============================================================================
-// runTrialsCompare Tests
-// ============================================================================
-
-describe('runTrialsCompare', () => {
-  test('compares two trial runs and produces report', async () => {
-    const run1Path = `${tempDir}/run1.jsonl`
-    const run2Path = `${tempDir}/run2.jsonl`
-
-    const trial1a = createTrialResult('test-001', 0.9, 0.7)
-    const trial1b = createTrialResult('test-002', 0.8, 0.5)
-    const trial2a = createTrialResult('test-001', 0.95, 0.9)
-    const trial2b = createTrialResult('test-002', 0.6, 0.4)
-
-    await Bun.write(run1Path, [JSON.stringify(trial1a), JSON.stringify(trial1b)].join('\n'))
-    await Bun.write(run2Path, [JSON.stringify(trial2a), JSON.stringify(trial2b)].join('\n'))
-
-    const outputPath = `${tempDir}/comparison.json`
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'baseline', path: run1Path },
-        { label: 'variant', path: run2Path },
-      ],
-      outputPath,
-      progress: false,
-    })
-
-    expect(report.meta.inputFormat).toBe('trials')
-    expect(report.meta.runs).toEqual(['baseline', 'variant'])
-    expect(report.meta.promptCount).toBe(2)
-    expect(report.capability).toBeDefined()
-    expect(report.reliability).toBeDefined()
-    expect(report.reliability.baseline?.type).toBe('trial')
-    expect(report.reliability.variant?.type).toBe('trial')
-    expect(report.flakiness).toBeDefined()
-    expect(report.headToHead.capability.length).toBeGreaterThan(0)
-
-    // Verify output file was written
-    const outputExists = await Bun.file(outputPath).exists()
-    expect(outputExists).toBe(true)
-  })
-
-  test('throws with fewer than 2 runs', async () => {
-    const run1Path = `${tempDir}/single-run.jsonl`
-    await Bun.write(run1Path, JSON.stringify(createTrialResult('test-001', 0.9, 0.7)))
-
-    await expect(
-      runTrialsCompare({
-        runs: [{ label: 'only', path: run1Path }],
-        progress: false,
-      }),
-    ).rejects.toThrow('At least 2 runs required')
-  })
-
-  test('skips prompts only in one run', async () => {
-    const run1Path = `${tempDir}/partial1.jsonl`
-    const run2Path = `${tempDir}/partial2.jsonl`
-
-    // Only run1 has test-001
-    const trial1a = createTrialResult('test-001', 0.9, 0.7)
-    // Both have test-002
-    const trial1b = createTrialResult('test-002', 0.8, 0.5)
-    const trial2b = createTrialResult('test-002', 0.6, 0.4)
-
-    await Bun.write(run1Path, [JSON.stringify(trial1a), JSON.stringify(trial1b)].join('\n'))
-    await Bun.write(run2Path, JSON.stringify(trial2b))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'run1', path: run1Path },
-        { label: 'run2', path: run2Path },
-      ],
-      progress: false,
-    })
-
-    // Only test-002 should be compared (both runs have it)
-    expect(report.headToHead.overall.length).toBeGreaterThan(0)
-    // Per-prompt should only have test-002
-    const perPromptIds = report.perPrompt?.map((p) => p.id) ?? []
-    expect(perPromptIds).toContain('test-002')
-    expect(perPromptIds).not.toContain('test-001')
-  })
-
-  test('generates markdown output when format is markdown', async () => {
-    const run1Path = `${tempDir}/md-run1.jsonl`
-    const run2Path = `${tempDir}/md-run2.jsonl`
-    const outputPath = `${tempDir}/report.md`
-
-    const trial1 = createTrialResult('test-001', 0.9, 0.7)
-    const trial2 = createTrialResult('test-001', 0.8, 0.6)
-
-    await Bun.write(run1Path, JSON.stringify(trial1))
-    await Bun.write(run2Path, JSON.stringify(trial2))
-
-    await runTrialsCompare({
-      runs: [
-        { label: 'agent1', path: run1Path },
-        { label: 'agent2', path: run2Path },
-      ],
-      outputPath,
-      format: 'markdown',
-      progress: false,
-    })
-
-    const content = await Bun.file(outputPath).text()
-    expect(content).toContain('# Trials Comparison Report')
-    expect(content).toContain('## Capability')
-    expect(content).toContain('## Reliability')
-    expect(content).toContain('## Flakiness')
-    expect(content).toContain('agent1')
-    expect(content).toContain('agent2')
-  })
-
-  test('uses statistical strategy when specified', async () => {
-    const run1Path = `${tempDir}/stat-run1.jsonl`
-    const run2Path = `${tempDir}/stat-run2.jsonl`
-
-    const trial1 = createTrialResult('test-001', 0.9, 0.7)
-    const trial2 = createTrialResult('test-001', 0.5, 0.3)
-
-    await Bun.write(run1Path, JSON.stringify(trial1))
-    await Bun.write(run2Path, JSON.stringify(trial2))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'better', path: run1Path },
-        { label: 'worse', path: run2Path },
-      ],
-      strategy: 'statistical',
-      progress: false,
-    })
-
-    // Report should be generated without error
-    expect(report.meta.runs).toEqual(['better', 'worse'])
-  })
-
-  test('statistical strategy computes confidence intervals for capability metrics', async () => {
-    const run1Path = `${tempDir}/ci-cap-run1.jsonl`
-    const run2Path = `${tempDir}/ci-cap-run2.jsonl`
-
-    // Create multiple prompts for meaningful CI computation
-    const trials1 = [
-      createTrialResult('p1', 0.9, 0.8),
-      createTrialResult('p2', 0.85, 0.7),
-      createTrialResult('p3', 0.95, 0.9),
-    ]
-    const trials2 = [
-      createTrialResult('p1', 0.6, 0.4),
-      createTrialResult('p2', 0.5, 0.3),
-      createTrialResult('p3', 0.7, 0.5),
-    ]
-
-    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
-    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'high', path: run1Path },
-        { label: 'low', path: run2Path },
-      ],
-      strategy: 'statistical',
-      progress: false,
-    })
-
-    // Verify confidence intervals are computed for capability
-    const highCap = report.capability.high
-    expect(highCap).toBeDefined()
-    expect(highCap?.confidenceIntervals).toBeDefined()
-    expect(highCap?.confidenceIntervals?.avgPassAtK).toBeDefined()
-
-    // CI should be a tuple [lower, upper]
-    const ci = highCap?.confidenceIntervals?.avgPassAtK
-    expect(ci).toHaveLength(2)
-    expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0)
-
-    // CI should contain the average (within reasonable bounds)
-    expect(ci?.[0]).toBeLessThanOrEqual(highCap?.avgPassAtK ?? 0)
-    expect(ci?.[1]).toBeGreaterThanOrEqual(highCap?.avgPassAtK ?? 1)
-  })
-
-  test('statistical strategy computes confidence intervals for reliability metrics', async () => {
-    const run1Path = `${tempDir}/ci-rel-run1.jsonl`
-    const run2Path = `${tempDir}/ci-rel-run2.jsonl`
-
-    const trials1 = [
-      createTrialResult('p1', 0.9, 0.85),
-      createTrialResult('p2', 0.8, 0.75),
-      createTrialResult('p3', 0.85, 0.8),
-    ]
-    const trials2 = [
-      createTrialResult('p1', 0.7, 0.3),
-      createTrialResult('p2', 0.6, 0.2),
-      createTrialResult('p3', 0.65, 0.25),
-    ]
-
-    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
-    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'reliable', path: run1Path },
-        { label: 'flaky', path: run2Path },
-      ],
-      strategy: 'statistical',
-      progress: false,
-    })
-
-    // Verify confidence intervals are computed for reliability
-    const reliableRel = report.reliability.reliable
-    expect(reliableRel).toBeDefined()
-    expect(reliableRel?.type).toBe('trial')
-    expect(reliableRel?.confidenceIntervals).toBeDefined()
-    expect(reliableRel?.confidenceIntervals?.avgPassExpK).toBeDefined()
-
-    // CI should be a tuple [lower, upper]
-    const ci = reliableRel?.confidenceIntervals?.avgPassExpK
-    expect(ci).toHaveLength(2)
-    expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0)
-  })
-
-  test('weighted strategy does not compute confidence intervals', async () => {
-    const run1Path = `${tempDir}/no-ci-run1.jsonl`
-    const run2Path = `${tempDir}/no-ci-run2.jsonl`
-
-    const trial1 = createTrialResult('test-001', 0.9, 0.7)
-    const trial2 = createTrialResult('test-001', 0.5, 0.3)
-
-    await Bun.write(run1Path, JSON.stringify(trial1))
-    await Bun.write(run2Path, JSON.stringify(trial2))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'run1', path: run1Path },
-        { label: 'run2', path: run2Path },
-      ],
-      strategy: 'weighted', // Default strategy
-      progress: false,
-    })
-
-    // Confidence intervals should NOT be present for weighted strategy
-    const cap = report.capability.run1
-    expect(cap?.confidenceIntervals).toBeUndefined()
-
-    const rel = report.reliability.run1
-    expect(rel?.confidenceIntervals).toBeUndefined()
-  })
-
-  test('statistical strategy includes CIs in markdown output', async () => {
-    const run1Path = `${tempDir}/ci-md-run1.jsonl`
-    const run2Path = `${tempDir}/ci-md-run2.jsonl`
-    const outputPath = `${tempDir}/ci-report.md`
-
-    const trials1 = [createTrialResult('p1', 0.9, 0.8), createTrialResult('p2', 0.85, 0.75)]
-    const trials2 = [createTrialResult('p1', 0.6, 0.4), createTrialResult('p2', 0.5, 0.3)]
-
-    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
-    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
-
-    await runTrialsCompare({
-      runs: [
-        { label: 'agent1', path: run1Path },
-        { label: 'agent2', path: run2Path },
-      ],
-      strategy: 'statistical',
-      outputPath,
-      format: 'markdown',
-      progress: false,
-    })
-
-    const content = await Bun.file(outputPath).text()
-
-    // Markdown should include 95% CI column headers
-    expect(content).toContain('95% CI')
-    // Should contain CI values in bracket format [lower, upper]
-    expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/)
-  })
-
-  test('computes correct capability metrics', async () => {
-    const run1Path = `${tempDir}/cap-run1.jsonl`
-
-    // Create 3 prompts with known passAtK values
-    const trials = [
-      createTrialResult('p1', 1.0, 0.8), // passAtK = 1.0
-      createTrialResult('p2', 0.5, 0.3), // passAtK = 0.5
-      createTrialResult('p3', 0.8, 0.6), // passAtK = 0.8
-    ]
-    // Average passAtK = (1.0 + 0.5 + 0.8) / 3 = 0.767
-    // Sorted: 0.5, 0.8, 1.0 -> median = 0.8
-
-    await Bun.write(run1Path, trials.map((t) => JSON.stringify(t)).join('\n'))
-
-    const run2Path = `${tempDir}/cap-run2.jsonl`
-    await Bun.write(run2Path, trials.map((t) => JSON.stringify(t)).join('\n'))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'test', path: run1Path },
-        { label: 'test2', path: run2Path },
-      ],
-      progress: false,
-    })
-
-    const cap = report.capability.test
-    expect(cap).toBeDefined()
-    // Average should be approximately 0.767
-    expect(cap?.avgPassAtK).toBeCloseTo(0.767, 2)
-    // Median of [0.5, 0.8, 1.0] = 0.8
-    expect(cap?.medianPassAtK).toBeCloseTo(0.8, 2)
-  })
-
-  test('identifies flaky prompts correctly', async () => {
-    const run1Path = `${tempDir}/flaky-run1.jsonl`
-
-    // Create prompts with varying flakiness
-    const trials = [
-      createTrialResult('consistent', 0.9, 0.9), // flakiness = 0
-      createTrialResult('flaky', 0.9, 0.1), // flakiness = 0.8
-      createTrialResult('moderate', 0.7, 0.5), // flakiness = 0.2
-    ]
-
-    await Bun.write(run1Path, trials.map((t) => JSON.stringify(t)).join('\n'))
-
-    const run2Path = `${tempDir}/flaky-run2.jsonl`
-    await Bun.write(run2Path, trials.map((t) => JSON.stringify(t)).join('\n'))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'test', path: run1Path },
-        { label: 'test2', path: run2Path },
-      ],
-      progress: false,
-    })
-
-    const flak = report.flakiness.test
-    expect(flak).toBeDefined()
-    // 2 prompts have non-zero flakiness
-    expect(flak?.flakyPromptCount).toBe(2)
-    // Top flaky should include 'flaky' prompt
-    const topFlakyIds = flak?.topFlakyPrompts.map((p) => p.id) ?? []
-    expect(topFlakyIds).toContain('flaky')
-  })
-
-  test('includes performance metrics with latency stats', async () => {
-    const run1Path = `${tempDir}/perf-run1.jsonl`
-    const run2Path = `${tempDir}/perf-run2.jsonl`
-
-    const trial1 = createTrialResult('test-001', 0.9, 0.7)
-    const trial2 = createTrialResult('test-001', 0.8, 0.6)
-
-    await Bun.write(run1Path, JSON.stringify(trial1))
-    await Bun.write(run2Path, JSON.stringify(trial2))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'run1', path: run1Path },
-        { label: 'run2', path: run2Path },
-      ],
-      progress: false,
-    })
-
-    // Performance should always be present
-    expect(report.performance).toBeDefined()
-    expect(report.performance.run1).toBeDefined()
-    expect(report.performance.run2).toBeDefined()
-
-    const perf = report.performance.run1
-    expect(perf?.latency).toBeDefined()
-    expect(perf?.latency.p50).toBeGreaterThan(0)
-    expect(perf?.latency.mean).toBeGreaterThan(0)
-    expect(perf?.latency.min).toBeGreaterThan(0)
-    expect(perf?.latency.max).toBeGreaterThan(0)
-    expect(perf?.totalDuration).toBeGreaterThan(0)
-  })
-
-  test('includes quality metrics when scores are present', async () => {
-    const run1Path = `${tempDir}/qual-run1.jsonl`
-    const run2Path = `${tempDir}/qual-run2.jsonl`
-
-    // createTrialResult always includes score fields
-    const trial1 = createTrialResult('test-001', 0.9, 0.7)
-    const trial2 = createTrialResult('test-001', 0.8, 0.6)
-
-    await Bun.write(run1Path, JSON.stringify(trial1))
-    await Bun.write(run2Path, JSON.stringify(trial2))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'run1', path: run1Path },
-        { label: 'run2', path: run2Path },
-      ],
-      progress: false,
-    })
-
-    // Quality should be present since trials have scores
-    expect(report.quality).toBeDefined()
-    expect(report.quality?.run1).toBeDefined()
-
-    const qual = report.quality?.run1
-    expect(qual?.type).toBe('trial')
-    expect(qual?.avgScore).toBeGreaterThan(0)
-    expect(qual?.medianScore).toBeGreaterThan(0)
-    expect(qual?.p25Score).toBeDefined()
-    expect(qual?.p75Score).toBeDefined()
-  })
-
-  test('omits quality metrics when scores are absent', async () => {
-    const run1Path = `${tempDir}/noqual-run1.jsonl`
-    const run2Path = `${tempDir}/noqual-run2.jsonl`
-
-    // Create trials without scores (includeScores=false)
-    const trial1 = createTrialResult('test-001', 0, 0, 3, false)
-    const trial2 = createTrialResult('test-001', 0, 0, 3, false)
-
-    await Bun.write(run1Path, JSON.stringify(trial1))
-    await Bun.write(run2Path, JSON.stringify(trial2))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'run1', path: run1Path },
-        { label: 'run2', path: run2Path },
-      ],
-      progress: false,
-    })
-
-    // Quality should NOT be present since no trials have scores
-    expect(report.quality).toBeUndefined()
-
-    // Performance should still be present
-    expect(report.performance).toBeDefined()
-    expect(report.performance.run1?.latency.mean).toBeGreaterThan(0)
-  })
-
-  test('statistical strategy computes CIs for quality and performance', async () => {
-    const run1Path = `${tempDir}/ci-qp-run1.jsonl`
-    const run2Path = `${tempDir}/ci-qp-run2.jsonl`
-
-    const trials1 = [
-      createTrialResult('p1', 0.9, 0.8),
-      createTrialResult('p2', 0.85, 0.7),
-      createTrialResult('p3', 0.95, 0.9),
-    ]
-    const trials2 = [
-      createTrialResult('p1', 0.6, 0.4),
-      createTrialResult('p2', 0.5, 0.3),
-      createTrialResult('p3', 0.7, 0.5),
-    ]
-
-    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
-    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
-
-    const report = await runTrialsCompare({
-      runs: [
-        { label: 'high', path: run1Path },
-        { label: 'low', path: run2Path },
-      ],
-      strategy: 'statistical',
-      progress: false,
-    })
-
-    // Quality CIs
-    const highQual = report.quality?.high
-    expect(highQual).toBeDefined()
-    expect(highQual?.confidenceIntervals).toBeDefined()
-    expect(highQual?.confidenceIntervals?.avgScore).toBeDefined()
-
-    const qualCI = highQual?.confidenceIntervals?.avgScore
-    expect(qualCI).toHaveLength(2)
-    expect(qualCI?.[0]).toBeLessThanOrEqual(qualCI?.[1] ?? 0)
-
-    // Performance CIs
-    const highPerf = report.performance.high
-    expect(highPerf).toBeDefined()
-    expect(highPerf?.confidenceIntervals).toBeDefined()
-    expect(highPerf?.confidenceIntervals?.latencyMean).toBeDefined()
-
-    const perfCI = highPerf?.confidenceIntervals?.latencyMean
-    expect(perfCI).toHaveLength(2)
-    expect(perfCI?.[0]).toBeLessThanOrEqual(perfCI?.[1] ?? 0)
-  })
-
-  test('markdown output includes quality and performance tables', async () => {
-    const run1Path = `${tempDir}/md-qp-run1.jsonl`
-    const run2Path = `${tempDir}/md-qp-run2.jsonl`
-    const outputPath = `${tempDir}/qp-report.md`
-
-    const trial1 = createTrialResult('test-001', 0.9, 0.7)
-    const trial2 = createTrialResult('test-001', 0.8, 0.6)
-
-    await Bun.write(run1Path, JSON.stringify(trial1))
-    await Bun.write(run2Path, JSON.stringify(trial2))
-
-    await runTrialsCompare({
-      runs: [
-        { label: 'agent1', path: run1Path },
-        { label: 'agent2', path: run2Path },
-      ],
-      outputPath,
-      format: 'markdown',
-      progress: false,
-    })
-
-    const content = await Bun.file(outputPath).text()
-
-    // Should contain quality and performance sections
-    expect(content).toContain('## Quality (Scores)')
-    expect(content).toContain('## Performance (Latency)')
-    expect(content).toContain('Avg Score')
-    expect(content).toContain('P50 (ms)')
-    expect(content).toContain('Mean (ms)')
-  })
-})
diff --git a/src/pipeline/tests/compare-utils.spec.ts b/src/pipeline/tests/compare-utils.spec.ts
deleted file mode 100644
index 31cf933..0000000
--- a/src/pipeline/tests/compare-utils.spec.ts
+++ /dev/null
@@ -1,128 +0,0 @@
-/**
- * Unit tests for compare-utils shared utilities.
- *
- * @remarks
- * Tests for percentile, computeLatencyStats, and computeScoreDistribution.
- *
- * @packageDocumentation
- */
-
-import { describe, expect, test } from 'bun:test'
-import { computeLatencyStats, computeScoreDistribution, percentile } from '../compare-utils.ts'
-
-// ============================================================================
-// percentile Tests
-// ============================================================================
-
-describe('percentile', () => {
-  test('computes correct percentile values', () => {
-    const sorted = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
-
-    expect(percentile(sorted, 0.5)).toBe(60)
-    expect(percentile(sorted, 0.25)).toBe(30)
-    expect(percentile(sorted, 0.75)).toBe(80)
-    expect(percentile(sorted, 0.9)).toBe(100)
-  })
-
-  test('returns 0 for empty array', () => {
-    expect(percentile([], 0.5)).toBe(0)
-  })
-
-  test('handles single-element array', () => {
-    expect(percentile([42], 0.5)).toBe(42)
-    expect(percentile([42], 0.0)).toBe(42)
-    expect(percentile([42], 1.0)).toBe(42)
-  })
-
-  test('handles p=0 and p=1 boundary values', () => {
-    const sorted = [10, 20, 30]
-
-    expect(percentile(sorted, 0)).toBe(10)
-    expect(percentile(sorted, 1)).toBe(30)
-  })
-})
-
-// ============================================================================
-// computeLatencyStats Tests
-// ============================================================================
-
-describe('computeLatencyStats', () => {
-  test('returns correct stats for typical durations', () => {
-    const durations = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
-    const stats = computeLatencyStats(durations)
-
-    expect(stats.min).toBe(100)
-    expect(stats.max).toBe(1000)
-    expect(stats.mean).toBe(550)
-    expect(stats.p50).toBe(600)
-    expect(stats.p90).toBe(1000)
-  })
-
-  test('returns zeros for empty array', () => {
-    const stats = computeLatencyStats([])
-
-    expect(stats.p50).toBe(0)
-    expect(stats.p90).toBe(0)
-    expect(stats.p99).toBe(0)
-    expect(stats.mean).toBe(0)
-    expect(stats.min).toBe(0)
-    expect(stats.max).toBe(0)
-  })
-
-  test('handles single-element array', () => {
-    const stats = computeLatencyStats([42])
-
-    expect(stats.p50).toBe(42)
-    expect(stats.p90).toBe(42)
-    expect(stats.mean).toBe(42)
-    expect(stats.min).toBe(42)
-    expect(stats.max).toBe(42)
-  })
-
-  test('sorts unsorted input', () => {
-    const stats = computeLatencyStats([500, 100, 300, 200, 400])
-
-    expect(stats.min).toBe(100)
-    expect(stats.max).toBe(500)
-    expect(stats.mean).toBe(300)
-  })
-})
-
-// ============================================================================
-// computeScoreDistribution Tests
-// ============================================================================
-
-describe('computeScoreDistribution', () => {
-  test('distributes scores into correct buckets', () => {
-    const scores = [0.1, 0.3, 0.5, 0.7, 0.9]
-    const dist = computeScoreDistribution(scores)
-
-    expect(dist['0.0-0.2']).toBe(1)
-    expect(dist['0.2-0.4']).toBe(1)
-    expect(dist['0.4-0.6']).toBe(1)
-    expect(dist['0.6-0.8']).toBe(1)
-    expect(dist['0.8-1.0']).toBe(1)
-  })
-
-  test('handles empty scores array', () => {
-    const dist = computeScoreDistribution([])
-
-    expect(dist['0.0-0.2']).toBe(0)
-    expect(dist['0.2-0.4']).toBe(0)
-    expect(dist['0.4-0.6']).toBe(0)
-    expect(dist['0.6-0.8']).toBe(0)
-    expect(dist['0.8-1.0']).toBe(0)
-  })
-
-  test('handles boundary values correctly', () => {
-    // 0.0 → first bucket, 0.2 → second bucket (not first), 1.0 → last bucket
-    const scores = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
-    const dist = computeScoreDistribution(scores)
-
-    expect(dist['0.0-0.2']).toBe(1) // 0.0
-    expect(dist['0.2-0.4']).toBe(1) // 0.2
-    expect(dist['0.4-0.6']).toBe(1) // 0.4
-    expect(dist['0.6-0.8']).toBe(1) // 0.6
-    expect(dist['0.8-1.0']).toBe(2) // 0.8, 1.0
-  })
-})
diff --git a/src/pipeline/tests/pipeline.spec.ts b/src/pipeline/tests/pipeline.spec.ts
deleted file mode 100644
index 2a1fb30..0000000
--- a/src/pipeline/tests/pipeline.spec.ts
+++ /dev/null
@@ -1,356 +0,0 @@
-/**
- * Unit tests for pipeline commands.
- *
- * @remarks
- * Tests for the Unix-style pipeline commands:
- * - format: formatMarkdown, formatCsv helpers
- * - compare: parseLabeledRun helper
- * - type validation
- *
- * @packageDocumentation
- */
-
-import { describe, expect, test } from 'bun:test'
-import type {
-  ComparisonGraderInput,
-  ComparisonGraderResult,
-  ExtractedResult,
-  FormatStyle,
-  GradedResult,
-  LabeledRun,
-  RawOutput,
-} from '../pipeline.types.ts'
-
-// ============================================================================
-// Type Validation Tests
-// ============================================================================
-
-describe('RawOutput type', () => {
-  test('accepts valid raw output', () => {
-    const raw: RawOutput = {
-      id: 'test-001',
-      input: 'What is 2+2?',
-      rawLines: ['{"type":"message","content":"4"}'],
-      timing: {
-        start: 1000,
-        end: 2000,
-        total: 1000,
-      },
-    }
-    expect(raw.id).toBe('test-001')
-    expect(raw.timing.total).toBe(1000)
-  })
-
-  test('accepts array input for multi-turn', () => {
-    const raw: RawOutput = {
-      id: 'multi-001',
-      input: ['Hello', 'How are you?'],
-      rawLines: [],
-      timing: { start: 0, end: 100, total: 100 },
-    }
-    expect(Array.isArray(raw.input)).toBe(true)
-    expect((raw.input as string[]).length).toBe(2)
-  })
-
-  test('accepts optional hint', () => {
-    const raw: RawOutput = {
-      id: 'hint-001',
-      input: 'Calculate something',
-      hint: 'Expected: numeric answer',
-      rawLines: [],
-      timing: { start: 0, end: 0, total: 0 },
-    }
-    expect(raw.hint).toBe('Expected: numeric answer')
-  })
-
-  test('accepts optional error', () => {
-    const raw: RawOutput = {
-      id: 'error-001',
-      input: 'fail test',
-      rawLines: [],
-      timing: { start: 0, end: 100, total: 100 },
-      error: 'Timeout exceeded',
-    }
-    expect(raw.error).toBe('Timeout exceeded')
-  })
-})
-
-describe('ExtractedResult type', () => {
-  test('accepts valid extracted result', () => {
-    const extracted: ExtractedResult = {
-      id: 'test-001',
-      input: 'What is 2+2?',
-      output: '4',
-      trajectory: [
-        {
-          type: 'message',
-          content: '4',
-          timestamp: 100,
-        },
-      ],
-      toolErrors: false,
-      timing: { start: 0, end: 100, total: 100 },
-    }
-    expect(extracted.output).toBe('4')
-    expect(extracted.trajectory.length).toBe(1)
-    expect(extracted.toolErrors).toBe(false)
-  })
-
-  test('accepts thought and tool_call steps', () => {
-    const extracted: ExtractedResult = {
-      id: 'complex-001',
-      input: 'Create a file',
-      output: 'Done',
-      trajectory: [
-        { type: 'thought', content: 'I need to create a file', timestamp: 50 },
-        {
-          type: 'tool_call',
-          name: 'Write',
-          input: { path: '/tmp/test.txt', content: 'hello' },
-          status: 'completed',
-          timestamp: 200,
-        },
-        { type: 'message', content: 'Done', timestamp: 250 },
-      ],
-      toolErrors: false,
-      timing: { start: 0, end: 300, total: 300 },
-    }
-    expect(extracted.trajectory.length).toBe(3)
-    expect(extracted.trajectory[1]?.type).toBe('tool_call')
-  })
-})
-
-describe('GradedResult type', () => {
-  test('extends ExtractedResult with score', () => {
-    const graded: GradedResult = {
-      id: 'graded-001',
-      input: 'What is 2+2?',
-      output: '4',
-      trajectory: [],
-      toolErrors: false,
-      timing: { start: 0, end: 100, total: 100 },
-      score: {
-        pass: true,
-        score: 1.0,
-        reasoning: 'Correct answer',
-      },
-    }
-    expect(graded.score.pass).toBe(true)
-    expect(graded.score.score).toBe(1.0)
-    expect(graded.score.reasoning).toBe('Correct answer')
-  })
-
-  test('accepts failing score', () => {
-    const graded: GradedResult = {
-      id: 'fail-001',
-      input: 'What is 2+2?',
-      output: '5',
-      trajectory: [],
-      toolErrors: false,
-      timing: { start: 0, end: 100, total: 100 },
-      score: {
-        pass: false,
-        score: 0.0,
-        reasoning: 'Incorrect answer',
-      },
-    }
-    expect(graded.score.pass).toBe(false)
-    expect(graded.score.score).toBe(0.0)
-  })
-})
-
-describe('FormatStyle type', () => {
-  test('accepts valid format styles', () => {
-    const styles: FormatStyle[] = ['jsonl', 'markdown', 'csv']
-    expect(styles).toContain('jsonl')
-    expect(styles).toContain('markdown')
-    expect(styles).toContain('csv')
-  })
-})
-
-describe('LabeledRun type', () => {
-  test('accepts label and path', () => {
-    const run: LabeledRun = {
-      label: 'baseline',
-      path: './results/baseline.jsonl',
-    }
-    expect(run.label).toBe('baseline')
-    expect(run.path).toBe('./results/baseline.jsonl')
-  })
-})
-
-describe('ComparisonGraderInput type', () => {
-  test('accepts multiple runs', () => {
-    const input: ComparisonGraderInput = {
-      id: 'compare-001',
-      input: 'What is 2+2?',
-      runs: {
-        baseline: { output: '4' },
-        experiment: { output: 'Four', trajectory: [] },
-      },
-    }
-    expect(Object.keys(input.runs).length).toBe(2)
-    expect(input.runs.baseline?.output).toBe('4')
-    expect(input.runs.experiment?.trajectory).toEqual([])
-  })
-})
-
-describe('ComparisonGraderResult type', () => {
-  test('accepts rankings with reasoning', () => {
-    const result: ComparisonGraderResult = {
-      rankings: [
-        { run: 'baseline', rank: 1, score: 0.95 },
-        { run: 'experiment', rank: 2, score: 0.8 },
-      ],
-      reasoning: 'Baseline was more concise',
-    }
-    expect(result.rankings.length).toBe(2)
-    expect(result.rankings[0]?.rank).toBe(1)
-    expect(result.reasoning).toBeDefined()
-  })
-})
-
-// ============================================================================
-// Helper Function Tests (via import)
-// ============================================================================
-
-// Note: Some helper functions are not exported from the modules.
-// These tests verify the type contracts that the helpers must satisfy.
-
-describe('pipeline data flow', () => {
-  test('RawOutput can flow to ExtractedResult', () => {
-    const raw: RawOutput = {
-      id: 'flow-001',
-      input: 'test',
-      hint: 'expected: something',
-      rawLines: ['{"type":"message","content":"result"}'],
-      timing: { start: 0, end: 100, total: 100 },
-    }
-
-    // Simulate extraction
-    const extracted: ExtractedResult = {
-      id: raw.id,
-      input: raw.input,
-      hint: raw.hint,
-      output: 'result',
-      trajectory: [{ type: 'message', content: 'result', timestamp: 100 }],
-      toolErrors: false,
-      timing: raw.timing,
-    }
-
-    expect(extracted.id).toBe(raw.id)
-    expect(extracted.input).toBe(raw.input)
-    expect(extracted.hint).toBe(raw.hint)
-  })
-
-  test('ExtractedResult can flow to GradedResult', () => {
-    const extracted: ExtractedResult = {
-      id: 'grade-flow-001',
-      input: 'test',
-      output: 'result',
-      trajectory: [],
-      toolErrors: false,
-      timing: { start: 0, end: 100, total: 100 },
-    }
-
-    // Simulate grading
-    const graded: GradedResult = {
-      ...extracted,
-      score: { pass: true, score: 1.0 },
-    }
-
-    expect(graded.id).toBe(extracted.id)
-    expect(graded.score.pass).toBe(true)
-  })
-})
-
-describe('comparison data structures', () => {
-  test('LabeledRun derived from filename', () => {
-    // Simulate parseLabeledRun behavior
-    const path = '/path/to/results-baseline.jsonl'
-    const basename = path.split('/').pop() ?? ''
-    const label = basename.replace('.jsonl', '')
-
-    const run: LabeledRun = { label, path }
-    expect(run.label).toBe('results-baseline')
-  })
-
-  test('LabeledRun with explicit label', () => {
-    // Simulate explicit label:path format
-    const arg = 'my-baseline:/path/to/results.jsonl'
-    const colonIdx = arg.indexOf(':')
-    const label = arg.slice(0, colonIdx)
-    const path = arg.slice(colonIdx + 1)
-
-    const run: LabeledRun = { label, path }
-    expect(run.label).toBe('my-baseline')
-    expect(run.path).toBe('/path/to/results.jsonl')
-  })
-
-  test('comparison aggregates results by prompt ID', () => {
-    const results1 = [
-      { id: 'p1', output: 'a' },
-      { id: 'p2', output: 'b' },
-    ]
-    const results2 = [
-      { id: 'p1', output: 'x' },
-      { id: 'p2', output: 'y' },
-    ]
-
-    // Simulate comparison aggregation
-    const promptIds = new Set([...results1.map((r) => r.id), ...results2.map((r) => r.id)])
-    expect(promptIds.size).toBe(2)
-
-    const comparisonInput: ComparisonGraderInput = {
-      id: 'p1',
-      input: 'test prompt',
-      runs: {
-        run1: { output: results1.find((r) => r.id === 'p1')?.output ?? '' },
-        run2: { output: results2.find((r) => r.id === 'p1')?.output ?? '' },
-      },
-    }
-    expect(comparisonInput.runs.run1?.output).toBe('a')
-    expect(comparisonInput.runs.run2?.output).toBe('x')
-  })
-})
-
-describe('format style contracts', () => {
-  test('markdown format includes summary when graded', () => {
-    // Verify the type contract for markdown formatting
-    const gradedResults: GradedResult[] = [
-      {
-        id: 't1',
-        input: 'a',
-        output: 'x',
-        trajectory: [],
-        toolErrors: false,
-        timing: { start: 0, end: 100, total: 100 },
-        score: { pass: true, score: 1.0 },
-      },
-      {
-        id: 't2',
-        input: 'b',
-        output: 'y',
-        trajectory: [],
-        toolErrors: false,
-        timing: { start: 0, end: 100, total: 100 },
-        score: { pass: false, score: 0.5 },
-      },
-    ]
-
-    const passed = gradedResults.filter((r) => r.score.pass).length
-    const total = gradedResults.length
-    const passRate = passed / total
-
-    expect(passRate).toBe(0.5)
-  })
-
-  test('csv format escapes special characters', () => {
-    // Test CSV escaping contract
-    const escapeCsv = (str: string) => `"${str.replace(/"/g, '""').replace(/\n/g, '\\n')}"`
-
-    expect(escapeCsv('hello')).toBe('"hello"')
-    expect(escapeCsv('say "hello"')).toBe('"say ""hello"""')
-    expect(escapeCsv('line1\nline2')).toBe('"line1\\nline2"')
-  })
-})
diff --git a/src/schemas.ts b/src/schemas.ts
deleted file mode 100644
index deac478..0000000
--- a/src/schemas.ts
+++ /dev/null
@@ -1,134 +0,0 @@
-/**
- * Schemas and types for agent evaluation harness.
- *
- * @remarks
- * Re-exports all Zod schemas and inferred types for capture results,
- * trajectories, grader results, and CLI data structures.
- *
- * @packageDocumentation
- */
-
-// Constants
-export {
-  DEFAULT_CALIBRATION_SAMPLE_SIZE,
-  DEFAULT_HARNESS_TIMEOUT,
-  DEFAULT_TRIAL_COUNT,
-  HEAD_LINES,
-  MAX_CONTENT_LENGTH,
-  TAIL_LINES,
-} from './schemas/constants.ts'
-// Grader loader
-export { loadGrader, loadGraderOrExit } from './schemas/grader-loader.ts'
-// Core session types
-// JSON-RPC types (MCP compatibility)
-// MCP server configuration
-// Prompt and grading
-// Trajectory types
-// Timing and richness
-// Result types
-export {
-  type BalanceAnalysis,
-  BalanceAnalysisSchema,
-  type CalibrationSample,
-  CalibrationSampleSchema,
-  type CaptureResult,
-  CaptureResultSchema,
-  type CategoryDistribution,
-  CategoryDistributionSchema,
-  // Comparison report types
-  type ComparisonMeta,
-  ComparisonMetaSchema,
-  type ComparisonReport,
-  ComparisonReportSchema,
-  EnvVariableSchema,
-  type Grader,
-  type GraderResult,
-  GraderResultSchema,
-  type HeadToHead,
-  HeadToHeadSchema,
-  HttpHeaderSchema,
-  type IndexedStep,
-  type JsonRpcError,
-  type JsonRpcErrorResponse,
-  JsonRpcErrorResponseSchema,
-  JsonRpcErrorSchema,
-  type JsonRpcMessage,
-  JsonRpcMessageSchema,
-  type JsonRpcNotification,
-  JsonRpcNotificationSchema,
-  type JsonRpcRequest,
-  JsonRpcRequestSchema,
-  type JsonRpcResponse,
-  JsonRpcResponseSchema,
-  type JsonRpcSuccessResponse,
-  JsonRpcSuccessResponseSchema,
-  type LatencyStats,
-  LatencyStatsSchema,
-  type McpServerConfig,
-  McpServerHttpSchema,
-  McpServerSchema,
-  McpServerStdioSchema,
-  MessageStepSchema,
-  type PairwiseComparison,
-  PairwiseComparisonSchema,
-  type PerformanceMetrics,
-  PerformanceMetricsSchema,
-  PlanStepSchema,
-  type PromptCase,
-  PromptCaseSchema,
-  type PromptComparison,
-  PromptComparisonSchema,
-  type QualityMetrics,
-  QualityMetricsSchema,
-  type ReliabilityMetrics,
-  ReliabilityMetricsSchema,
-  type ScoreDistribution,
-  ScoreDistributionSchema,
-  type Session,
-  SessionSchema,
-  type SummaryResult,
-  SummaryResultSchema,
-  ThoughtStepSchema,
-  type Timing,
-  TimingSchema,
-  ToolCallStepSchema,
-  type ToolInput,
-  ToolInputSchema,
-  type TrajectoryInfo,
-  TrajectoryInfoSchema,
-  type TrajectoryRichness,
-  TrajectoryRichnessSchema,
-  type TrajectoryStep,
-  TrajectoryStepSchema,
-  type TrialEntry,
-  TrialEntrySchema,
-  type TrialResult,
-  TrialResultSchema,
-  // Trials comparison report types
-  type TrialsCapabilityMetrics,
-  TrialsCapabilityMetricsSchema,
-  type TrialsComparisonMeta,
-  TrialsComparisonMetaSchema,
-  type TrialsComparisonReport,
-  TrialsComparisonReportSchema,
-  type TrialsFlakinessMetrics,
-  TrialsFlakinessMetricsSchema,
-  type TrialsPerformanceConfidenceIntervals,
-  TrialsPerformanceConfidenceIntervalsSchema,
-  type TrialsPerformanceMetrics,
-  TrialsPerformanceMetricsSchema,
-  type TrialsPromptComparison,
-  TrialsPromptComparisonSchema,
-  type TrialsQualityConfidenceIntervals,
-  TrialsQualityConfidenceIntervalsSchema,
-  type TrialsQualityMetrics,
-  TrialsQualityMetricsSchema,
-  type TrialsReliabilityMetrics,
-  TrialsReliabilityMetricsSchema,
-  type ValidationResult,
-  ValidationResultSchema,
-} from './schemas/schemas.ts'
-
-// Schemas CLI
-export type { SchemasConfig } from './schemas/schemas-cli.ts'
-export { runSchemas, schemasCli } from './schemas/schemas-cli.ts'
diff --git a/src/schemas/constants.ts b/src/schemas/constants.ts
deleted file mode 100644
index e7dbd2f..0000000
--- a/src/schemas/constants.ts
+++ /dev/null
@@ -1,94 +0,0 @@
-/**
- * Constants for harness and JSON-RPC protocol operations.
- *
- * @remarks
- * Contains all constant values used across the implementation:
- * - JSON-RPC method names and protocol version
- * - JSON-RPC error codes
- * - Harness defaults (timeouts, preview limits)
- *
- * @packageDocumentation
- */
-
-// ============================================================================
-// JSON-RPC Protocol Methods
-// ============================================================================
-
-/** JSON-RPC method names for headless adapter protocol */
-export const PROTOCOL_METHODS = {
-  // Lifecycle
-  INITIALIZE: 'initialize',
-  SHUTDOWN: 'shutdown',
-
-  // Sessions
-  CREATE_SESSION: 'session/new',
-  LOAD_SESSION: 'session/load',
-  PROMPT: 'session/prompt',
-  CANCEL: 'session/cancel',
-  UPDATE: 'session/update',
-  REQUEST_PERMISSION: 'session/request_permission',
-  SET_MODEL: 'session/set_model',
-
-  // Protocol-level
-  CANCEL_REQUEST: '$/cancel_request',
-} as const
-
-// ============================================================================
-// Protocol Version
-// ============================================================================
-
-/** Current protocol version */
-export const PROTOCOL_VERSION = 1 as const
-
-// ============================================================================
-// JSON-RPC Error Codes
-// ============================================================================
-
-/** Standard JSON-RPC error codes */
-export const JSON_RPC_ERRORS = {
-  PARSE_ERROR: -32700,
-  INVALID_REQUEST: -32600,
-  METHOD_NOT_FOUND: -32601,
-  INVALID_PARAMS: -32602,
-  INTERNAL_ERROR: -32603,
-  REQUEST_CANCELLED: -32800,
-} as const
-
-// ============================================================================
-// Client Defaults
-// ============================================================================
-
-/** Default client name for protocol handshake */
-export const DEFAULT_CLIENT_NAME = 'plaited-eval-harness'
-
-/** Default timeout for protocol operations in milliseconds */
-export const DEFAULT_PROTOCOL_TIMEOUT = 30000
-
-/** Default polling interval for streaming updates in milliseconds */
-export const DEFAULT_POLLING_INTERVAL = 50
-
-// ============================================================================
-// Harness Preview Configuration
-// ============================================================================
-
-/** Number of lines to show at the head of content previews */
-export const HEAD_LINES = 8
-
-/** Number of lines to show at the tail of content previews */
-export const TAIL_LINES = 4
-
-/** Maximum content length before applying head/tail preview */
-export const MAX_CONTENT_LENGTH = 500
-
-// ============================================================================
-// Harness Defaults
-// ============================================================================
-
-/** Default timeout for prompt evaluation in milliseconds */
-export const DEFAULT_HARNESS_TIMEOUT = 60000
-
-/** Default number of trials for pass@k analysis */
-export const DEFAULT_TRIAL_COUNT = 5
-
-/** Default sample size for calibration */
-export const DEFAULT_CALIBRATION_SAMPLE_SIZE = 10
diff --git a/src/schemas/grader-loader.ts b/src/schemas/grader-loader.ts
deleted file mode 100644
index e2b4175..0000000
--- a/src/schemas/grader-loader.ts
+++ /dev/null
@@ -1,203 +0,0 @@
-/**
- * Polyglot grader loader module.
- *
- * @remarks
- * Supports loading graders from:
- * - TypeScript/JavaScript modules (import as ES module)
- * - Executable scripts (Python, Ruby, shell, etc. via subprocess)
- *
- * Executable graders use stdin/stdout JSON protocol:
- * - Input: `{"input": "...", "output": "...", "expected": "...", "trajectory": [...]}`
- * - Output: `{"pass": true, "score": 1.0, "reasoning": "..."}`
- *
- * @packageDocumentation
- */
-
-import { resolvePath } from '../core.ts'
-import type { Grader, TrajectoryStep } from './schemas.ts'
-import { GraderResultSchema } from './schemas.ts'
-
-// ============================================================================
-// Constants
-// ============================================================================
-
-/** File extensions that are imported as ES modules */
-const JS_EXTENSIONS = ['.ts', '.js', '.mjs', '.cjs']
-
-// ============================================================================
-// Helpers
-// ============================================================================
-
-/** Check if a file path is a JavaScript/TypeScript module */
-const isJsModule = (path: string): boolean => JS_EXTENSIONS.some((ext) => path.endsWith(ext))
-
-// ============================================================================
-// Executable Grader
-// ============================================================================
-
-/**
- * Input format for executable graders (stdin JSON).
- *
- * @remarks
- * The metadata field contains arbitrary key-value pairs from the original
- * prompt JSONL (e.g., category, difficulty, tags). Use this to implement
- * category-specific grading logic or filter calibration samples.
- * The cwd field provides the working directory path for git-based outcome detection.
- */
-type ExecGraderInput = {
-  input: string | string[]
-  output: string
-  hint?: string
-  trajectory?: TrajectoryStep[]
-  metadata?: Record<string, unknown>
-  cwd?: string
-}
-
-/**
- * Create a grader function that executes an external script.
- *
- * @remarks
- * The script receives JSON on stdin and must output JSON on stdout.
- * Non-zero exit codes are treated as errors.
- *
- * @param execPath - Absolute path to the executable script
- * @returns Grader function
- */
-const createExecGrader = (execPath: string): Grader => {
-  return async (params) => {
-    const input: ExecGraderInput = {
-      input: params.input,
-      output: params.output,
-      hint: params.hint,
-      trajectory: params.trajectory,
-      metadata: params.metadata,
-      cwd: params.cwd,
-    }
-
-    const inputJson = JSON.stringify(input)
-
-    const proc = Bun.spawn([execPath], {
-      stdin: new TextEncoder().encode(inputJson),
-      stdout: 'pipe',
-      stderr: 'pipe',
-    })
-
-    const [stdout, stderr, exitCode] = await Promise.all([
-      new Response(proc.stdout).text(),
-      new Response(proc.stderr).text(),
-      proc.exited,
-    ])
-
-    if (exitCode !== 0) {
-      throw new Error(`Grader exited with code ${exitCode}: ${stderr.trim() || 'No error output'}`)
-    }
-
-    const trimmedStdout = stdout.trim()
-    if (!trimmedStdout) {
-      throw new Error('Grader produced no output')
-    }
-
-    let parsed: unknown
-    try {
-      parsed = JSON.parse(trimmedStdout)
-    } catch {
-      throw new Error(`Grader output is not valid JSON: ${trimmedStdout.slice(0, 100)}`)
-    }
-
-    const result = GraderResultSchema.safeParse(parsed)
-    if (!result.success) {
-      throw new Error(`Invalid grader result: ${result.error.message}`)
-    }
-
-    return result.data
-  }
-}
-
-// ============================================================================
-// Module Grader
-// ============================================================================
-
-/**
- * Load a grader from a JavaScript/TypeScript module.
- *
- * @remarks
- * The module must export a `grade` function matching the `Grader` type.
- *
- * @param modulePath - Absolute path to the module
- * @returns Grader function
- */
-const loadModuleGrader = async (modulePath: string): Promise<Grader> => {
-  const graderModule = await import(modulePath)
-
-  if (typeof graderModule.grade !== 'function') {
-    throw new Error(`Grader module must export a 'grade' function`)
-  }
-
-  return graderModule.grade as Grader
-}
-
-// ============================================================================
-// Public API
-// ============================================================================
-
-/**
- * Load a grader from a file path.
- *
- * @remarks
- * Detection logic:
- * - `.ts`, `.js`, `.mjs`, `.cjs` → Import as ES module
- * - Everything else → Execute as subprocess
- *
- * @param graderPath - Path to the grader (relative or absolute)
- * @returns Grader function
- * @throws Error if grader not found or invalid
- *
- * @example
- * ```typescript
- * // TypeScript grader
- * const grader = await loadGrader('./grader.ts')
- *
- * // Python grader
- * const grader = await loadGrader('./grader.py')
- *
- * // Any executable
- * const grader = await loadGrader('./my-grader')
- * ```
- */
-/**
- * Load a grader from a file path, exiting on failure.
- *
- * @remarks
- * CLI-friendly wrapper around `loadGrader` that prints the error to stderr
- * and calls `process.exit(1)` on failure. Eliminates the duplicated
- * try/catch pattern across CLI handlers.
- *
- * @param graderPath - Path to the grader (relative or absolute)
- * @returns Grader function (never returns on failure)
- *
- * @public
- */
-export const loadGraderOrExit = async (graderPath: string): Promise<Grader> => {
-  try {
-    return await loadGrader(graderPath)
-  } catch (error) {
-    console.error(`Error: ${error instanceof Error ? error.message : error}`)
-    process.exit(1)
-  }
-}
-
-export const loadGrader = async (graderPath: string): Promise<Grader> => {
-  const resolvedPath = resolvePath(graderPath)
-
-  // Check file exists
-  const file = Bun.file(resolvedPath)
-  if (!(await file.exists())) {
-    throw new Error(`Grader not found: ${resolvedPath}`)
-  }
-
-  if (isJsModule(resolvedPath)) {
-    return loadModuleGrader(resolvedPath)
-  }
-
-  return createExecGrader(resolvedPath)
-}
diff --git a/src/schemas/schemas-cli.ts b/src/schemas/schemas-cli.ts
deleted file mode 100644
index 7b3e66a..0000000
--- a/src/schemas/schemas-cli.ts
+++ /dev/null
@@ -1,227 +0,0 @@
-/**
- * Schemas command - export JSON schemas for non-TypeScript users.
- *
- * @remarks
- * Uses Zod 4's native `z.toJSONSchema()` to generate JSON Schema from
- * the harness schemas. Useful for validation in other languages/tools.
- *
- * @packageDocumentation
- */
-
-import { parseArgs } from 'node:util'
-import { z } from 'zod'
-import { resolvePath } from '../core.ts'
-import * as schemas from './schemas.ts'
-
-// ============================================================================
-// Schema Registry
-// ============================================================================
-
-/** Available schemas for export */
-const SCHEMA_REGISTRY: Record<string, z.ZodSchema> = {
-  PromptCase: schemas.PromptCaseSchema,
-  GraderResult: schemas.GraderResultSchema,
-  TrajectoryStep: schemas.TrajectoryStepSchema,
-  CaptureResult: schemas.CaptureResultSchema,
-  SummaryResult: schemas.SummaryResultSchema,
-  TrialEntry: schemas.TrialEntrySchema,
-  TrialResult: schemas.TrialResultSchema,
-  CalibrationSample: schemas.CalibrationSampleSchema,
-  BalanceAnalysis: schemas.BalanceAnalysisSchema,
-  ValidationResult: schemas.ValidationResultSchema,
-  McpServerConfig: schemas.McpServerSchema,
-  Session: schemas.SessionSchema,
-  JsonRpcRequest: schemas.JsonRpcRequestSchema,
-  JsonRpcResponse: schemas.JsonRpcResponseSchema,
-  JsonRpcError: schemas.JsonRpcErrorSchema,
-}
-
-// ============================================================================
-// Types
-// ============================================================================
-
-/** Configuration for schemas command */
-export type SchemasConfig = {
-  /** Specific schema name to export (undefined = all) */
-  schemaName?: string
-  /** Output file path */
-  outputPath?: string
-  /** Output as JSON (vs list) */
-  json?: boolean
-  /** Split into separate files */
-  split?: boolean
-  /** List available schemas */
-  list?: boolean
-}
-
-// ============================================================================
-// Helpers
-// ============================================================================
-
-/** Generate JSON Schema from Zod schema */
-const toJsonSchema = (schema: z.ZodSchema, name: string): object => {
-  try {
-    // Zod 4's native JSON Schema generation
-    const jsonSchema = z.toJSONSchema(schema)
-    return {
-      $schema: 'https://json-schema.org/draft/2020-12/schema',
-      title: name,
-      ...jsonSchema,
-    }
-  } catch (error) {
-    // Fallback for schemas that can't be converted
-    return {
-      $schema: 'https://json-schema.org/draft/2020-12/schema',
-      title: name,
-      description: `Schema for ${name} (auto-generation failed: ${error instanceof Error ? error.message : 'unknown error'})`,
-    }
-  }
-}
-
-// ============================================================================
-// Schemas Implementation
-// ============================================================================
-
-/**
- * Execute schemas command with configuration object.
- *
- * @param config - Schemas configuration
- * @returns Generated JSON schemas
- */
-export const runSchemas = async (config: SchemasConfig): Promise<Record<string, object> | string[]> => {
-  const { schemaName, outputPath, json = false, split = false, list = false } = config
-
-  // List mode
-  if (list) {
-    const names = Object.keys(SCHEMA_REGISTRY)
-    console.log('Available schemas:')
-    for (const name of names) {
-      console.log(`  - ${name}`)
-    }
-    return names
-  }
-
-  // Single schema mode
-  if (schemaName) {
-    const schema = SCHEMA_REGISTRY[schemaName]
-    if (!schema) {
-      console.error(`Error: Unknown schema '${schemaName}'`)
-      console.error(`Available: ${Object.keys(SCHEMA_REGISTRY).join(', ')}`)
-      process.exit(1)
-    }
-
-    const jsonSchema = toJsonSchema(schema, schemaName)
-    const output = JSON.stringify(jsonSchema, null, 2)
-
-    if (outputPath) {
-      await Bun.write(resolvePath(outputPath), output)
-    } else {
-      console.log(output)
-    }
-
-    return { [schemaName]: jsonSchema }
-  }
-
-  // All schemas mode
-  const allSchemas: Record<string, object> = {}
-
-  for (const [name, schema] of Object.entries(SCHEMA_REGISTRY)) {
-    allSchemas[name] = toJsonSchema(schema, name)
-  }
-
-  if (split && outputPath) {
-    // Create directory and write separate files
-    const dir = resolvePath(outputPath)
-    await Bun.$`mkdir -p ${dir}`
-
-    for (const [name, jsonSchema] of Object.entries(allSchemas)) {
-      const filePath = `${dir}/${name}.json`
-      await Bun.write(filePath, JSON.stringify(jsonSchema, null, 2))
-    }
-
-    console.error(`Wrote ${Object.keys(allSchemas).length} schema files to ${dir}/`)
-  } else if (json) {
-    const output = JSON.stringify(allSchemas, null, 2)
-
-    if (outputPath) {
-      await Bun.write(resolvePath(outputPath), output)
-    } else {
-      console.log(output)
-    }
-  } else {
-    // Default: list schemas
-    console.log('Available schemas (use --json to export):')
-    for (const name of Object.keys(allSchemas)) {
-      console.log(`  - ${name}`)
-    }
-  }
-
-  return allSchemas
-}
-
-// ============================================================================
-// CLI Entry Point
-// ============================================================================
-
-/**
- * Schemas command CLI handler.
- *
- * @param args - Command line arguments (after 'schemas')
- */
-export const schemasCli = async (args: string[]): Promise<void> => {
-  const { values, positionals } = parseArgs({
-    args,
-    options: {
-      output: { type: 'string', short: 'o' },
-      json: { type: 'boolean', short: 'j', default: false },
-      split: { type: 'boolean', short: 's', default: false },
-      list: { type: 'boolean', short: 'l', default: false },
-      help: { type: 'boolean', short: 'h' },
-    },
-    allowPositionals: true,
-  })
-
-  if (values.help) {
-    console.log(`
-Usage: agent-eval-harness schemas [schema-name] [options]
-
-Arguments:
-  schema-name       Specific schema to export (optional)
-
-Options:
-  -o, --output      Output file or directory (with --split)
-  -j, --json        Export as JSON (default: list names)
-  -s, --split       Split into separate files (requires --output dir)
-  -l, --list        List available schemas
-  -h, --help        Show this help message
-
-Available Schemas:
-  PromptCase, GraderResult, TrajectoryStep, CaptureResult, SummaryResult,
-  TrialEntry, TrialResult, CalibrationSample, BalanceAnalysis, ValidationResult,
-  McpServerConfig, Session, JsonRpcRequest, JsonRpcResponse, JsonRpcError
-
-Examples:
-  # List available schemas
-  agent-eval-harness schemas --list
-
-  # Export all schemas as single JSON file
-  agent-eval-harness schemas --json -o schemas.json
-
-  # Export specific schema
-  agent-eval-harness schemas CaptureResult --json
-  agent-eval-harness schemas TrialResult --json -o trial-schema.json
-
-  # Export all schemas as separate files
-  agent-eval-harness schemas --json --split -o schemas/
-`)
-    return
-  }
-
-  await runSchemas({
-    schemaName: positionals[0],
-    outputPath: values.output,
-    json: values.json ?? false,
-    split: values.split ?? false,
-    list: values.list ?? false,
-  })
-}
diff --git a/src/schemas/schemas.ts b/src/schemas/schemas.ts
deleted file mode 100644
index e084159..0000000
--- a/src/schemas/schemas.ts
+++ /dev/null
@@ -1,1073 +0,0 @@
-/**
- * Unified Zod schemas and types for the agent eval harness.
- *
- * @remarks
- * This module follows a schema-first approach where Zod schemas are the
- * single source of truth. TypeScript types are derived using `z.infer<>`.
- *
- * **Exports:**
- * - Harness schemas: PromptCaseSchema, GraderResultSchema, CaptureResultSchema, etc.
- * - JSON-RPC schemas: JsonRpcRequestSchema, JsonRpcResponseSchema, etc. (for headless adapter)
- * - All inferred types via `z.infer<>`
- *
- * **JSON Schema generation (Zod 4):**
- * ```typescript
- * import { z } from 'zod'
- * import { CaptureResultSchema } from '@plaited/agent-eval-harness/schemas'
- * const jsonSchema = z.toJSONSchema(CaptureResultSchema)
- * ```
- *
- * @packageDocumentation
- */
-
-import { z } from 'zod'
-
-// ============================================================================
-// Session Types
-// ============================================================================
-
-/**
- * Session schema for session creation responses.
- */
-export const SessionSchema = z.object({
-  id: z.string(),
-  _meta: z.record(z.string(), z.unknown()).nullish(),
-})
-
-/** Session object returned from session creation */
-export type Session = z.infer<typeof SessionSchema>
-
-// ============================================================================
-// JSON-RPC 2.0 Schemas (for headless adapter)
-// ============================================================================
-
-/** JSON-RPC version literal */
-const JsonRpcVersionSchema = z.literal('2.0')
-
-/** Request/response identifier */
-const RequestIdSchema = z.union([z.string(), z.number()])
-
-/**
- * JSON-RPC 2.0 error object schema.
- *
- * @remarks
- * Standard error codes:
- * - `-32700`: Parse error
- * - `-32600`: Invalid request
- * - `-32601`: Method not found
- * - `-32602`: Invalid params
- * - `-32603`: Internal error
- */
-export const JsonRpcErrorSchema = z.object({
-  code: z.number(),
-  message: z.string(),
-  data: z.unknown().optional(),
-})
-
-/** JSON-RPC 2.0 error object */
-export type JsonRpcError = z.infer<typeof JsonRpcErrorSchema>
-
-/** JSON-RPC 2.0 request schema */
-export const JsonRpcRequestSchema = z.object({
-  jsonrpc: JsonRpcVersionSchema,
-  id: RequestIdSchema,
-  method: z.string(),
-  params: z.unknown().optional(),
-})
-
-/** JSON-RPC 2.0 request structure */
-export type JsonRpcRequest<T = unknown> = Omit<z.infer<typeof JsonRpcRequestSchema>, 'params'> & {
-  params?: T
-}
-
-/** JSON-RPC 2.0 notification schema (no id, no response expected) */
-export const JsonRpcNotificationSchema = z.object({
-  jsonrpc: JsonRpcVersionSchema,
-  method: z.string(),
-  params: z.unknown().optional(),
-})
-
-/** JSON-RPC 2.0 notification structure (no id, no response expected) */
-export type JsonRpcNotification<T = unknown> = Omit<z.infer<typeof JsonRpcNotificationSchema>, 'params'> & {
-  params?: T
-}
-
-/** JSON-RPC 2.0 success response schema */
-export const JsonRpcSuccessResponseSchema = z.object({
-  jsonrpc: JsonRpcVersionSchema,
-  id: RequestIdSchema,
-  result: z.unknown(),
-})
-
-/** JSON-RPC 2.0 success response */
-export type JsonRpcSuccessResponse<T = unknown> = Omit<z.infer<typeof JsonRpcSuccessResponseSchema>, 'result'> & {
-  result: T
-}
-
-/** JSON-RPC 2.0 error response schema */
-export const JsonRpcErrorResponseSchema = z.object({
-  jsonrpc: JsonRpcVersionSchema,
-  id: z.union([RequestIdSchema, z.null()]),
-  error: JsonRpcErrorSchema,
-})
-
-/** JSON-RPC 2.0 error response */
-export type JsonRpcErrorResponse = z.infer<typeof JsonRpcErrorResponseSchema>
-
-/** Union of all JSON-RPC response types */
-export const JsonRpcResponseSchema = z.union([JsonRpcSuccessResponseSchema, JsonRpcErrorResponseSchema])
-
-/** Union of all JSON-RPC response types */
-export type JsonRpcResponse<T = unknown> = JsonRpcSuccessResponse<T> | JsonRpcErrorResponse
-
-/**
- * Union of all JSON-RPC message types.
- *
- * @remarks
- * Use `safeParse` at transport boundaries for runtime validation.
- */
-export const JsonRpcMessageSchema = z.union([JsonRpcRequestSchema, JsonRpcNotificationSchema, JsonRpcResponseSchema])
-
-/** Union of all JSON-RPC message types */
-export type JsonRpcMessage<T = unknown> = JsonRpcRequest<T> | JsonRpcNotification<T> | JsonRpcResponse<T>
-
-// ============================================================================
-// MCP Server Configuration Schemas
-// ============================================================================
-
-/** Environment variable configuration */
-export const EnvVariableSchema = z.object({
-  name: z.string(),
-  value: z.string(),
-})
-
-/** HTTP header configuration */
-export const HttpHeaderSchema = z.object({
-  name: z.string(),
-  value: z.string(),
-})
-
-/** MCP server stdio transport configuration */
-export const McpServerStdioSchema = z.object({
-  type: z.literal('stdio').optional(),
-  name: z.string(),
-  command: z.string(),
-  args: z.array(z.string()),
-  env: z.array(EnvVariableSchema),
-})
-
-/** MCP server HTTP transport configuration */
-export const McpServerHttpSchema = z.object({
-  type: z.literal('http'),
-  name: z.string(),
-  url: z.string(),
-  headers: z.array(HttpHeaderSchema),
-})
-
-/** MCP server configuration (stdio or HTTP) */
-export const McpServerSchema = z.union([McpServerStdioSchema, McpServerHttpSchema])
-
-/** MCP server configuration type */
-export type McpServerConfig = z.infer<typeof McpServerSchema>
-
-// ============================================================================
-// Harness Input Schemas
-// ============================================================================
-
-/**
- * Prompt case schema for evaluation inputs.
- *
- * @remarks
- * Each line in a prompts.jsonl file should match this schema.
- * - Single turn: `input: "Hello"` - one prompt, one session
- * - Multi-turn: `input: ["Hello", "How are you?", "Goodbye"]` - sequential turns in one session
- */
-export const PromptCaseSchema = z.object({
-  /** Unique identifier for the test case */
-  id: z.string(),
-  /** Prompt text(s) - string for single turn, array for multi-turn conversation */
-  input: z.union([z.string(), z.array(z.string())]),
-  /** Optional grader context hint (not a strict expected match) */
-  hint: z.string().optional(),
-  /** Optional reference solution for validation */
-  reference: z.string().optional(),
-  /** Optional metadata for categorization and analysis */
-  metadata: z.record(z.string(), z.unknown()).optional(),
-  /** Optional per-case timeout override in milliseconds */
-  timeout: z.number().optional(),
-})
-
-/** Prompt case type */
-export type PromptCase = z.infer<typeof PromptCaseSchema>
-
-// ============================================================================
-// Grader Schemas
-// ============================================================================
-
-/**
- * Grader result schema.
- *
- * @remarks
- * Result returned by user-provided grader functions.
- * - `outcome`: Optional structured outcome data detected by the grader
- */
-export const GraderResultSchema = z.object({
-  /** Whether the output passes the evaluation criteria */
-  pass: z.boolean(),
-  /** Numeric score from 0.0 to 1.0 */
-  score: z.number().min(0).max(1),
-  /** Optional explanation for the score */
-  reasoning: z.string().optional(),
-  /** Optional outcome data (e.g., files created, tests passed) */
-  outcome: z.record(z.string(), z.unknown()).optional(),
-})
-
-/** Grader result type */
-export type GraderResult = z.infer<typeof GraderResultSchema>
-
-/**
- * Grader function type.
- *
- * @remarks
- * User-provided graders implement this interface to score agent outputs.
- * - `input` is the original prompt (string or array for multi-turn)
- * - `hint` provides grader context (renamed from `expected`)
- * - `metadata` contains arbitrary key-value pairs from the original prompt JSONL
- * - `cwd` is the working directory path (optional, enables git-based outcome detection)
- */
-export type Grader = (params: {
-  input: string | string[]
-  output: string
-  hint?: string
-  trajectory?: TrajectoryStep[]
-  metadata?: Record<string, unknown>
-  cwd?: string
-}) => Promise<GraderResult>
-
-// ============================================================================
-// Trajectory Schemas
-// ============================================================================
-
-/** Tool input schema for extracting file paths and content */
-export const ToolInputSchema = z
-  .object({
-    file_path: z.string().optional(),
-    path: z.string().optional(),
-    content: z.string().optional(),
-    new_string: z.string().optional(),
-  })
-  .passthrough()
-
-/** Tool input type */
-export type ToolInput = z.infer<typeof ToolInputSchema>
-
-/** Thought trajectory step */
-export const ThoughtStepSchema = z.object({
-  type: z.literal('thought'),
-  content: z.string(),
-  timestamp: z.number(),
-  stepId: z.string().optional(),
-})
-
-/** Message trajectory step */
-export const MessageStepSchema = z.object({
-  type: z.literal('message'),
-  content: z.string(),
-  timestamp: z.number(),
-  stepId: z.string().optional(),
-})
-
-/** Tool call trajectory step */
-export const ToolCallStepSchema = z.object({
-  type: z.literal('tool_call'),
-  name: z.string(),
-  status: z.string(),
-  input: z.unknown().optional(),
-  output: z.unknown().optional(),
-  duration: z.number().optional(),
-  timestamp: z.number(),
-  stepId: z.string().optional(),
-})
-
-/** Plan trajectory step */
-export const PlanStepSchema = z.object({
-  type: z.literal('plan'),
-  entries: z.array(z.unknown()),
-  timestamp: z.number(),
-  stepId: z.string().optional(),
-})
-
-/**
- * Trajectory step schema (discriminated union).
- *
- * @remarks
- * Represents a single step in the agent's execution trajectory.
- */
-export const TrajectoryStepSchema = z.discriminatedUnion('type', [
-  ThoughtStepSchema,
-  MessageStepSchema,
-  ToolCallStepSchema,
-  PlanStepSchema,
-])
-
-/** Trajectory step type */
-export type TrajectoryStep = z.infer<typeof TrajectoryStepSchema>
-
-/** Indexed trajectory step with unique ID for correlation */
-export type IndexedStep = TrajectoryStep & { stepId: string }
-
-// ============================================================================
-// Capture Result Schemas
-// ============================================================================
-
-/**
- * Timing information for a capture result.
- *
- * @remarks
- * Captures both absolute timestamps and derived durations for analysis:
- * - `sessionCreation`: Time to initialize session (agent startup overhead)
- * - `total`: End-to-end duration including all turns
- * - `firstResponse`: Latency to first agent output (optional)
- *
- * Token counts are adapter-dependent and only present if the adapter
- * exposes usage information (e.g., Claude Code includes them, others may not).
- *
- * @public
- */
-export const TimingSchema = z.object({
-  /** Epoch timestamp when capture started */
-  start: z.number(),
-  /** Epoch timestamp when capture ended */
-  end: z.number(),
-  /** Time to first response (ms from start) */
-  firstResponse: z.number().optional(),
-  /** Time to create session (ms) - measures agent initialization overhead */
-  sessionCreation: z.number(),
-  /** Total duration (end - start) in milliseconds */
-  total: z.number(),
-  /** Input tokens consumed (if available from headless adapter) */
-  inputTokens: z.number().optional(),
-  /** Output tokens generated (if available from headless adapter) */
-  outputTokens: z.number().optional(),
-})
-
-/**
- * Timing information type inferred from TimingSchema.
- *
- * @public
- */
-export type Timing = z.infer<typeof TimingSchema>
-
-/**
- * Trajectory richness level indicating the depth of captured agent activity.
- *
- * @remarks
- * Different adapters provide varying levels of detail:
- * - `full`: Thoughts, tool calls, plans (e.g., Claude Code adapter)
- * - `minimal`: Basic output only (e.g., Droid adapter)
- * - `messages-only`: Messages without internal reasoning
- */
-export const TrajectoryRichnessSchema = z.enum(['full', 'minimal', 'messages-only'])
-
-/** Trajectory richness type */
-export type TrajectoryRichness = z.infer<typeof TrajectoryRichnessSchema>
-
-/**
- * Capture result schema.
- *
- * @remarks
- * Full trajectory output from the `capture` command.
- * - `input` can be string (single turn) or string[] (multi-turn)
- * - `hint` provides grader context (renamed from `expected`)
- * - `toolErrors` replaces misleading `status: 'passed'|'failed'`
- * - `outcome` is merged from grader result if grader returns outcome data
- * Real pass/fail determination comes from your grader.
- */
-export const CaptureResultSchema = z.object({
-  /** Test case identifier */
-  id: z.string(),
-  /** Original prompt input (string for single turn, array for multi-turn) */
-  input: z.union([z.string(), z.array(z.string())]),
-  /** Final agent output */
-  output: z.string(),
-  /** Grader context hint (renamed from expected) */
-  hint: z.string().optional(),
-  /** Full execution trajectory */
-  trajectory: z.array(TrajectoryStepSchema),
-  /** Metadata including category, agent info, trajectoryRichness, turnCount */
-  metadata: z.record(z.string(), z.unknown()),
-  /** Timing information */
-  timing: TimingSchema,
-  /** Whether any tool calls failed */
-  toolErrors: z.boolean(),
-  /** Error messages (if any) */
-  errors: z.array(z.string()).optional(),
-  /** Grader score (if grader was provided) */
-  score: GraderResultSchema.optional(),
-  /** Outcome data from grader (if grader provided and returned outcome) */
-  outcome: z.record(z.string(), z.unknown()).optional(),
-})
-
-/** Capture result type */
-export type CaptureResult = z.infer<typeof CaptureResultSchema>
-
-// ============================================================================
-// Summary Result Schemas
-// ============================================================================
-
-/**
- * Summary result schema.
- *
- * @remarks
- * Compact view derived from full capture results via the `summarize` command.
- */
-export const SummaryResultSchema = z.object({
-  /** Test case identifier */
-  id: z.string(),
-  /** Original prompt input */
-  input: z.string(),
-  /** Final agent output */
-  output: z.string(),
-  /** List of tool names called */
-  toolCalls: z.array(z.string()),
-  /** Duration in milliseconds */
-  duration: z.number(),
-})
-
-/** Summary result type */
-export type SummaryResult = z.infer<typeof SummaryResultSchema>
-
-// ============================================================================
-// Trial Result Schemas
-// ============================================================================
-
-/** Single trial within a trial run */
-export const TrialEntrySchema = z.object({
-  /** Trial number (1-indexed) */
-  trialNum: z.number(),
-  /** Agent output for this trial */
-  output: z.string(),
-  /** Full trajectory for this trial */
-  trajectory: z.array(TrajectoryStepSchema),
-  /** Duration in milliseconds */
-  duration: z.number(),
-  /** Pass/fail (if grader provided) */
-  pass: z.boolean().optional(),
-  /** Numeric score (if grader provided) */
-  score: z.number().optional(),
-  /** Grader reasoning (if grader provided) */
-  reasoning: z.string().optional(),
-  /** Outcome data from grader (if grader provided and returned outcome) */
-  outcome: z.record(z.string(), z.unknown()).optional(),
-})
-
-/** Trial entry type */
-export type TrialEntry = z.infer<typeof TrialEntrySchema>
-
-/**
- * Trial result schema.
- *
- * @remarks
- * Output from the `trials` command for pass@k/pass^k analysis.
- * Metrics (passRate, passAtK, passExpK) are only present when a grader is provided.
- */
-export const TrialResultSchema = z.object({
-  /** Test case identifier */
-  id: z.string(),
-  /** Original prompt input (string for single turn, array for multi-turn) */
-  input: z.union([z.string(), z.array(z.string())]),
-  /** Grader context hint (renamed from expected) */
-  hint: z.string().optional(),
-  /** Number of trials (k) */
-  k: z.number(),
-  /** Simple pass rate: passes / k (with grader only) */
-  passRate: z.number().optional(),
-  /** pass@k: probability of at least one pass in k samples (with grader only) */
-  passAtK: z.number().optional(),
-  /** pass^k: probability of all k samples passing (with grader only) */
-  passExpK: z.number().optional(),
-  /** Individual trial results */
-  trials: z.array(TrialEntrySchema),
-  /** Metadata including agent info, workspaceDir, and custom fields */
-  metadata: z.record(z.string(), z.unknown()).optional(),
-})
-
-/** Trial result type */
-export type TrialResult = z.infer<typeof TrialResultSchema>
-
-// ============================================================================
-// Calibration Schemas
-// ============================================================================
-
-/** Calibration sample for grader review */
-export const CalibrationSampleSchema = z.object({
-  /** Test case identifier */
-  id: z.string(),
-  /** Original prompt input (string for single turn, array for multi-turn) */
-  input: z.union([z.string(), z.array(z.string())]),
-  /** Agent output */
-  output: z.string(),
-  /** Grader context hint (renamed from expected) */
-  hint: z.string().optional(),
-  /** Original grader score */
-  originalScore: GraderResultSchema,
-  /** Re-scored result (if different grader provided) */
-  rescoredResult: GraderResultSchema.optional(),
-  /** Key trajectory snippets */
-  trajectorySnippet: z.array(TrajectoryStepSchema),
-})
-
-/** Calibration sample type */
-export type CalibrationSample = z.infer<typeof CalibrationSampleSchema>
-
-// ============================================================================
-// Balance Analysis Schemas
-// ============================================================================
-
-/** Category distribution in test set */
-export const CategoryDistributionSchema = z.object({
-  /** Category name */
-  name: z.string(),
-  /** Number of test cases */
-  count: z.number(),
-  /** Percentage of total */
-  percentage: z.number(),
-})
-
-/** Category distribution type */
-export type CategoryDistribution = z.infer<typeof CategoryDistributionSchema>
-
-/** Balance analysis result */
-export const BalanceAnalysisSchema = z.object({
-  /** Total number of test cases */
-  totalCases: z.number(),
-  /** Distribution by category */
-  categories: z.array(CategoryDistributionSchema),
-  /** Categories that may need more test cases */
-  underrepresented: z.array(z.string()),
-  /** Suggested improvements */
-  suggestions: z.array(z.string()),
-})
-
-/** Balance analysis type */
-export type BalanceAnalysis = z.infer<typeof BalanceAnalysisSchema>
-
-// ============================================================================
-// Validation Reference Schemas
-// ============================================================================
-
-/** Validation result for a reference solution */
-export const ValidationResultSchema = z.object({
-  /** Test case identifier */
-  id: z.string(),
-  /** Reference solution provided */
-  reference: z.string(),
-  /** Whether reference passes the grader */
-  passes: z.boolean(),
-  /** Grader result */
-  graderResult: GraderResultSchema,
-})
-
-/** Validation result type */
-export type ValidationResult = z.infer<typeof ValidationResultSchema>
-
-// ============================================================================
-// Comparison Report Schemas
-// ============================================================================
-
-/**
- * Confidence interval schema as [lower, upper] bounds.
- *
- * @remarks
- * Used for bootstrap-computed confidence intervals when strategy=statistical.
- */
-export const ConfidenceIntervalSchema = z.tuple([z.number(), z.number()])
-
-/** Confidence interval type */
-export type ConfidenceInterval = z.infer<typeof ConfidenceIntervalSchema>
-
-/**
- * Score distribution histogram for quality analysis.
- *
- * @remarks
- * Buckets divide the 0-1 score range into 5 equal bins.
- */
-export const ScoreDistributionSchema = z.object({
-  '0.0-0.2': z.number(),
-  '0.2-0.4': z.number(),
-  '0.4-0.6': z.number(),
-  '0.6-0.8': z.number(),
-  '0.8-1.0': z.number(),
-})
-
-/** Score distribution type */
-export type ScoreDistribution = z.infer<typeof ScoreDistributionSchema>
-
-/**
- * Confidence intervals for quality metrics.
- */
-export const QualityConfidenceIntervalsSchema = z.object({
-  /** CI for avgScore */
-  avgScore: ConfidenceIntervalSchema.optional(),
-  /** CI for passRate */
-  passRate: ConfidenceIntervalSchema.optional(),
-})
-
-/** Quality confidence intervals type */
-export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceIntervalsSchema>
-
-/**
- * Quality metrics for a single run in comparison.
- */
-export const QualityMetricsSchema = z.object({
-  /** Discriminator for run-level quality metrics */
-  type: z.literal('run'),
-  /** Mean grader score (0-1) */
-  avgScore: z.number(),
-  /** Percentage of pass=true results */
-  passRate: z.number(),
-  /** Count of passing results */
-  passCount: z.number(),
-  /** Count of failing results */
-  failCount: z.number(),
-  /** Score distribution histogram */
-  scoreDistribution: ScoreDistributionSchema,
-  /** Confidence intervals (only with strategy=statistical) */
-  confidenceIntervals: QualityConfidenceIntervalsSchema.optional(),
-})
-
-/** Quality metrics type */
-export type QualityMetrics = z.infer<typeof QualityMetricsSchema>
-
-/**
- * Latency statistics for performance analysis.
- */
-export const LatencyStatsSchema = z.object({
-  /** 50th percentile (median) in milliseconds */
-  p50: z.number(),
-  /** 90th percentile in milliseconds */
-  p90: z.number(),
-  /** 99th percentile in milliseconds */
-  p99: z.number(),
-  /** Mean latency in milliseconds */
-  mean: z.number(),
-  /** Minimum latency in milliseconds */
-  min: z.number(),
-  /** Maximum latency in milliseconds */
-  max: z.number(),
-})
-
-/** Latency stats type */
-export type LatencyStats = z.infer<typeof LatencyStatsSchema>
-
-/**
- * Confidence intervals for performance metrics.
- */
-export const PerformanceConfidenceIntervalsSchema = z.object({
-  /** CI for latency mean */
-  latencyMean: ConfidenceIntervalSchema.optional(),
-})
-
-/** Performance confidence intervals type */
-export type PerformanceConfidenceIntervals = z.infer<typeof PerformanceConfidenceIntervalsSchema>
-
-/**
- * Performance metrics for a single run in comparison.
- */
-export const PerformanceMetricsSchema = z.object({
-  /** End-to-end latency statistics */
-  latency: LatencyStatsSchema,
-  /** Time to first response statistics (optional, not all adapters support) */
-  firstResponse: LatencyStatsSchema.optional(),
-  /** Sum of all run durations in milliseconds */
-  totalDuration: z.number(),
-  /** Confidence intervals (only with strategy=statistical) */
-  confidenceIntervals: PerformanceConfidenceIntervalsSchema.optional(),
-})
-
-/** Performance metrics type */
-export type PerformanceMetrics = z.infer<typeof PerformanceMetricsSchema>
-
-/**
- * Reliability metrics for a single run in comparison.
- */
-export const ReliabilityMetricsSchema = z.object({
-  /** Discriminator for run-based reliability metrics */
-  type: z.literal('run'),
-  /** Count of runs with toolErrors=true */
-  toolErrors: z.number(),
-  /** Percentage of runs with tool errors */
-  toolErrorRate: z.number(),
-  /** Count of runs that hit timeout */
-  timeouts: z.number(),
-  /** Percentage of runs that hit timeout */
-  timeoutRate: z.number(),
-  /** Percentage of runs that completed successfully */
-  completionRate: z.number(),
-})
-
-/** Reliability metrics type */
-export type ReliabilityMetrics = z.infer<typeof ReliabilityMetricsSchema>
-
-/**
- * Trajectory info for a single run in comparison.
- */
-export const TrajectoryInfoSchema = z.object({
-  /** Trajectory richness level */
-  richness: TrajectoryRichnessSchema,
-  /** Average trajectory steps per run */
-  avgStepCount: z.number(),
-})
-
-/** Trajectory info type */
-export type TrajectoryInfo = z.infer<typeof TrajectoryInfoSchema>
-
-/**
- * Per-prompt comparison entry for head-to-head drill-down.
- */
-export const PromptComparisonSchema = z.object({
-  /** Prompt identifier */
-  id: z.string(),
-  /** Run label of the winner, or null if tie */
-  winner: z.string().nullable(),
-  /** Scores by run label */
-  scores: z.record(z.string(), z.number()),
-  /** Latencies by run label in milliseconds */
-  latencies: z.record(z.string(), z.number()),
-  /** Whether each run had errors */
-  hadErrors: z.record(z.string(), z.boolean()),
-})
-
-/** Prompt comparison type */
-export type PromptComparison = z.infer<typeof PromptComparisonSchema>
-
-/**
- * Pairwise win/loss/tie statistics between two runs.
- */
-export const PairwiseComparisonSchema = z.object({
-  /** First run label */
-  runA: z.string(),
-  /** Second run label */
-  runB: z.string(),
-  /** Number of prompts where A won */
-  aWins: z.number(),
-  /** Number of prompts where B won */
-  bWins: z.number(),
-  /** Number of prompts where A and B tied */
-  ties: z.number(),
-})
-
-/** Pairwise comparison type */
-export type PairwiseComparison = z.infer<typeof PairwiseComparisonSchema>
-
-/**
- * Head-to-head comparison section.
- */
-export const HeadToHeadSchema = z.object({
-  /** Per-prompt breakdown for drill-down */
-  prompts: z.array(PromptComparisonSchema),
-  /** Pairwise win rates between runs */
-  pairwise: z.array(PairwiseComparisonSchema),
-})
-
-/** Head-to-head type */
-export type HeadToHead = z.infer<typeof HeadToHeadSchema>
-
-/**
- * Metadata for the comparison report.
- */
-export const ComparisonMetaSchema = z.object({
-  /** ISO timestamp when report was generated */
-  generatedAt: z.string(),
-  /** Run labels included in comparison */
-  runs: z.array(z.string()),
-  /** Total prompts compared */
-  promptCount: z.number(),
-  /** Prompts where all runs completed */
-  promptsWithAllRuns: z.number(),
-})
-
-/** Comparison meta type */
-export type ComparisonMeta = z.infer<typeof ComparisonMetaSchema>
-
-/**
- * Holistic comparison report schema.
- *
- * @remarks
- * Aggregates comparison output across all dimensions:
- * - Quality: pass rates, scores, distributions
- * - Performance: latency percentiles
- * - Reliability: error rates, completion rates
- * - Head-to-head: per-prompt winners, pairwise stats
- *
- * Note: Tool usage analysis is NOT included because adapter formats vary.
- * Different adapters provide different `trajectoryRichness` levels and
- * the `tool_call.name` field often contains tool use IDs rather than
- * human-readable names.
- */
-export const ComparisonReportSchema = z.object({
-  /** Report metadata */
-  meta: ComparisonMetaSchema,
-  /** Quality metrics by run label */
-  quality: z.record(z.string(), QualityMetricsSchema),
-  /** Performance metrics by run label */
-  performance: z.record(z.string(), PerformanceMetricsSchema),
-  /** Reliability metrics by run label */
-  reliability: z.record(z.string(), ReliabilityMetricsSchema),
-  /** Trajectory info by run label */
-  trajectoryInfo: z.record(z.string(), TrajectoryInfoSchema),
-  /** Head-to-head comparison details */
-  headToHead: HeadToHeadSchema,
-})
-
-/** Comparison report type */
-export type ComparisonReport = z.infer<typeof ComparisonReportSchema>
-
-// ============================================================================
-// Trials Comparison Report Schemas
-// ============================================================================
-
-/**
- * Confidence intervals for trials capability metrics.
- */
-export const TrialsCapabilityConfidenceIntervalsSchema = z.object({
-  /** CI for avgPassAtK */
-  avgPassAtK: ConfidenceIntervalSchema.optional(),
-})
-
-/** Trials capability confidence intervals type */
-export type TrialsCapabilityConfidenceIntervals = z.infer<typeof TrialsCapabilityConfidenceIntervalsSchema>
-
-/**
- * Capability metrics for trials comparison (passAtK-based).
- *
- * @remarks
- * Measures whether the agent CAN solve the task (at least once in K tries).
- * Higher passAtK means the agent has the capability to solve the task.
- */
-export const TrialsCapabilityMetricsSchema = z.object({
-  /** Average passAtK across all prompts */
-  avgPassAtK: z.number(),
-  /** Median passAtK */
-  medianPassAtK: z.number(),
-  /** 25th percentile passAtK */
-  p25PassAtK: z.number(),
-  /** 75th percentile passAtK */
-  p75PassAtK: z.number(),
-  /** Confidence intervals (only with strategy=statistical) */
-  confidenceIntervals: TrialsCapabilityConfidenceIntervalsSchema.optional(),
-})
-
-/** Trials capability metrics type */
-export type TrialsCapabilityMetrics = z.infer<typeof TrialsCapabilityMetricsSchema>
-
-/**
- * Confidence intervals for trials reliability metrics.
- */
-export const TrialsReliabilityConfidenceIntervalsSchema = z.object({
-  /** CI for avgPassExpK */
-  avgPassExpK: ConfidenceIntervalSchema.optional(),
-})
-
-/** Trials reliability confidence intervals type */
-export type TrialsReliabilityConfidenceIntervals = z.infer<typeof TrialsReliabilityConfidenceIntervalsSchema>
-
-/**
- * Reliability metrics for trials comparison (passExpK-based).
- *
- * @remarks
- * Measures whether the agent CONSISTENTLY solves the task (all K tries).
- * Higher passExpK means the agent reliably solves the task every time.
- */
-export const TrialsReliabilityMetricsSchema = z.object({
-  /** Discriminator for trial-based reliability metrics */
-  type: z.literal('trial'),
-  /** Average passExpK across all prompts */
-  avgPassExpK: z.number(),
-  /** Median passExpK */
-  medianPassExpK: z.number(),
-  /** 25th percentile passExpK */
-  p25PassExpK: z.number(),
-  /** 75th percentile passExpK */
-  p75PassExpK: z.number(),
-  /** Confidence intervals (only with strategy=statistical) */
-  confidenceIntervals: TrialsReliabilityConfidenceIntervalsSchema.optional(),
-})
-
-/** Trials reliability metrics type */
-export type TrialsReliabilityMetrics = z.infer<typeof TrialsReliabilityMetricsSchema>
-
-/**
- * Flakiness metrics for trials comparison.
- *
- * @remarks
- * Flakiness = passAtK - passExpK, measuring the gap between capability and reliability.
- * A high flakiness score means the agent can sometimes solve the task but not consistently.
- */
-export const TrialsFlakinessMetricsSchema = z.object({
-  /** Average flakiness across all prompts */
-  avgFlakiness: z.number(),
-  /** Median flakiness */
-  medianFlakiness: z.number(),
-  /** Number of prompts with non-zero flakiness */
-  flakyPromptCount: z.number(),
-  /** Top flaky prompts by flakiness score */
-  topFlakyPrompts: z.array(
-    z.object({
-      /** Prompt identifier */
-      id: z.string(),
-      /** Flakiness score (passAtK - passExpK) */
-      flakiness: z.number(),
-    }),
-  ),
-})
-
-/** Trials flakiness metrics type */
-export type TrialsFlakinessMetrics = z.infer<typeof TrialsFlakinessMetricsSchema>
-
-/**
- * Confidence intervals for trials quality metrics.
- */
-export const TrialsQualityConfidenceIntervalsSchema = z.object({
-  /** CI for avgScore */
-  avgScore: ConfidenceIntervalSchema.optional(),
-})
-
-/** Trials quality confidence intervals type */
-export type TrialsQualityConfidenceIntervals = z.infer<typeof TrialsQualityConfidenceIntervalsSchema>
-
-/**
- * Quality metrics for trials comparison (score-based).
- *
- * @remarks
- * Aggregates grader scores across all trials for each prompt.
- * Only present when a grader was used during trials capture.
- */
-export const TrialsQualityMetricsSchema = z.object({
-  /** Discriminator for trial-level quality metrics */
-  type: z.literal('trial'),
-  /** Average score across all trials */
-  avgScore: z.number(),
-  /** Median score */
-  medianScore: z.number(),
-  /** 25th percentile score */
-  p25Score: z.number(),
-  /** 75th percentile score */
-  p75Score: z.number(),
-  /** Confidence intervals (only with strategy=statistical) */
-  confidenceIntervals: TrialsQualityConfidenceIntervalsSchema.optional(),
-})
-
-/** Trials quality metrics type */
-export type TrialsQualityMetrics = z.infer<typeof TrialsQualityMetricsSchema>
-
-/**
- * Confidence intervals for trials performance metrics.
- */
-export const TrialsPerformanceConfidenceIntervalsSchema = z.object({
-  /** CI for latency mean */
-  latencyMean: ConfidenceIntervalSchema.optional(),
-})
-
-/** Trials performance confidence intervals type */
-export type TrialsPerformanceConfidenceIntervals = z.infer<typeof TrialsPerformanceConfidenceIntervalsSchema>
-
-/**
- * Performance metrics for trials comparison (latency-based).
- *
- * @remarks
- * Aggregates trial durations across all prompts.
- * Always present since TrialEntry.duration is required.
- */
-export const TrialsPerformanceMetricsSchema = z.object({
-  /** End-to-end latency statistics across all trials */
-  latency: LatencyStatsSchema,
-  /** Sum of all trial durations in milliseconds */
-  totalDuration: z.number(),
-  /** Confidence intervals (only with strategy=statistical) */
-  confidenceIntervals: TrialsPerformanceConfidenceIntervalsSchema.optional(),
-})
-
-/** Trials performance metrics type */
-export type TrialsPerformanceMetrics = z.infer<typeof TrialsPerformanceMetricsSchema>
-
-/**
- * Per-prompt metrics for trials comparison drill-down.
- */
-export const TrialsPromptComparisonSchema = z.object({
-  /** Prompt identifier */
-  id: z.string(),
-  /** Run label of the capability winner, or null if tie */
-  capabilityWinner: z.string().nullable(),
-  /** Run label of the reliability winner, or null if tie */
-  reliabilityWinner: z.string().nullable(),
-  /** passAtK by run label */
-  passAtK: z.record(z.string(), z.number()),
-  /** passExpK by run label */
-  passExpK: z.record(z.string(), z.number()),
-  /** Flakiness by run label */
-  flakiness: z.record(z.string(), z.number()),
-})
-
-/** Trials prompt comparison type */
-export type TrialsPromptComparison = z.infer<typeof TrialsPromptComparisonSchema>
-
-/**
- * Metadata for trials comparison report.
- */
-export const TrialsComparisonMetaSchema = z.object({
-  /** ISO timestamp when report was generated */
-  generatedAt: z.string(),
-  /** Run labels included in comparison */
-  runs: z.array(z.string()),
-  /** Total prompts compared */
-  promptCount: z.number(),
-  /** Number of trials per prompt (k value) */
-  trialsPerPrompt: z.number(),
-  /** Input format indicator */
-  inputFormat: z.literal('trials'),
-})
-
-/** Trials comparison meta type */
-export type TrialsComparisonMeta = z.infer<typeof TrialsComparisonMetaSchema>
-
-/**
- * Trials comparison report schema.
- *
- * @remarks
- * Aggregates trials comparison output across capability, reliability, and flakiness dimensions.
- * Used when comparing TrialResult JSONL files instead of CaptureResult files.
- *
- * Key metrics:
- * - Capability: passAtK - can the agent solve this at least once?
- * - Reliability: passExpK - does the agent solve this consistently?
- * - Flakiness: passAtK - passExpK - how inconsistent is the agent?
- */
-export const TrialsComparisonReportSchema = z.object({
-  /** Report metadata */
-  meta: TrialsComparisonMetaSchema,
-  /** Capability metrics by run label */
-  capability: z.record(z.string(), TrialsCapabilityMetricsSchema),
-  /** Reliability metrics by run label */
-  reliability: z.record(z.string(), TrialsReliabilityMetricsSchema),
-  /** Flakiness metrics by run label */
-  flakiness: z.record(z.string(), TrialsFlakinessMetricsSchema),
-  /** Quality metrics by run label (only when grader scores are present) */
-  quality: z.record(z.string(), TrialsQualityMetricsSchema).optional(),
-  /** Performance metrics by run label (always present, uses trial.duration) */
-  performance: z.record(z.string(), TrialsPerformanceMetricsSchema),
-  /** Head-to-head comparison details */
-  headToHead: z.object({
-    /** Pairwise wins by capability */
-    capability: z.array(PairwiseComparisonSchema),
-    /** Pairwise wins by reliability */
-    reliability: z.array(PairwiseComparisonSchema),
-    /** Pairwise wins by overall weighted score */
-    overall: z.array(PairwiseComparisonSchema),
-  }),
-  /** Per-prompt breakdown for drill-down (optional, can be large) */
-  perPrompt: z.array(TrialsPromptComparisonSchema).optional(),
-})
-
-/** Trials comparison report type */
-export type TrialsComparisonReport = z.infer<typeof TrialsComparisonReportSchema>
diff --git a/src/schemas/tests/constants.spec.ts b/src/schemas/tests/constants.spec.ts
deleted file mode 100644
index d140314..0000000
--- a/src/schemas/tests/constants.spec.ts
+++ /dev/null
@@ -1,121 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-import {
-  DEFAULT_CALIBRATION_SAMPLE_SIZE,
-  DEFAULT_CLIENT_NAME,
-  DEFAULT_HARNESS_TIMEOUT,
-  DEFAULT_POLLING_INTERVAL,
-  DEFAULT_PROTOCOL_TIMEOUT,
-  DEFAULT_TRIAL_COUNT,
-  HEAD_LINES,
-  JSON_RPC_ERRORS,
-  MAX_CONTENT_LENGTH,
-  PROTOCOL_METHODS,
-  PROTOCOL_VERSION,
-  TAIL_LINES,
-} from '../constants.ts'
-
-// ============================================================================
-// JSON-RPC Protocol Constants
-// ============================================================================
-
-describe('PROTOCOL_METHODS', () => {
-  test('contains all required lifecycle methods', () => {
-    expect(PROTOCOL_METHODS.INITIALIZE).toBe('initialize')
-    expect(PROTOCOL_METHODS.SHUTDOWN).toBe('shutdown')
-  })
-
-  test('contains all required session methods', () => {
-    expect(PROTOCOL_METHODS.CREATE_SESSION).toBe('session/new')
-    expect(PROTOCOL_METHODS.LOAD_SESSION).toBe('session/load')
-    expect(PROTOCOL_METHODS.PROMPT).toBe('session/prompt')
-    expect(PROTOCOL_METHODS.CANCEL).toBe('session/cancel')
-    expect(PROTOCOL_METHODS.UPDATE).toBe('session/update')
-    expect(PROTOCOL_METHODS.REQUEST_PERMISSION).toBe('session/request_permission')
-    expect(PROTOCOL_METHODS.SET_MODEL).toBe('session/set_model')
-  })
-
-  test('contains protocol-level methods', () => {
-    expect(PROTOCOL_METHODS.CANCEL_REQUEST).toBe('$/cancel_request')
-  })
-})
-
-describe('PROTOCOL_VERSION', () => {
-  test('is version 1', () => {
-    expect(PROTOCOL_VERSION).toBe(1)
-  })
-})
-
-// ============================================================================
-// JSON-RPC Error Codes
-// ============================================================================
-
-describe('JSON_RPC_ERRORS', () => {
-  test('contains standard JSON-RPC error codes', () => {
-    expect(JSON_RPC_ERRORS.PARSE_ERROR).toBe(-32700)
-    expect(JSON_RPC_ERRORS.INVALID_REQUEST).toBe(-32600)
-    expect(JSON_RPC_ERRORS.METHOD_NOT_FOUND).toBe(-32601)
-    expect(JSON_RPC_ERRORS.INVALID_PARAMS).toBe(-32602)
-    expect(JSON_RPC_ERRORS.INTERNAL_ERROR).toBe(-32603)
-  })
-
-  test('contains extension error codes', () => {
-    expect(JSON_RPC_ERRORS.REQUEST_CANCELLED).toBe(-32800)
-  })
-})
-
-// ============================================================================
-// Client Defaults
-// ============================================================================
-
-describe('Client defaults', () => {
-  test('DEFAULT_CLIENT_NAME is set', () => {
-    expect(DEFAULT_CLIENT_NAME).toBe('plaited-eval-harness')
-  })
-
-  test('DEFAULT_PROTOCOL_TIMEOUT is 30 seconds', () => {
-    expect(DEFAULT_PROTOCOL_TIMEOUT).toBe(30000)
-  })
-
-  test('DEFAULT_POLLING_INTERVAL is 50ms', () => {
-    expect(DEFAULT_POLLING_INTERVAL).toBe(50)
-  })
-})
-
-// ============================================================================
-// Harness Preview Configuration
-// ============================================================================
-
-describe('Preview configuration', () => {
-  test('HEAD_LINES is positive', () => {
-    expect(HEAD_LINES).toBeGreaterThan(0)
-    expect(HEAD_LINES).toBe(8)
-  })
-
-  test('TAIL_LINES is positive', () => {
-    expect(TAIL_LINES).toBeGreaterThan(0)
-    expect(TAIL_LINES).toBe(4)
-  })
-
-  test('MAX_CONTENT_LENGTH is reasonable', () => {
-    expect(MAX_CONTENT_LENGTH).toBeGreaterThan(0)
-    expect(MAX_CONTENT_LENGTH).toBe(500)
-  })
-})
-
-// ============================================================================
-// Harness Defaults
-// ============================================================================
-
-describe('Harness defaults', () => {
-  test('DEFAULT_HARNESS_TIMEOUT is 60 seconds', () => {
-    expect(DEFAULT_HARNESS_TIMEOUT).toBe(60000)
-  })
-
-  test('DEFAULT_TRIAL_COUNT is 5', () => {
-    expect(DEFAULT_TRIAL_COUNT).toBe(5)
-  })
-
-  test('DEFAULT_CALIBRATION_SAMPLE_SIZE is 10', () => {
-    expect(DEFAULT_CALIBRATION_SAMPLE_SIZE).toBe(10)
-  })
-})
diff --git a/src/schemas/tests/fixtures/grader-bad-module.ts b/src/schemas/tests/fixtures/grader-bad-module.ts
deleted file mode 100644
index 42516e3..0000000
--- a/src/schemas/tests/fixtures/grader-bad-module.ts
+++ /dev/null
@@ -1,5 +0,0 @@
-/**
- * Test fixture: Invalid TypeScript grader (no 'grade' export).
- */
-
-export const evaluate = () => ({ pass: true, score: 1.0 })
diff --git a/src/schemas/tests/fixtures/grader-exec-fail.py b/src/schemas/tests/fixtures/grader-exec-fail.py
deleted file mode 100755
index 34571e7..0000000
--- a/src/schemas/tests/fixtures/grader-exec-fail.py
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test fixture: Python grader that exits with non-zero code.
-"""
-
-import sys
-
-sys.stderr.write("Intentional failure")
-sys.exit(1)
diff --git a/src/schemas/tests/fixtures/grader-exec-invalid.py b/src/schemas/tests/fixtures/grader-exec-invalid.py
deleted file mode 100755
index bc36395..0000000
--- a/src/schemas/tests/fixtures/grader-exec-invalid.py
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test fixture: Python grader that outputs invalid JSON.
-"""
-
-print("not valid json")
diff --git a/src/schemas/tests/fixtures/grader-exec.py b/src/schemas/tests/fixtures/grader-exec.py
deleted file mode 100755
index 5d51dcc..0000000
--- a/src/schemas/tests/fixtures/grader-exec.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test fixture: Python grader script using stdin/stdout JSON protocol.
-"""
-
-import json
-import sys
-
-def main():
-    data = json.load(sys.stdin)
-
-    output = data.get("output", "").lower()
-    hint = (data.get("hint") or "").lower()
-
-    if hint:
-        pass_result = hint in output
-    else:
-        pass_result = True
-
-    result = {
-        "pass": pass_result,
-        "score": 1.0 if pass_result else 0.0,
-        "reasoning": "Contains expected" if pass_result else "Missing expected"
-    }
-
-    print(json.dumps(result))
-
-if __name__ == "__main__":
-    main()
diff --git a/src/schemas/tests/fixtures/grader-git.ts b/src/schemas/tests/fixtures/grader-git.ts
deleted file mode 100644
index d8a7df7..0000000
--- a/src/schemas/tests/fixtures/grader-git.ts
+++ /dev/null
@@ -1,116 +0,0 @@
-/**
- * Test fixture: Git-based grader that detects file changes.
- *
- * @remarks
- * This grader uses git to detect environmental outcomes instead of just
- * checking output text. It demonstrates the "grade outcomes, not paths" principle.
- *
- * SECURITY NOTE: This fixture validates the cwd parameter to prevent command injection.
- * When implementing your own git-based graders, always validate paths from untrusted sources.
- * The cwd parameter should only come from trusted sources (process.cwd(), CLI flags, etc.).
- */
-
-import { resolve } from 'node:path'
-import type { Grader } from '../../schemas.ts'
-
-/**
- * Validates that a path is safe to use in shell commands.
- *
- * @remarks
- * Rejects paths containing shell metacharacters or suspicious patterns
- * that could be used for command injection.
- *
- * @param path - The path to validate
- * @returns True if path appears safe, false otherwise
- */
-const isValidPath = (path: string): boolean => {
-  // Reject paths with shell metacharacters that could enable command injection
-  const dangerousChars = /[;&|`$(){}[\]<>'"\\]/
-  if (dangerousChars.test(path)) {
-    return false
-  }
-
-  // Reject paths with suspicious patterns
-  if (path.includes('..') || path.startsWith('-')) {
-    return false
-  }
-
-  return true
-}
-
-export const grade: Grader = async ({ output: _output, hint, cwd }) => {
-  // If no cwd provided, fall back to hint-based grading
-  if (!cwd) {
-    return {
-      pass: false,
-      score: 0,
-      reasoning: 'No working directory provided',
-    }
-  }
-
-  // SECURITY: Validate cwd to prevent command injection
-  if (!isValidPath(cwd)) {
-    return {
-      pass: false,
-      score: 0,
-      reasoning: 'Invalid working directory path (contains suspicious characters)',
-    }
-  }
-
-  // Normalize path to prevent directory traversal
-  const safeCwd = resolve(cwd)
-
-  // Check if we're in a git repo
-  const isGit = await Bun.$`git -C ${safeCwd} rev-parse --git-dir 2>/dev/null`.nothrow()
-
-  if (isGit.exitCode !== 0) {
-    return {
-      pass: false,
-      score: 0,
-      reasoning: 'Not a git repository',
-    }
-  }
-
-  // Detect what files were created/modified using git
-  // Note: This detects untracked (??) and modified (M) files.
-  // Staged (A), renamed (R), deleted (D) files are not included in this example.
-  const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text()
-
-  const filesCreated = status
-    .split('\n')
-    .filter((line) => line.startsWith('??')) // ?? = untracked files
-    .map((line) => line.slice(3).trim())
-    .filter(Boolean)
-
-  const filesModified = status
-    .split('\n')
-    .filter((line) => line.startsWith(' M') || line.startsWith('M ')) // M = modified
-    .map((line) => line.slice(3).trim())
-    .filter(Boolean)
-
-  const hasChanges = filesCreated.length > 0 || filesModified.length > 0
-
-  // If hint is provided, check if any changed file matches the hint
-  let matchesHint = true
-  if (hint) {
-    const allChangedFiles = [...filesCreated, ...filesModified]
-    matchesHint = allChangedFiles.some((file) => file.toLowerCase().includes(hint.toLowerCase()))
-  }
-
-  const pass = hasChanges && matchesHint
-
-  return {
-    pass,
-    score: pass ? 1.0 : hasChanges ? 0.5 : 0.0,
-    reasoning: pass
-      ? `Files changed: ${[...filesCreated, ...filesModified].join(', ')}`
-      : hasChanges
-        ? 'File changes do not match hint'
-        : 'No file changes detected',
-    outcome: {
-      filesCreated,
-      filesModified,
-      type: 'git_status_check',
-    },
-  }
-}
diff --git a/src/schemas/tests/fixtures/grader-module.ts b/src/schemas/tests/fixtures/grader-module.ts
deleted file mode 100644
index a871167..0000000
--- a/src/schemas/tests/fixtures/grader-module.ts
+++ /dev/null
@@ -1,14 +0,0 @@
-/**
- * Test fixture: TypeScript grader module.
- */
-
-import type { Grader } from '../../schemas.ts'
-
-export const grade: Grader = async ({ input: _input, output, hint }) => {
-  const pass = hint ? output.toLowerCase().includes(hint.toLowerCase()) : true
-  return {
-    pass,
-    score: pass ? 1.0 : 0.0,
-    reasoning: pass ? 'Contains expected text' : 'Missing expected text',
-  }
-}
diff --git a/src/schemas/tests/grader-git.spec.ts b/src/schemas/tests/grader-git.spec.ts
deleted file mode 100644
index 580da90..0000000
--- a/src/schemas/tests/grader-git.spec.ts
+++ /dev/null
@@ -1,222 +0,0 @@
-/**
- * Tests for git-based grader fixture.
- *
- * @remarks
- * Verifies that graders can use git to detect environmental outcomes
- * and return structured outcome data.
- */
-
-import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
-import { mkdtemp, rm } from 'node:fs/promises'
-import { tmpdir } from 'node:os'
-import { join } from 'node:path'
-import type { Grader } from '../schemas.ts'
-
-describe('Git-based grader', () => {
-  let tempDir: string
-  let grader: Grader
-
-  beforeEach(async () => {
-    // Create temporary directory
-    tempDir = await mkdtemp(join(tmpdir(), 'git-grader-test-'))
-
-    // Initialize git repo
-    await Bun.$`git -C ${tempDir} init`.quiet()
-    await Bun.$`git -C ${tempDir} config user.email "test@test.com"`.quiet()
-    await Bun.$`git -C ${tempDir} config user.name "Test User"`.quiet()
-
-    // Load the git-based grader
-    const module = await import('./fixtures/grader-git.ts')
-    grader = module.grade
-  })
-
-  afterEach(async () => {
-    // Clean up temporary directory
-    await rm(tempDir, { recursive: true, force: true })
-  })
-
-  test('detects newly created files', async () => {
-    // Create a new file (untracked)
-    await Bun.write(join(tempDir, 'button.tsx'), 'export const Button = () => <button>Click</button>')
-
-    const result = await grader({
-      input: 'Create a button component',
-      output: 'I created Button.tsx',
-      hint: 'button',
-      cwd: tempDir,
-    })
-
-    expect(result.pass).toBe(true)
-    expect(result.score).toBe(1.0)
-    expect(result.reasoning).toContain('button.tsx')
-    expect(result.outcome).toBeDefined()
-    expect(result.outcome?.filesCreated).toEqual(['button.tsx'])
-    expect(result.outcome?.type).toBe('git_status_check')
-  })
-
-  test('detects modified files', async () => {
-    // Create and commit a file
-    await Bun.write(join(tempDir, 'config.ts'), 'export const config = { value: 1 }')
-    await Bun.$`git -C ${tempDir} add config.ts`.quiet()
-    await Bun.$`git -C ${tempDir} commit -m "Initial commit"`.quiet()
-
-    // Modify the file
-    await Bun.write(join(tempDir, 'config.ts'), 'export const config = { value: 2 }')
-
-    const result = await grader({
-      input: 'Update config value',
-      output: 'I updated the config',
-      hint: 'config',
-      cwd: tempDir,
-    })
-
-    expect(result.pass).toBe(true)
-    expect(result.score).toBe(1.0)
-    expect(result.reasoning).toContain('config.ts')
-    expect(result.outcome).toBeDefined()
-    expect(result.outcome?.filesModified).toEqual(['config.ts'])
-    expect(result.outcome?.type).toBe('git_status_check')
-  })
-
-  test('fails when no changes detected', async () => {
-    // No files created or modified
-    const result = await grader({
-      input: 'Create a button component',
-      output: 'I created a button component',
-      cwd: tempDir,
-    })
-
-    expect(result.pass).toBe(false)
-    expect(result.score).toBe(0)
-    expect(result.reasoning).toContain('No file changes detected')
-    expect(result.outcome).toBeDefined()
-    expect(result.outcome?.filesCreated).toEqual([])
-    expect(result.outcome?.filesModified).toEqual([])
-  })
-
-  test('partial score when changes do not match hint', async () => {
-    // Create a file that does not match the hint
-    await Bun.write(join(tempDir, 'unrelated.ts'), 'export const foo = 1')
-
-    const result = await grader({
-      input: 'Create a button component',
-      output: 'I created something',
-      hint: 'button',
-      cwd: tempDir,
-    })
-
-    expect(result.pass).toBe(false)
-    expect(result.score).toBe(0.5) // Has changes but doesn't match hint
-    expect(result.reasoning).toContain('do not match hint')
-    expect(result.outcome?.filesCreated).toEqual(['unrelated.ts'])
-  })
-
-  test('handles missing cwd parameter', async () => {
-    const result = await grader({
-      input: 'Create a button component',
-      output: 'I created a button',
-      hint: 'button',
-      // cwd not provided
-    })
-
-    expect(result.pass).toBe(false)
-    expect(result.score).toBe(0)
-    expect(result.reasoning).toBe('No working directory provided')
-  })
-
-  test('handles non-git directory', async () => {
-    // Create a non-git temp directory
-    const nonGitDir = await mkdtemp(join(tmpdir(), 'non-git-test-'))
-
-    try {
-      const result = await grader({
-        input: 'Create a button component',
-        output: 'I created a button',
-        cwd: nonGitDir,
-      })
-
-      expect(result.pass).toBe(false)
-      expect(result.score).toBe(0)
-      expect(result.reasoning).toBe('Not a git repository')
-    } finally {
-      await rm(nonGitDir, { recursive: true, force: true })
-    }
-  })
-
-  test('works without hint parameter', async () => {
-    // Create a file
-    await Bun.write(join(tempDir, 'any-file.ts'), 'export const x = 1')
-
-    const result = await grader({
-      input: 'Create a file',
-      output: 'I created a file',
-      cwd: tempDir,
-      // hint not provided
-    })
-
-    expect(result.pass).toBe(true)
-    expect(result.score).toBe(1.0)
-    expect(result.reasoning).toContain('any-file.ts')
-    expect(result.outcome?.filesCreated).toEqual(['any-file.ts'])
-  })
-
-  test('returns structured outcome for downstream analysis', async () => {
-    // Create multiple files
-    await Bun.write(join(tempDir, 'button.tsx'), 'export const Button = () => <button />')
-    await Bun.write(join(tempDir, 'input.tsx'), 'export const Input = () => <input />')
-
-    const result = await grader({
-      input: 'Create UI components',
-      output: 'I created Button and Input components',
-      cwd: tempDir,
-    })
-
-    expect(result.outcome).toBeDefined()
-    expect(result.outcome?.type).toBe('git_status_check')
-    expect(result.outcome?.filesCreated).toBeInstanceOf(Array)
-    expect(result.outcome?.filesCreated).toHaveLength(2)
-    expect(result.outcome?.filesCreated).toContain('button.tsx')
-    expect(result.outcome?.filesCreated).toContain('input.tsx')
-    expect(result.outcome?.filesModified).toEqual([])
-  })
-
-  test('rejects path with command injection attempt', async () => {
-    const result = await grader({
-      input: 'Create a file',
-      output: 'Created file',
-      cwd: '/tmp/test; rm -rf /', // Command injection attempt
-    })
-
-    expect(result.pass).toBe(false)
-    expect(result.score).toBe(0)
-    expect(result.reasoning).toContain('Invalid working directory path')
-  })
-
-  test('rejects path with directory traversal', async () => {
-    const result = await grader({
-      input: 'Create a file',
-      output: 'Created file',
-      cwd: '/tmp/../../../etc', // Directory traversal
-    })
-
-    expect(result.pass).toBe(false)
-    expect(result.score).toBe(0)
-    expect(result.reasoning).toContain('Invalid working directory path')
-  })
-
-  test('rejects path with shell metacharacters', async () => {
-    const dangerousPaths = ['/tmp/test$(whoami)', '/tmp/test`id`', '/tmp/test|cat', '/tmp/test&echo', '/tmp/test>out']
-
-    for (const path of dangerousPaths) {
-      const result = await grader({
-        input: 'Create a file',
-        output: 'Created file',
-        cwd: path,
-      })
-
-      expect(result.pass).toBe(false)
-      expect(result.score).toBe(0)
-      expect(result.reasoning).toContain('Invalid working directory path')
-    }
-  })
-})
diff --git a/src/schemas/tests/grader-loader.spec.ts b/src/schemas/tests/grader-loader.spec.ts
deleted file mode 100644
index 49f8517..0000000
--- a/src/schemas/tests/grader-loader.spec.ts
+++ /dev/null
@@ -1,153 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-import { join } from 'node:path'
-import { loadGrader } from '../grader-loader.ts'
-
-const fixturesDir = join(import.meta.dir, 'fixtures')
-
-// ============================================================================
-// Module Graders (TypeScript/JavaScript)
-// ============================================================================
-
-describe('loadGrader - module graders', () => {
-  test('loads TypeScript grader module', async () => {
-    const grader = await loadGrader(join(fixturesDir, 'grader-module.ts'))
-
-    const result = await grader({
-      input: 'What is 2+2?',
-      output: 'The answer is 4',
-      hint: '4',
-    })
-
-    expect(result.pass).toBe(true)
-    expect(result.score).toBe(1.0)
-    expect(result.reasoning).toBe('Contains expected text')
-  })
-
-  test('fails when module does not export grade function', async () => {
-    await expect(loadGrader(join(fixturesDir, 'grader-bad-module.ts'))).rejects.toThrow(
-      "Grader module must export a 'grade' function",
-    )
-  })
-
-  test('fails when module does not exist', async () => {
-    await expect(loadGrader(join(fixturesDir, 'nonexistent.ts'))).rejects.toThrow('Grader not found')
-  })
-})
-
-// ============================================================================
-// Executable Graders (Python, etc.)
-// ============================================================================
-
-describe('loadGrader - executable graders', () => {
-  test('loads and executes Python grader', async () => {
-    const grader = await loadGrader(join(fixturesDir, 'grader-exec.py'))
-
-    const result = await grader({
-      input: 'What is 2+2?',
-      output: 'The answer is 4',
-      hint: '4',
-    })
-
-    expect(result.pass).toBe(true)
-    expect(result.score).toBe(1.0)
-    expect(result.reasoning).toBe('Contains expected')
-  })
-
-  test('Python grader returns pass=false when expected not in output', async () => {
-    const grader = await loadGrader(join(fixturesDir, 'grader-exec.py'))
-
-    const result = await grader({
-      input: 'What is 2+2?',
-      output: 'I do not know',
-      hint: '4',
-    })
-
-    expect(result.pass).toBe(false)
-    expect(result.score).toBe(0.0)
-  })
-
-  test('throws when executable exits with non-zero code', async () => {
-    const grader = await loadGrader(join(fixturesDir, 'grader-exec-fail.py'))
-
-    await expect(
-      grader({
-        input: 'test',
-        output: 'test',
-      }),
-    ).rejects.toThrow('Grader exited with code 1')
-  })
-
-  test('throws when executable outputs invalid JSON', async () => {
-    const grader = await loadGrader(join(fixturesDir, 'grader-exec-invalid.py'))
-
-    await expect(
-      grader({
-        input: 'test',
-        output: 'test',
-      }),
-    ).rejects.toThrow('Grader output is not valid JSON')
-  })
-
-  test('fails when executable does not exist', async () => {
-    await expect(loadGrader(join(fixturesDir, 'nonexistent.py'))).rejects.toThrow('Grader not found')
-  })
-})
-
-// ============================================================================
-// Extension Detection
-// ============================================================================
-
-describe('loadGrader - extension detection', () => {
-  test('detects .ts as module', async () => {
-    const grader = await loadGrader(join(fixturesDir, 'grader-module.ts'))
-    // If this doesn't throw, it was loaded as a module (not executed)
-    expect(grader).toBeInstanceOf(Function)
-  })
-
-  test('detects .py as executable', async () => {
-    const grader = await loadGrader(join(fixturesDir, 'grader-exec.py'))
-    expect(grader).toBeInstanceOf(Function)
-  })
-})
-
-// ============================================================================
-// Trajectory Support
-// ============================================================================
-
-describe('loadGrader - trajectory support', () => {
-  test('passes trajectory to module grader', async () => {
-    const grader = await loadGrader(join(fixturesDir, 'grader-module.ts'))
-
-    const trajectory = [
-      { type: 'message' as const, content: 'Hello', timestamp: 0 },
-      { type: 'tool_call' as const, name: 'read', status: 'completed', timestamp: 100 },
-    ]
-
-    const result = await grader({
-      input: 'test',
-      output: 'The answer is 4',
-      hint: '4',
-      trajectory,
-    })
-
-    expect(result.pass).toBe(true)
-  })
-
-  test('passes trajectory to executable grader', async () => {
-    const grader = await loadGrader(join(fixturesDir, 'grader-exec.py'))
-
-    const trajectory = [
-      { type: 'message' as const, content: 'Hello', timestamp: 0 },
-      { type: 'tool_call' as const, name: 'read', status: 'completed', timestamp: 100 },
-    ]
-
-    const result = await grader({
-      input: 'test',
-      output: 'The answer is 4',
-      hint: '4',
-      trajectory,
-    })
-
-    expect(result.pass).toBe(true)
-  })
-})
diff --git a/src/schemas/tests/schemas-cli.spec.ts b/src/schemas/tests/schemas-cli.spec.ts
deleted file mode 100644
index 2c81480..0000000
--- a/src/schemas/tests/schemas-cli.spec.ts
+++ /dev/null
@@ -1,142 +0,0 @@
-import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
-import { runSchemas } from '../schemas-cli.ts'
-
-// ============================================================================
-// runSchemas
-// ============================================================================
-
-describe('runSchemas', () => {
-  const testOutputDir = '/tmp/agent-eval-harness-test-schemas'
-
-  beforeEach(async () => {
-    // Clean up test directory
-    await Bun.$`rm -rf ${testOutputDir}`.nothrow()
-  })
-
-  afterEach(async () => {
-    // Clean up test directory
-    await Bun.$`rm -rf ${testOutputDir}`.nothrow()
-  })
-
-  describe('list mode', () => {
-    test('returns array of schema names', async () => {
-      const result = await runSchemas({ list: true })
-      expect(Array.isArray(result)).toBe(true)
-      const names = result as string[]
-      expect(names).toContain('PromptCase')
-      expect(names).toContain('CaptureResult')
-      expect(names).toContain('GraderResult')
-    })
-  })
-
-  describe('single schema mode', () => {
-    test('returns single schema by name', async () => {
-      const result = await runSchemas({ schemaName: 'PromptCase', json: true })
-      expect(typeof result).toBe('object')
-      const schemas = result as Record<string, object>
-      expect(schemas.PromptCase).toBeDefined()
-      expect(schemas.PromptCase).toHaveProperty('$schema')
-      expect(schemas.PromptCase).toHaveProperty('title', 'PromptCase')
-    })
-
-    test('writes schema to file when outputPath provided', async () => {
-      const outputPath = `${testOutputDir}/prompt-case.json`
-      await Bun.$`mkdir -p ${testOutputDir}`
-
-      await runSchemas({
-        schemaName: 'GraderResult',
-        outputPath,
-      })
-
-      const content = await Bun.file(outputPath).text()
-      const schema = JSON.parse(content)
-      expect(schema.title).toBe('GraderResult')
-      expect(schema.$schema).toBe('https://json-schema.org/draft/2020-12/schema')
-    })
-  })
-
-  describe('all schemas mode', () => {
-    test('returns all schemas as object', async () => {
-      const result = await runSchemas({ json: true })
-      expect(typeof result).toBe('object')
-      const schemas = result as Record<string, object>
-
-      // Check a sampling of expected schemas
-      expect(schemas.PromptCase).toBeDefined()
-      expect(schemas.CaptureResult).toBeDefined()
-      expect(schemas.GraderResult).toBeDefined()
-      expect(schemas.TrajectoryStep).toBeDefined()
-      expect(schemas.Session).toBeDefined()
-    })
-
-    test('writes all schemas to single file', async () => {
-      const outputPath = `${testOutputDir}/all-schemas.json`
-      await Bun.$`mkdir -p ${testOutputDir}`
-
-      await runSchemas({
-        json: true,
-        outputPath,
-      })
-
-      const content = await Bun.file(outputPath).text()
-      const schemas = JSON.parse(content)
-      expect(schemas.PromptCase).toBeDefined()
-      expect(schemas.CaptureResult).toBeDefined()
-    })
-
-    test('splits schemas into separate files', async () => {
-      await runSchemas({
-        json: true,
-        split: true,
-        outputPath: testOutputDir,
-      })
-
-      // Check that individual files were created
-      const promptCaseExists = await Bun.file(`${testOutputDir}/PromptCase.json`).exists()
-      const captureResultExists = await Bun.file(`${testOutputDir}/CaptureResult.json`).exists()
-      const graderResultExists = await Bun.file(`${testOutputDir}/GraderResult.json`).exists()
-
-      expect(promptCaseExists).toBe(true)
-      expect(captureResultExists).toBe(true)
-      expect(graderResultExists).toBe(true)
-
-      // Verify content
-      const promptCaseContent = await Bun.file(`${testOutputDir}/PromptCase.json`).text()
-      const promptCaseSchema = JSON.parse(promptCaseContent)
-      expect(promptCaseSchema.title).toBe('PromptCase')
-    })
-  })
-
-  describe('schema content validation', () => {
-    test('PromptCase schema has correct structure', async () => {
-      const result = await runSchemas({ schemaName: 'PromptCase', json: true })
-      const schemas = result as Record<string, object>
-      const schema = schemas.PromptCase as Record<string, unknown>
-
-      expect(schema.$schema).toBe('https://json-schema.org/draft/2020-12/schema')
-      expect(schema.title).toBe('PromptCase')
-      expect(schema.type).toBe('object')
-
-      // Check properties exist
-      const properties = schema.properties as Record<string, unknown>
-      expect(properties).toBeDefined()
-      expect(properties.id).toBeDefined()
-      expect(properties.input).toBeDefined()
-    })
-
-    test('GraderResult schema has correct constraints', async () => {
-      const result = await runSchemas({ schemaName: 'GraderResult', json: true })
-      const schemas = result as Record<string, object>
-      const schema = schemas.GraderResult as Record<string, unknown>
-
-      expect(schema.type).toBe('object')
-      const properties = schema.properties as Record<string, Record<string, unknown>>
-      expect(properties.pass).toBeDefined()
-      expect(properties.score).toBeDefined()
-      expect(properties.pass?.type).toBe('boolean')
-      expect(properties.score?.type).toBe('number')
-      expect(properties.score?.minimum).toBe(0)
-      expect(properties.score?.maximum).toBe(1)
-    })
-  })
-})
diff --git a/src/schemas/tests/schemas.spec.ts b/src/schemas/tests/schemas.spec.ts
deleted file mode 100644
index 7df25a3..0000000
--- a/src/schemas/tests/schemas.spec.ts
+++ /dev/null
@@ -1,606 +0,0 @@
-import { describe, expect, test } from 'bun:test'
-import {
-  CaptureResultSchema,
-  EnvVariableSchema,
-  GraderResultSchema,
-  HttpHeaderSchema,
-  JsonRpcErrorResponseSchema,
-  JsonRpcErrorSchema,
-  JsonRpcMessageSchema,
-  JsonRpcNotificationSchema,
-  JsonRpcRequestSchema,
-  JsonRpcResponseSchema,
-  JsonRpcSuccessResponseSchema,
-  McpServerHttpSchema,
-  McpServerSchema,
-  McpServerStdioSchema,
-  MessageStepSchema,
-  PlanStepSchema,
-  PromptCaseSchema,
-  SessionSchema,
-  ThoughtStepSchema,
-  TimingSchema,
-  ToolCallStepSchema,
-  ToolInputSchema,
-  TrajectoryRichnessSchema,
-  TrajectoryStepSchema,
-} from '../schemas.ts'
-
-// ============================================================================
-// Session Schema
-// ============================================================================
-
-describe('SessionSchema', () => {
-  test('parses valid session', () => {
-    const result = SessionSchema.safeParse({ id: 'sess_123' })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses session with _meta', () => {
-    const result = SessionSchema.safeParse({ id: 'sess_123', _meta: { key: 'value' } })
-    expect(result.success).toBe(true)
-  })
-
-  test('rejects session without id', () => {
-    const result = SessionSchema.safeParse({})
-    expect(result.success).toBe(false)
-  })
-})
-
-// ============================================================================
-// JSON-RPC Schemas
-// ============================================================================
-
-describe('JsonRpcRequestSchema', () => {
-  test('parses valid request', () => {
-    const result = JsonRpcRequestSchema.safeParse({
-      jsonrpc: '2.0',
-      id: 1,
-      method: 'test',
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses request with params', () => {
-    const result = JsonRpcRequestSchema.safeParse({
-      jsonrpc: '2.0',
-      id: 'abc',
-      method: 'test',
-      params: { foo: 'bar' },
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('rejects invalid jsonrpc version', () => {
-    const result = JsonRpcRequestSchema.safeParse({
-      jsonrpc: '1.0',
-      id: 1,
-      method: 'test',
-    })
-    expect(result.success).toBe(false)
-  })
-})
-
-describe('JsonRpcNotificationSchema', () => {
-  test('parses notification without id', () => {
-    const result = JsonRpcNotificationSchema.safeParse({
-      jsonrpc: '2.0',
-      method: 'notify',
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('JsonRpcErrorSchema', () => {
-  test('parses error with code and message', () => {
-    const result = JsonRpcErrorSchema.safeParse({
-      code: -32600,
-      message: 'Invalid request',
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses error with data', () => {
-    const result = JsonRpcErrorSchema.safeParse({
-      code: -32603,
-      message: 'Internal error',
-      data: { details: 'something went wrong' },
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('JsonRpcSuccessResponseSchema', () => {
-  test('parses success response', () => {
-    const result = JsonRpcSuccessResponseSchema.safeParse({
-      jsonrpc: '2.0',
-      id: 1,
-      result: { data: 'test' },
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('JsonRpcErrorResponseSchema', () => {
-  test('parses error response with id', () => {
-    const result = JsonRpcErrorResponseSchema.safeParse({
-      jsonrpc: '2.0',
-      id: 1,
-      error: { code: -32600, message: 'Invalid' },
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses error response with null id', () => {
-    const result = JsonRpcErrorResponseSchema.safeParse({
-      jsonrpc: '2.0',
-      id: null,
-      error: { code: -32700, message: 'Parse error' },
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('JsonRpcResponseSchema', () => {
-  test('parses success response', () => {
-    const result = JsonRpcResponseSchema.safeParse({
-      jsonrpc: '2.0',
-      id: 1,
-      result: 'ok',
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses error response', () => {
-    const result = JsonRpcResponseSchema.safeParse({
-      jsonrpc: '2.0',
-      id: 1,
-      error: { code: -32600, message: 'Invalid' },
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('JsonRpcMessageSchema', () => {
-  test('parses request', () => {
-    const result = JsonRpcMessageSchema.safeParse({
-      jsonrpc: '2.0',
-      id: 1,
-      method: 'test',
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses notification', () => {
-    const result = JsonRpcMessageSchema.safeParse({
-      jsonrpc: '2.0',
-      method: 'notify',
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses response', () => {
-    const result = JsonRpcMessageSchema.safeParse({
-      jsonrpc: '2.0',
-      id: 1,
-      result: 'ok',
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-// ============================================================================
-// MCP Server Schemas
-// ============================================================================
-
-describe('EnvVariableSchema', () => {
-  test('parses valid env variable', () => {
-    const result = EnvVariableSchema.safeParse({ name: 'API_KEY', value: 'secret' })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('HttpHeaderSchema', () => {
-  test('parses valid header', () => {
-    const result = HttpHeaderSchema.safeParse({ name: 'Authorization', value: 'Bearer token' })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('McpServerStdioSchema', () => {
-  test('parses stdio config with optional type', () => {
-    const result = McpServerStdioSchema.safeParse({
-      name: 'test-server',
-      command: 'node',
-      args: ['server.js'],
-      env: [],
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses stdio config with explicit type', () => {
-    const result = McpServerStdioSchema.safeParse({
-      type: 'stdio',
-      name: 'test-server',
-      command: 'bun',
-      args: ['run', 'server.ts'],
-      env: [{ name: 'DEBUG', value: 'true' }],
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('McpServerHttpSchema', () => {
-  test('parses http config', () => {
-    const result = McpServerHttpSchema.safeParse({
-      type: 'http',
-      name: 'api-server',
-      url: 'https://api.example.com',
-      headers: [{ name: 'Authorization', value: 'Bearer token' }],
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('McpServerSchema', () => {
-  test('parses stdio server', () => {
-    const result = McpServerSchema.safeParse({
-      name: 'test',
-      command: 'node',
-      args: [],
-      env: [],
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses http server', () => {
-    const result = McpServerSchema.safeParse({
-      type: 'http',
-      name: 'test',
-      url: 'https://example.com',
-      headers: [],
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-// ============================================================================
-// Prompt Case Schema
-// ============================================================================
-
-describe('PromptCaseSchema', () => {
-  test('parses minimal prompt case', () => {
-    const result = PromptCaseSchema.safeParse({
-      id: 'test-1',
-      input: 'Hello',
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses single-turn with hint', () => {
-    const result = PromptCaseSchema.safeParse({
-      id: 'test-1',
-      input: 'What is 2+2?',
-      hint: '4',
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses multi-turn input array', () => {
-    const result = PromptCaseSchema.safeParse({
-      id: 'test-1',
-      input: ['Hello', 'How are you?', 'Goodbye'],
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses full prompt case with all fields', () => {
-    const result = PromptCaseSchema.safeParse({
-      id: 'test-1',
-      input: ['Hello'],
-      hint: 'greeting',
-      reference: 'Hi there!',
-      metadata: { category: 'test', difficulty: 'easy' },
-      timeout: 30000,
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('rejects missing id', () => {
-    const result = PromptCaseSchema.safeParse({ input: 'Hello' })
-    expect(result.success).toBe(false)
-  })
-
-  test('rejects missing input', () => {
-    const result = PromptCaseSchema.safeParse({ id: 'test-1' })
-    expect(result.success).toBe(false)
-  })
-})
-
-// ============================================================================
-// Grader Result Schema
-// ============================================================================
-
-describe('GraderResultSchema', () => {
-  test('parses valid grader result', () => {
-    const result = GraderResultSchema.safeParse({
-      pass: true,
-      score: 1.0,
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses result with reasoning', () => {
-    const result = GraderResultSchema.safeParse({
-      pass: false,
-      score: 0.5,
-      reasoning: 'Partial match',
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('rejects score below 0', () => {
-    const result = GraderResultSchema.safeParse({
-      pass: false,
-      score: -0.1,
-    })
-    expect(result.success).toBe(false)
-  })
-
-  test('rejects score above 1', () => {
-    const result = GraderResultSchema.safeParse({
-      pass: true,
-      score: 1.5,
-    })
-    expect(result.success).toBe(false)
-  })
-})
-
-// ============================================================================
-// Trajectory Step Schemas
-// ============================================================================
-
-describe('ToolInputSchema', () => {
-  test('parses file path input', () => {
-    const result = ToolInputSchema.safeParse({ file_path: '/src/index.ts' })
-    expect(result.success).toBe(true)
-    if (result.success) {
-      expect(result.data.file_path).toBe('/src/index.ts')
-    }
-  })
-
-  test('allows additional properties via passthrough', () => {
-    const result = ToolInputSchema.safeParse({
-      file_path: '/test.ts',
-      custom_field: 'value',
-    })
-    expect(result.success).toBe(true)
-    if (result.success) {
-      expect(result.data.custom_field).toBe('value')
-    }
-  })
-})
-
-describe('ThoughtStepSchema', () => {
-  test('parses thought step', () => {
-    const result = ThoughtStepSchema.safeParse({
-      type: 'thought',
-      content: 'Thinking about the problem...',
-      timestamp: 1234567890,
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('MessageStepSchema', () => {
-  test('parses message step', () => {
-    const result = MessageStepSchema.safeParse({
-      type: 'message',
-      content: 'Hello, world!',
-      timestamp: 1234567890,
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('ToolCallStepSchema', () => {
-  test('parses tool call step', () => {
-    const result = ToolCallStepSchema.safeParse({
-      type: 'tool_call',
-      name: 'Read',
-      status: 'completed',
-      timestamp: 1234567890,
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses tool call with input/output', () => {
-    const result = ToolCallStepSchema.safeParse({
-      type: 'tool_call',
-      name: 'Write',
-      status: 'completed',
-      input: { file_path: '/test.ts', content: 'code' },
-      output: 'File written',
-      duration: 150,
-      timestamp: 1234567890,
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('PlanStepSchema', () => {
-  test('parses plan step', () => {
-    const result = PlanStepSchema.safeParse({
-      type: 'plan',
-      entries: [{ task: 'Step 1' }, { task: 'Step 2' }],
-      timestamp: 1234567890,
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-describe('TrajectoryStepSchema', () => {
-  test('discriminates thought type', () => {
-    const result = TrajectoryStepSchema.safeParse({
-      type: 'thought',
-      content: 'test',
-      timestamp: 123,
-    })
-    expect(result.success).toBe(true)
-    if (result.success) {
-      expect(result.data.type).toBe('thought')
-    }
-  })
-
-  test('discriminates message type', () => {
-    const result = TrajectoryStepSchema.safeParse({
-      type: 'message',
-      content: 'test',
-      timestamp: 123,
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('discriminates tool_call type', () => {
-    const result = TrajectoryStepSchema.safeParse({
-      type: 'tool_call',
-      name: 'Test',
-      status: 'completed',
-      timestamp: 123,
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('discriminates plan type', () => {
-    const result = TrajectoryStepSchema.safeParse({
-      type: 'plan',
-      entries: [],
-      timestamp: 123,
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('rejects unknown type', () => {
-    const result = TrajectoryStepSchema.safeParse({
-      type: 'unknown',
-      timestamp: 123,
-    })
-    expect(result.success).toBe(false)
-  })
-})
-
-// ============================================================================
-// Trajectory Richness Schema
-// ============================================================================
-
-describe('TrajectoryRichnessSchema', () => {
-  test('parses full', () => {
-    expect(TrajectoryRichnessSchema.safeParse('full').success).toBe(true)
-  })
-
-  test('parses minimal', () => {
-    expect(TrajectoryRichnessSchema.safeParse('minimal').success).toBe(true)
-  })
-
-  test('parses messages-only', () => {
-    expect(TrajectoryRichnessSchema.safeParse('messages-only').success).toBe(true)
-  })
-
-  test('rejects invalid value', () => {
-    expect(TrajectoryRichnessSchema.safeParse('invalid').success).toBe(false)
-  })
-})
-
-// ============================================================================
-// Timing Schema
-// ============================================================================
-
-describe('TimingSchema', () => {
-  test('parses timing with required fields', () => {
-    const result = TimingSchema.safeParse({
-      start: 1000,
-      end: 2000,
-      sessionCreation: 100,
-      total: 1000,
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses timing with optional fields', () => {
-    const result = TimingSchema.safeParse({
-      start: 1000,
-      end: 2000,
-      firstResponse: 500,
-      sessionCreation: 100,
-      total: 1000,
-      inputTokens: 50,
-      outputTokens: 100,
-    })
-    expect(result.success).toBe(true)
-  })
-})
-
-// ============================================================================
-// Capture Result Schema
-// ============================================================================
-
-describe('CaptureResultSchema', () => {
-  test('parses minimal capture result', () => {
-    const result = CaptureResultSchema.safeParse({
-      id: 'test-1',
-      input: 'Hello',
-      output: 'Hi there!',
-      trajectory: [],
-      metadata: {},
-      timing: {
-        start: 1000,
-        end: 2000,
-        sessionCreation: 100,
-        total: 1000,
-      },
-      toolErrors: false,
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses capture result with multi-turn input', () => {
-    const result = CaptureResultSchema.safeParse({
-      id: 'test-1',
-      input: ['Hello', 'Goodbye'],
-      output: 'Bye!',
-      trajectory: [
-        { type: 'message', content: 'Hi', timestamp: 100 },
-        { type: 'message', content: 'Bye!', timestamp: 200 },
-      ],
-      metadata: { agent: 'test-agent', turnCount: 2 },
-      timing: {
-        start: 1000,
-        end: 2000,
-        sessionCreation: 100,
-        total: 1000,
-      },
-      toolErrors: false,
-    })
-    expect(result.success).toBe(true)
-  })
-
-  test('parses capture result with hint and score', () => {
-    const result = CaptureResultSchema.safeParse({
-      id: 'test-1',
-      input: 'What is 2+2?',
-      output: '4',
-      hint: '4',
-      trajectory: [],
-      metadata: {},
-      timing: {
-        start: 1000,
-        end: 2000,
-        sessionCreation: 100,
-        total: 1000,
-      },
-      toolErrors: false,
-      score: { pass: true, score: 1.0 },
-    })
-    expect(result.success).toBe(true)
-  })
-})
diff --git a/src/tests/trial.spec.ts b/src/tests/trial.spec.ts
new file mode 100644
index 0000000..d19656e
--- /dev/null
+++ b/src/tests/trial.spec.ts
@@ -0,0 +1,723 @@
+/**
+ * Tests for the trial runner.
+ *
+ * @remarks
+ * Covers: runTrial (k=1, k=3), pass@k math, grading, concurrency,
+ * CLI contract (--help, --schema), loadPolyglot, and utility functions.
+ */
+
+import { afterEach, describe, expect, test } from 'bun:test'
+import { stat, unlink } from 'node:fs/promises'
+import { tmpdir } from 'node:os'
+import * as z from 'zod'
+import type { Adapter, Grader, PromptCase } from '../trial.schemas.ts'
+import { TrialResultSchema } from '../trial.schemas.ts'
+import { calculatePassAtK, calculatePassExpK, runTrial, TrialInputSchema, TrialOutputSchema } from '../trial.ts'
+import {
+  createWorkspaceDir,
+  createWriteMutex,
+  detectRichness,
+  hasToolErrors,
+  loadJsonl,
+  loadPrompts,
+  resolvePath,
+  runWorkerPool,
+} from '../trial.utils.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+/** Echo adapter — returns input as output */
+const echoAdapter: Adapter = async ({ prompt }) => ({
+  output: Array.isArray(prompt) ? prompt.join('\n') : prompt,
+})
+
+/** Slow echo adapter — adds delay per trial for concurrency testing */
+const slowEchoAdapter: Adapter = async ({ prompt }) => {
+  await new Promise((resolve) => setTimeout(resolve, 50))
+  return { output: Array.isArray(prompt) ? prompt.join('\n') : prompt }
+}
+
+/** Adapter that includes trajectory */
+const richAdapter: Adapter = async ({ prompt }) => {
+  const text = Array.isArray(prompt) ? prompt.join('\n') : prompt
+  return {
+    output: text,
+    trajectory: [
+      { type: 'thought' as const, content: 'thinking...', timestamp: Date.now() },
+      { type: 'message' as const, content: text, timestamp: Date.now() },
+    ],
+    timing: { total: 100, inputTokens: 10, outputTokens: 20 },
+  }
+}
+
+/** Adapter that always fails */
+const failingAdapter: Adapter = async () => {
+  throw new Error('Adapter exploded')
+}
+
+/** Always-pass grader */
+const passGrader: Grader = async ({ output }) => ({
+  pass: true,
+  score: 1.0,
+  reasoning: `Output: ${output.slice(0, 20)}`,
+})
+
+/** Flaky grader — deterministically alternates: passes on even-numbered calls, fails on odd-numbered ones */
+let flakyCounter = 0
+const flakyGrader: Grader = async () => {
+  const pass = flakyCounter++ % 2 === 0
+  return { pass, score: pass ? 1.0 : 0.0 }
+}
+
+const samplePrompts: PromptCase[] = [
+  { id: 'p1', input: 'Hello world' },
+  { id: 'p2', input: 'Goodbye world' },
+]
+
+// ============================================================================
+// Temp file tracking
+// ============================================================================
+
+const tempFiles: string[] = []
+const tempFile = (name: string) => {
+  // Timestamp goes before name so file extension stays at the end
+  // (isJsModule checks extension for polyglot loader)
+  const path = `${tmpdir()}/trial-${Date.now()}-${name}`
+  tempFiles.push(path)
+  return path
+}
+
+afterEach(async () => {
+  for (const f of tempFiles) {
+    await unlink(f).catch(() => {})
+  }
+  tempFiles.length = 0
+  flakyCounter = 0
+})
+
+// ============================================================================
+// Pass@k Math
+// ============================================================================
+
+describe('calculatePassAtK', () => {
+  test('all pass', () => {
+    expect(calculatePassAtK(5, 5)).toBe(1)
+  })
+
+  test('none pass', () => {
+    expect(calculatePassAtK(0, 5)).toBe(0)
+  })
+
+  test('partial pass', () => {
+    const result = calculatePassAtK(3, 5)
+    // 1 - (1 - 0.6)^5 = 1 - 0.4^5 ≈ 0.9898
+    expect(result).toBeCloseTo(0.9898, 3)
+  })
+
+  test('single trial pass', () => {
+    expect(calculatePassAtK(1, 1)).toBe(1)
+  })
+
+  test('single trial fail', () => {
+    expect(calculatePassAtK(0, 1)).toBe(0)
+  })
+})
+
+describe('calculatePassExpK', () => {
+  test('all pass', () => {
+    expect(calculatePassExpK(5, 5)).toBe(1)
+  })
+
+  test('none pass', () => {
+    expect(calculatePassExpK(0, 5)).toBe(0)
+  })
+
+  test('partial pass', () => {
+    const result = calculatePassExpK(3, 5)
+    // 0.6^5 ≈ 0.0778
+    expect(result).toBeCloseTo(0.0778, 3)
+  })
+})
+
+// ============================================================================
+// runTrial — Core Library Function
+// ============================================================================
+
+describe('runTrial', () => {
+  test('k=1 single trial per prompt', async () => {
+    const results = await runTrial({
+      adapter: echoAdapter,
+      prompts: samplePrompts,
+    })
+
+    expect(results).toHaveLength(2)
+    const first = results[0]
+    expect(first).toBeDefined()
+    expect(first!.id).toBe('p1')
+    expect(first!.k).toBe(1)
+    expect(first!.trials).toHaveLength(1)
+    const firstTrial = first!.trials[0]
+    expect(firstTrial).toBeDefined()
+    expect(firstTrial!.output).toBe('Hello world')
+    expect(firstTrial!.trialNum).toBe(1)
+    expect(firstTrial!.duration).toBeGreaterThanOrEqual(0)
+  })
+
+  test('k=3 multiple trials per prompt', async () => {
+    const results = await runTrial({
+      adapter: echoAdapter,
+      prompts: [{ id: 'multi', input: 'test' }],
+      k: 3,
+    })
+
+    expect(results).toHaveLength(1)
+    const r = results[0]
+    expect(r).toBeDefined()
+    expect(r!.k).toBe(3)
+    expect(r!.trials).toHaveLength(3)
+    const t0 = r!.trials[0]
+    const t1 = r!.trials[1]
+    const t2 = r!.trials[2]
+    expect(t0).toBeDefined()
+    expect(t1).toBeDefined()
+    expect(t2).toBeDefined()
+    expect(t0!.trialNum).toBe(1)
+    expect(t1!.trialNum).toBe(2)
+    expect(t2!.trialNum).toBe(3)
+    // All trials should produce the same output for echo adapter
+    for (const trial of r!.trials) {
+      expect(trial.output).toBe('test')
+    }
+  })
+
+  test('multi-turn input', async () => {
+    const results = await runTrial({
+      adapter: echoAdapter,
+      prompts: [{ id: 'mt', input: ['Hello', 'World'] }],
+    })
+
+    const r = results[0]
+    expect(r).toBeDefined()
+    const trial = r!.trials[0]
+    expect(trial).toBeDefined()
+    expect(trial!.output).toBe('Hello\nWorld')
+    expect(r!.input).toEqual(['Hello', 'World'])
+  })
+
+  test('with grader computes metrics', async () => {
+    const results = await runTrial({
+      adapter: echoAdapter,
+      prompts: [{ id: 'graded', input: 'test', hint: 'should pass' }],
+      grader: passGrader,
+      k: 3,
+    })
+
+    const r = results[0]
+    expect(r).toBeDefined()
+    expect(r!.passRate).toBe(1)
+    expect(r!.passAtK).toBe(1)
+    expect(r!.passExpK).toBe(1)
+    expect(r!.hint).toBe('should pass')
+    for (const trial of r!.trials) {
+      expect(trial.pass).toBe(true)
+      expect(trial.score).toBe(1.0)
+      expect(trial.reasoning).toBeDefined()
+    }
+  })
+
+  test('flaky grader produces partial metrics', async () => {
+    const results = await runTrial({
+      adapter: echoAdapter,
+      prompts: [{ id: 'flaky', input: 'test' }],
+      grader: flakyGrader,
+      k: 4,
+    })
+
+    const r = results[0]
+    expect(r).toBeDefined()
+    // flakyCounter: 0 (pass), 1 (fail), 2 (pass), 3 (fail) → 2/4
+    expect(r!.passRate).toBe(0.5)
+    expect(r!.passAtK).toBeDefined()
+    expect(r!.passExpK).toBeDefined()
+    expect(r!.passAtK!).toBeGreaterThan(0)
+    expect(r!.passAtK!).toBeLessThanOrEqual(1)
+    expect(r!.passExpK!).toBeGreaterThanOrEqual(0)
+    expect(r!.passExpK!).toBeLessThan(1)
+  })
+
+  test('adapter failure records error entry', async () => {
+    const results = await runTrial({
+      adapter: failingAdapter,
+      prompts: [{ id: 'fail', input: 'boom' }],
+    })
+
+    expect(results).toHaveLength(1)
+    const r = results[0]
+    expect(r).toBeDefined()
+    const trial = r!.trials[0]
+    expect(trial).toBeDefined()
+    expect(trial!.output).toBe('')
+    expect(trial!.pass).toBe(false)
+    expect(trial!.reasoning).toContain('Adapter exploded')
+    expect(trial!.duration).toBeGreaterThanOrEqual(0)
+  })
+
+  test('adapter timeout records timed out entry', async () => {
+    const slowAdapter: Adapter = async () => {
+      await new Promise((resolve) => setTimeout(resolve, 5000))
+      return { output: 'never' }
+    }
+
+    const results = await runTrial({
+      adapter: slowAdapter,
+      prompts: [{ id: 'timeout', input: 'slow' }],
+      timeout: 50,
+    })
+
+    const r = results[0]
+    expect(r).toBeDefined()
+    const trial = r!.trials[0]
+    expect(trial).toBeDefined()
+    expect(trial!.output).toBe('')
+    expect(trial!.timedOut).toBe(true)
+    expect(trial!.pass).toBe(false)
+  })
+
+  test('rich adapter includes trajectory and timing', async () => {
+    const results = await runTrial({
+      adapter: richAdapter,
+      prompts: [{ id: 'rich', input: 'data' }],
+    })
+
+    const r = results[0]
+    expect(r).toBeDefined()
+    const trial = r!.trials[0]
+    expect(trial).toBeDefined()
+    expect(trial!.trajectory).toBeDefined()
+    expect(trial!.trajectory).toHaveLength(2)
+    const traj0 = trial!.trajectory![0]
+    const traj1 = trial!.trajectory![1]
+    expect(traj0).toBeDefined()
+    expect(traj1).toBeDefined()
+    expect(traj0!.type).toBe('thought')
+    expect(traj1!.type).toBe('message')
+    expect(trial!.timing).toBeDefined()
+    expect(trial!.timing!.inputTokens).toBe(10)
+    expect(trial!.timing!.outputTokens).toBe(20)
+  })
+
+  test('writes JSONL to output file', async () => {
+    const outPath = tempFile('output.jsonl')
+
+    await runTrial({
+      adapter: echoAdapter,
+      prompts: samplePrompts,
+      outputPath: outPath,
+    })
+
+    const content = await Bun.file(outPath).text()
+    const lines = content.trim().split('\n')
+    expect(lines).toHaveLength(2)
+
+    const line0 = lines[0]
+    const line1 = lines[1]
+    expect(line0).toBeDefined()
+    expect(line1).toBeDefined()
+    const first = TrialResultSchema.parse(JSON.parse(line0!))
+    expect(first.id).toBe('p1')
+    const second = TrialResultSchema.parse(JSON.parse(line1!))
+    expect(second.id).toBe('p2')
+  })
+
+  test('append mode adds to existing file', async () => {
+    const outPath = tempFile('append.jsonl')
+    await Bun.write(outPath, '{"existing":"line"}\n')
+
+    await runTrial({
+      adapter: echoAdapter,
+      prompts: [{ id: 'appended', input: 'test' }],
+      outputPath: outPath,
+      append: true,
+    })
+
+    const lines = (await Bun.file(outPath).text()).trim().split('\n')
+    expect(lines).toHaveLength(2)
+    expect(lines[0]).toBe('{"existing":"line"}')
+    const appendedLine = lines[1]
+    expect(appendedLine).toBeDefined()
+    expect(JSON.parse(appendedLine!).id).toBe('appended')
+  })
+
+  test('metadata passes through', async () => {
+    const results = await runTrial({
+      adapter: echoAdapter,
+      prompts: [{ id: 'meta', input: 'test', metadata: { category: 'unit', difficulty: 'easy' } }],
+    })
+
+    const r = results[0]
+    expect(r).toBeDefined()
+    expect(r!.metadata).toEqual({ category: 'unit', difficulty: 'easy' })
+  })
+
+  test('schema validation on results', () => {
+    const result = TrialResultSchema.parse({
+      id: 'test',
+      input: 'hello',
+      k: 1,
+      trials: [{ trialNum: 1, output: 'hello', duration: 100 }],
+    })
+    expect(result.id).toBe('test')
+    expect(result.trials).toHaveLength(1)
+  })
+})
+
+// ============================================================================
+// Concurrency
+// ============================================================================
+
+describe('concurrency', () => {
+  test('concurrent workers process prompts in parallel', async () => {
+    const manyPrompts: PromptCase[] = Array.from({ length: 8 }, (_, i) => ({
+      id: `c${i}`,
+      input: `prompt ${i}`,
+    }))
+
+    const start = Date.now()
+    const results = await runTrial({
+      adapter: slowEchoAdapter,
+      prompts: manyPrompts,
+      concurrency: 4,
+    })
+    const elapsed = Date.now() - start
+
+    expect(results).toHaveLength(8)
+    // With concurrency=4 and 50ms per prompt, 8 prompts should take ~100ms.
+    // A fully sequential run would take about 8 × 50ms = 400ms, so that bound leaves a generous margin for CI.
+    expect(elapsed).toBeLessThan(400)
+  })
+
+  test('concurrent writes do not corrupt JSONL', async () => {
+    const outPath = tempFile('concurrent.jsonl')
+    const manyPrompts: PromptCase[] = Array.from({ length: 6 }, (_, i) => ({
+      id: `w${i}`,
+      input: `prompt ${i}`,
+    }))
+
+    await runTrial({
+      adapter: slowEchoAdapter,
+      prompts: manyPrompts,
+      concurrency: 3,
+      outputPath: outPath,
+    })
+
+    const lines = (await Bun.file(outPath).text()).trim().split('\n')
+    expect(lines).toHaveLength(6)
+    // Every line should be valid JSON
+    for (const line of lines) {
+      expect(() => JSON.parse(line)).not.toThrow()
+    }
+  })
+})
+
+// ============================================================================
+// CLI Contract
+// ============================================================================
+
+describe('CLI contract', () => {
+  test('--schema input emits JSON Schema', () => {
+    const schema = z.toJSONSchema(TrialInputSchema)
+    expect(schema.type).toBe('object')
+    expect(schema.properties).toBeDefined()
+    // Check key fields exist
+    const props = schema.properties as Record<string, unknown>
+    expect(props.adapterPath).toBeDefined()
+    expect(props.promptsPath).toBeDefined()
+    expect(props.k).toBeDefined()
+    expect(props.graderPath).toBeDefined()
+  })
+
+  test('--schema output emits JSON Schema', () => {
+    const schema = z.toJSONSchema(TrialOutputSchema)
+    expect(schema.type).toBe('array')
+  })
+
+  test('TrialInputSchema validates correct input', () => {
+    const result = TrialInputSchema.safeParse({
+      adapterPath: './adapter.ts',
+      promptsPath: './prompts.jsonl',
+      k: 5,
+    })
+    expect(result.success).toBe(true)
+  })
+
+  test('TrialInputSchema rejects missing adapterPath', () => {
+    const result = TrialInputSchema.safeParse({
+      promptsPath: './prompts.jsonl',
+    })
+    expect(result.success).toBe(false)
+  })
+
+  test('TrialInputSchema applies defaults', () => {
+    const result = TrialInputSchema.parse({
+      adapterPath: './adapter.ts',
+    })
+    expect(result.k).toBe(1)
+    expect(result.concurrency).toBe(1)
+    expect(result.progress).toBe(false)
+    expect(result.append).toBe(false)
+    expect(result.debug).toBe(false)
+  })
+})
+
+// ============================================================================
+// Utility Functions
+// ============================================================================
+
+describe('resolvePath', () => {
+  test('returns absolute paths unchanged', () => {
+    expect(resolvePath('/absolute/path')).toBe('/absolute/path')
+  })
+
+  test('resolves relative paths against cwd', () => {
+    const resolved = resolvePath('relative/path')
+    expect(resolved.startsWith('/')).toBe(true)
+    expect(resolved.endsWith('relative/path')).toBe(true)
+  })
+})
+
+describe('loadJsonl', () => {
+  // Happy path: one JSON object per line, returned in file order.
+  test('loads JSONL file', async () => {
+    const file = tempFile('test.jsonl')
+    await Bun.write(file, '{"a":1}\n{"a":2}\n{"a":3}\n')
+
+    const rows = await loadJsonl<{ a: number }>(file)
+    expect(rows).toHaveLength(3)
+    const [first, , last] = rows
+    expect(first).toBeDefined()
+    expect(last).toBeDefined()
+    expect(first?.a).toBe(1)
+    expect(last?.a).toBe(3)
+  })
+
+  test('skips empty lines', async () => {
+    const file = tempFile('sparse.jsonl')
+    await Bun.write(file, '{"a":1}\n\n{"a":2}\n')
+
+    const rows = await loadJsonl(file)
+    expect(rows).toHaveLength(2)
+  })
+
+  test('throws on invalid JSON', async () => {
+    const file = tempFile('invalid.jsonl')
+    await Bun.write(file, '{"a":1}\nnot-json\n')
+    // The malformed record sits on the second line, which the message must name.
+    await expect(loadJsonl(file)).rejects.toThrow('Invalid JSON at line 2')
+  })
+})
+
+describe('loadPrompts', () => {
+  test('validates against PromptCaseSchema', async () => {
+    const file = tempFile('prompts.jsonl')
+    // One single-turn case (string input) followed by one multi-turn case (array input).
+    await Bun.write(file, '{"id":"p1","input":"hello"}\n{"id":"p2","input":["a","b"]}\n')
+
+    const cases = await loadPrompts(file)
+    expect(cases).toHaveLength(2)
+    const [single, multi] = cases
+    expect(single).toBeDefined()
+    expect(multi).toBeDefined()
+    expect(single?.id).toBe('p1')
+    expect(multi?.input).toEqual(['a', 'b'])
+  })
+
+  test('rejects invalid prompts', async () => {
+    const file = tempFile('bad-prompts.jsonl')
+    await Bun.write(file, '{"id":"p1"}\n') // missing input
+
+    await expect(loadPrompts(file)).rejects.toThrow('Invalid prompt at line 1')
+  })
+})
+
+describe('runWorkerPool', () => {
+  test('sequential execution (concurrency=1)', async () => {
+    const items = [1, 2, 3]
+    const { results, errors } = await runWorkerPool(items, async (n) => n * 2, { concurrency: 1 })
+    expect(results).toEqual([2, 4, 6])
+    expect(errors).toHaveLength(0)
+  })
+
+  test('parallel execution', async () => {
+    const items = [1, 2, 3, 4]
+    const { results, errors } = await runWorkerPool(
+      items,
+      async (n) => {
+        await new Promise((r) => setTimeout(r, 10))
+        return n * 2
+      },
+      { concurrency: 2 },
+    )
+    expect(results).toHaveLength(4)
+    expect(errors).toHaveLength(0)
+    // Order is not guaranteed: compare a numerically sorted copy (default sort() is lexicographic)
+    expect([...results].sort((a, b) => a - b)).toEqual([2, 4, 6, 8])
+  })
+
+  test('collects errors without stopping', async () => {
+    const items = [1, 2, 3]
+    const { results, errors } = await runWorkerPool(
+      items,
+      async (n) => {
+        if (n === 2) throw new Error('boom')
+        return n
+      },
+      { concurrency: 1 },
+    )
+    expect(results).toEqual([1, 3])
+    expect(errors).toHaveLength(1)
+    const err0 = errors[0]
+    expect(err0).toBeDefined()
+    expect(err0!.index).toBe(1)
+    expect(err0!.error.message).toBe('boom')
+  })
+
+  test('progress callback', async () => {
+    const progress: number[] = []
+    await runWorkerPool([1, 2, 3], async (n) => n, {
+      concurrency: 1,
+      onProgress: (completed) => progress.push(completed),
+    })
+    expect(progress).toEqual([1, 2, 3])
+  })
+})
+
+describe('createWriteMutex', () => {
+  test('serializes concurrent writes', async () => {
+    const completed: number[] = []
+    const lock = createWriteMutex()
+    const pause = (ms: number) => new Promise((done) => setTimeout(done, ms))
+    // Delays shrink (30ms, 10ms, none): without serialization the tasks would finish 3, 2, 1.
+    const first = lock.write(async () => {
+      await pause(30)
+      completed.push(1)
+    })
+    const second = lock.write(async () => {
+      await pause(10)
+      completed.push(2)
+    })
+    const third = lock.write(async () => {
+      completed.push(3)
+    })
+    await Promise.all([first, second, third])
+
+    expect(completed).toEqual([1, 2, 3])
+  })
+})
+
+describe('createWorkspaceDir', () => {
+  test('creates directory with sanitized name', async () => {
+    // Timestamped base directory under the OS temp dir; removed again at the end of the test.
+    const root = `${tmpdir()}/trial-ws-${Date.now()}`
+    const created = await createWorkspaceDir(root, 'test:prompt/1')
+    expect(created).toContain('prompt-test_prompt_1')
+    const info = await stat(created)
+    expect(info.isDirectory()).toBe(true)
+    await Bun.$`rm -rf ${root}`.quiet()
+  })
+})
+
+describe('hasToolErrors', () => {
+  test('returns false for empty trajectory', () => {
+    expect(hasToolErrors([])).toBe(false)
+  })
+  // A completed tool_call alongside a plain message must not be reported as an error.
+  test('returns false when no tool calls failed', () => {
+    expect(
+      hasToolErrors([
+        { type: 'message', content: 'hello', timestamp: 0 },
+        { type: 'tool_call', name: 'test', status: 'completed', timestamp: 0 },
+      ]),
+    ).toBe(false)
+  })
+  // A single tool_call with status 'failed' is enough to flag the trajectory.
+  test('returns true when a tool call failed', () => {
+    expect(hasToolErrors([{ type: 'tool_call', name: 'test', status: 'failed', timestamp: 0 }])).toBe(true)
+  })
+})
+
+describe('detectRichness', () => {
+  test('empty trajectory is minimal', () => {
+    expect(detectRichness([])).toBe('minimal')
+  })
+  // Only message steps present: the expected level is 'messages-only'.
+  test('messages-only trajectory', () => {
+    expect(detectRichness([{ type: 'message', content: 'hello', timestamp: 0 }])).toBe('messages-only')
+  })
+  // A thought step next to a message lifts the expected level to 'full'.
+  test('trajectory with thoughts is full', () => {
+    expect(
+      detectRichness([
+        { type: 'thought', content: 'thinking', timestamp: 0 },
+        { type: 'message', content: 'hello', timestamp: 0 },
+      ]),
+    ).toBe('full')
+  })
+  // A lone tool_call step is expected to be classified as 'full' as well.
+  test('trajectory with tool calls is full', () => {
+    expect(detectRichness([{ type: 'tool_call', name: 'bash', status: 'completed', timestamp: 0 }])).toBe('full')
+  })
+})
+
+// ============================================================================
+// loadPolyglot (TS module)
+// ============================================================================
+
+describe('loadPolyglot', () => {
+  test('loadAdapter from TS module', async () => {
+    // Throwaway module exporting `adapt`, which echoes the prompt (arrays joined by a space)
+    const modulePath = tempFile('echo-adapter.ts')
+    await Bun.write(
+      modulePath,
+      `export const adapt = async ({ prompt }) => ({
+        output: Array.isArray(prompt) ? prompt.join(' ') : prompt,
+      })`,
+    )
+
+    const utils = await import('../trial.utils.ts')
+    const adapt = await utils.loadAdapter(modulePath)
+    const { output } = await adapt({ prompt: 'hello' })
+    expect(output).toBe('hello')
+  })
+
+  test('loadGrader from TS module', async () => {
+    const modulePath = tempFile('pass-grader.ts')
+    await Bun.write(
+      modulePath,
+      `export const grade = async ({ output }) => ({
+        pass: output.length > 0,
+        score: output.length > 0 ? 1.0 : 0.0,
+      })`,
+    )
+
+    const utils = await import('../trial.utils.ts')
+    const grade = await utils.loadGrader(modulePath)
+    const verdict = await grade({ input: 'test', output: 'hello' })
+    expect(verdict.pass).toBe(true)
+    expect(verdict.score).toBe(1.0)
+  })
+
+  test('rejects module without expected export', async () => {
+    const modulePath = tempFile('bad-module.ts')
+    await Bun.write(modulePath, 'export const wrong = () => {}')
+
+    const utils = await import('../trial.utils.ts')
+    await expect(utils.loadAdapter(modulePath)).rejects.toThrow("Module must export a 'adapt' function")
+  })
+
+  test('rejects non-existent file', async () => {
+    const utils = await import('../trial.utils.ts')
+    await expect(utils.loadAdapter('/nonexistent/adapter.ts')).rejects.toThrow('File not found')
+  })
+})
diff --git a/src/trial.constants.ts b/src/trial.constants.ts
new file mode 100644
index 0000000..a28f6bb
--- /dev/null
+++ b/src/trial.constants.ts
@@ -0,0 +1,11 @@
+/**
+ * Constants for the trial runner.
+ *
+ * @internal
+ */
+
+/** Default timeout in milliseconds (60s); runTrial applies it to each adapter call, i.e. per trial */
+export const DEFAULT_TIMEOUT = 60_000
+
+/** Default number of trials per prompt */
+export const DEFAULT_K = 1
diff --git a/src/trial.schemas.ts b/src/trial.schemas.ts
new file mode 100644
index 0000000..91c53f7
--- /dev/null
+++ b/src/trial.schemas.ts
@@ -0,0 +1,288 @@
+/**
+ * Zod schemas and types for the trial runner.
+ *
+ * @remarks
+ * Schema-first approach — Zod schemas are the single source of truth,
+ * TypeScript types derived via `z.infer<>`.
+ *
+ * TrajectoryStepSchema is inlined with 3 core step types:
+ * thought, message, tool_call.
+ *
+ * @packageDocumentation
+ */
+
+import * as z from 'zod'
+
+// ============================================================================
+// Trajectory Step (standalone — no agent-specific imports)
+// ============================================================================
+/** Step with free-text `content`, discriminated by `type: 'thought'` */
+export const ThoughtStepSchema = z.object({
+  type: z.literal('thought'),
+  content: z.string(),
+  timestamp: z.number(),
+  stepId: z.string().optional(),
+})
+/** Step with free-text `content`, discriminated by `type: 'message'` */
+export const MessageStepSchema = z.object({
+  type: z.literal('message'),
+  content: z.string(),
+  timestamp: z.number(),
+  stepId: z.string().optional(),
+})
+/** Tool invocation step; `status` accepts any string (the spec file uses 'completed' and 'failed') */
+export const ToolCallStepSchema = z.object({
+  type: z.literal('tool_call'),
+  name: z.string(),
+  status: z.string(),
+  input: z.unknown().optional(),
+  output: z.unknown().optional(),
+  duration: z.number().optional(),
+  timestamp: z.number(),
+  stepId: z.string().optional(),
+})
+/** Union of the three step shapes, discriminated on the `type` literal */
+export const TrajectoryStepSchema = z.discriminatedUnion('type', [
+  ThoughtStepSchema,
+  MessageStepSchema,
+  ToolCallStepSchema,
+])
+export type TrajectoryStep = z.infer<typeof TrajectoryStepSchema>
+
+// ============================================================================
+// Prompt Case
+// ============================================================================
+
+/**
+ * Prompt case schema for evaluation inputs.
+ *
+ * @remarks
+ * Each line in a prompts.jsonl file should match this schema.
+ * - Single turn: `input: "Hello"` — one prompt
+ * - Multi-turn: `input: ["Hello", "Follow up"]` — sequential turns
+ *
+ * @public
+ */
+export const PromptCaseSchema = z.object({
+  /** Unique identifier for the test case; copied to `TrialResult.id` */
+  id: z.string(),
+  /** Prompt text(s) — string for single turn, array for multi-turn */
+  input: z.union([z.string(), z.array(z.string())]),
+  /** Optional grader context hint; runTrial passes it to the grader as `hint` */
+  hint: z.string().optional(),
+  /** Optional reference solution (NOTE(review): runTrial does not pass this to the grader) */
+  reference: z.string().optional(),
+  /** Optional metadata for categorization; passed to the grader and copied onto the result */
+  metadata: z.record(z.string(), z.unknown()).optional(),
+  /** Optional per-case timeout override in milliseconds; replaces the run-level timeout for each trial */
+  timeout: z.number().optional(),
+})
+
+/** Prompt case type */
+export type PromptCase = z.infer<typeof PromptCaseSchema>
+
+// ============================================================================
+// Timing
+// ============================================================================
+
+/**
+ * Timing information from the adapter.
+ *
+ * @remarks
+ * Adapter-reported timing; every field is optional. `total` is the adapter's own
+ * measurement (may differ from the runner's wall-clock `duration` on TrialEntry).
+ * Token counts are adapter-dependent — only present if the adapter exposes them.
+ *
+ * @public
+ */
+export const TimingSchema = z.object({
+  /** Adapter-reported total duration in ms */
+  total: z.number().optional(),
+  /** Input tokens consumed */
+  inputTokens: z.number().optional(),
+  /** Output tokens generated */
+  outputTokens: z.number().optional(),
+})
+
+/** Timing type */
+export type Timing = z.infer<typeof TimingSchema>
+
+// ============================================================================
+// Adapter
+// ============================================================================
+
+/**
+ * Input passed to an adapter.
+ *
+ * @public
+ */
+export const AdapterInputSchema = z.object({
+  /** Single or multi-turn prompt (runTrial passes `PromptCase.input` unchanged) */
+  prompt: z.union([z.string(), z.array(z.string())]),
+  /** Working directory: the per-trial workspace when runTrial has `workspaceDir`, else its `cwd` option */
+  cwd: z.string().optional(),
+})
+
+/** Adapter input type */
+export type AdapterInput = z.infer<typeof AdapterInputSchema>
+
+/**
+ * Result returned by an adapter.
+ *
+ * @public
+ */
+export const AdapterResultSchema = z.object({
+  /** Final agent response text; the only required field */
+  output: z.string(),
+  /** Optional structured trajectory; forwarded to the grader by runTrial */
+  trajectory: z.array(TrajectoryStepSchema).optional(),
+  /** Optional timing from the adapter */
+  timing: TimingSchema.optional(),
+  /** Process exit code (null if signaled); runTrial copies it whenever it is not undefined */
+  exitCode: z.number().nullable().optional(),
+  /** Adapter-reported timeout flag; runTrial copies it onto the trial entry only when true */
+  timedOut: z.boolean().optional(),
+})
+
+/** Adapter result type */
+export type AdapterResult = z.infer<typeof AdapterResultSchema>
+
+/**
+ * Adapter function — runs a prompt against an agent and returns structured output.
+ *
+ * @remarks
+ * TS module adapters export this as `adapt`. Executable adapters receive
+ * `AdapterInput` on stdin and emit `AdapterResult` on stdout.
+ * A rejected promise is recorded by runTrial as a failed trial; it does not abort the run.
+ * @public
+ */
+export type Adapter = (input: AdapterInput) => Promise<AdapterResult>
+
+// ============================================================================
+// Grader
+// ============================================================================
+
+/**
+ * Grader result schema.
+ *
+ * @public
+ */
+export const GraderResultSchema = z.object({
+  /** Whether the output passes evaluation criteria; drives the pass metrics in runTrial */
+  pass: z.boolean(),
+  /** Numeric score from 0.0 to 1.0 (inclusive on both ends) */
+  score: z.number().min(0).max(1),
+  /** Optional explanation for the score; copied to `TrialEntry.reasoning` */
+  reasoning: z.string().optional(),
+  /** Optional structured outcome data; copied to `TrialEntry.outcome` when present */
+  outcome: z.record(z.string(), z.unknown()).optional(),
+})
+
+/** Grader result type */
+export type GraderResult = z.infer<typeof GraderResultSchema>
+
+/**
+ * Grader function — scores agent output.
+ *
+ * @remarks
+ * TS module graders export this as `grade`. Executable graders receive
+ * grader input on stdin and emit `GraderResult` on stdout.
+ * Field comments below describe what runTrial supplies; `PromptCase.reference` is not among them.
+ * @public
+ */
+export type Grader = (params: {
+  input: string | string[] // prompt text(s) of the case being graded
+  output: string // adapter output for this trial
+  hint?: string // PromptCase.hint
+  trajectory?: TrajectoryStep[] // adapter-reported trajectory, when present
+  metadata?: Record<string, unknown> // PromptCase.metadata
+  cwd?: string // same working directory that was given to the adapter
+}) => Promise<GraderResult>
+
+// ============================================================================
+// Trial Entry
+// ============================================================================
+
+/**
+ * Single trial within a trial run.
+ *
+ * @public
+ */
+export const TrialEntrySchema = z.object({
+  /** Trial number (1-indexed) */
+  trialNum: z.number(),
+  /** Agent output for this trial (runTrial stores '' when the trial threw or timed out) */
+  output: z.string(),
+  /** Full trajectory for this trial */
+  trajectory: z.array(TrajectoryStepSchema).optional(),
+  /** Runner-measured wall-clock duration in ms */
+  duration: z.number(),
+  /** Adapter-reported timing (token counts, adapter-measured duration) */
+  timing: TimingSchema.optional(),
+  /** Process exit code */
+  exitCode: z.number().nullable().optional(),
+  /** Whether the trial timed out (adapter-reported, or the runner's own timeout fired) */
+  timedOut: z.boolean().optional(),
+  /** Pass/fail from the grader; runTrial also sets `false` when the trial threw, even with no grader */
+  pass: z.boolean().optional(),
+  /** Numeric score (if grader provided) */
+  score: z.number().optional(),
+  /** Grader reasoning; for a trial that threw, runTrial stores `Error: <message>` here */
+  reasoning: z.string().optional(),
+  /** Outcome data from grader */
+  outcome: z.record(z.string(), z.unknown()).optional(),
+})
+
+/** Trial entry type */
+export type TrialEntry = z.infer<typeof TrialEntrySchema>
+
+// ============================================================================
+// Trial Result
+// ============================================================================
+
+/**
+ * Trial result schema — unified output for all trial runs.
+ *
+ * @remarks
+ * k=1 produces one trial entry, k>1 several. With a grader, runTrial adds passRate,
+ * pass@k and pass^k, counting trials whose `pass` is true (trials that threw count as failures).
+ *
+ * @public
+ */
+export const TrialResultSchema = z.object({
+  /** Test case identifier */
+  id: z.string(),
+  /** Original prompt input */
+  input: z.union([z.string(), z.array(z.string())]),
+  /** Grader context hint */
+  hint: z.string().optional(),
+  /** Number of trials (k) */
+  k: z.number(),
+  /** Simple pass rate: passes / k (with grader only) */
+  passRate: z.number().optional(),
+  /** pass@k: probability of at least one pass in k samples (with grader only) */
+  passAtK: z.number().optional(),
+  /** pass^k: probability of all k samples passing (with grader only) */
+  passExpK: z.number().optional(),
+  /** Individual trial results, one entry per trial in execution order */
+  trials: z.array(TrialEntrySchema),
+  /** Metadata; runTrial copies the prompt case's metadata here unchanged */
+  metadata: z.record(z.string(), z.unknown()).optional(),
+})
+
+/** Trial result type */
+export type TrialResult = z.infer<typeof TrialResultSchema>
+
+// ============================================================================
+// Trajectory Richness
+// ============================================================================
+
+/**
+ * Trajectory richness level.
+ * Spec expectations: [] → 'minimal'; message steps only → 'messages-only'; with a thought or tool_call → 'full'.
+ * @public
+ */
+export const TrajectoryRichnessSchema = z.enum(['full', 'minimal', 'messages-only'])
+
+/** Trajectory richness type */
+export type TrajectoryRichness = z.infer<typeof TrajectoryRichnessSchema>
diff --git a/src/trial.ts b/src/trial.ts
new file mode 100644
index 0000000..e4bb509
--- /dev/null
+++ b/src/trial.ts
@@ -0,0 +1,319 @@
+/**
+ * Trial runner — library function and CLI handler.
+ *
+ * @remarks
+ * Runs prompts against an adapter k times, optionally grades results,
+ * and computes pass@k/pass^k metrics. Library API is primary —
+ * `runTrial({ adapter, prompts, grader, k })` works in-process.
+ * CLI resolves paths to functions, then delegates to `runTrial`.
+ *
+ * @packageDocumentation
+ */
+
+import * as z from 'zod'
+import { parseCli } from './cli.utils.ts'
+import { DEFAULT_K, DEFAULT_TIMEOUT } from './trial.constants.ts'
+import type { Adapter, Grader, PromptCase, TrialEntry, TrialResult } from './trial.schemas.ts'
+import { TrialResultSchema } from './trial.schemas.ts'
+import {
+  createWorkspaceDir,
+  createWriteMutex,
+  loadAdapter,
+  loadGrader,
+  loadPrompts,
+  logProgress,
+  readStdinPrompts,
+  runWorkerPool,
+  writeOutput,
+} from './trial.utils.ts'
+
+// ============================================================================
+// Pass@k / Pass^k Calculation
+// ============================================================================
+
+/**
+ * pass@k — chance that at least one of k samples passes.
+ *
+ * @remarks
+ * With n = k observed samples this reduces to `1 - (1 - passes / k)^k`.
+ * `passes >= k` short-circuits to 1 and `passes === 0` to 0.
+ * @public
+ */
+export const calculatePassAtK = (passes: number, k: number): number => {
+  if (passes >= k) return 1
+  if (passes === 0) return 0
+  const missRate = 1 - passes / k
+  return 1 - missRate ** k
+}
+
+/**
+ * Calculate pass^k: probability of all k samples passing.
+ *
+ * @remarks
+ * `passRate^k` with `passRate = passes / k`. `passes >= k` yields 1 (never above 1); no passes yields 0.
+ *
+ * @public
+ */
+export const calculatePassExpK = (passes: number, k: number): number => {
+  if (passes >= k) return 1
+  if (passes === 0) return 0
+  const passRate = passes / k
+  return passRate ** k
+}
+
+// ============================================================================
+// Library API
+// ============================================================================
+
+/** Configuration for `runTrial` */
+export type TrialConfig = {
+  adapter: Adapter // called once per trial with the prompt and a working directory
+  prompts: PromptCase[] // cases to run; each one is executed `k` times
+  grader?: Grader // when set, every trial is graded and pass metrics are computed
+  k?: number // trials per prompt (defaults to DEFAULT_K)
+  outputPath?: string // JSONL file to write; results go to stdout when omitted
+  cwd?: string // adapter working directory when `workspaceDir` is not set
+  timeout?: number // per-trial limit in ms (defaults to DEFAULT_TIMEOUT); a case's own `timeout` wins
+  concurrency?: number // worker count handed to runWorkerPool (defaults to 1)
+  workspaceDir?: string // base dir; each trial gets its own directory via createWorkspaceDir
+  progress?: boolean // flag passed to every logProgress call (defaults to false)
+  append?: boolean // when true, `outputPath` is not emptied before the run
+  debug?: boolean // accepted, but not read by runTrial
+}
+
+/**
+ * Run trials against an adapter.
+ *
+ * @remarks
+ * Runs the adapter k times per prompt, one trial after another. Each call is raced
+ * against a timeout; the timer is cleared as soon as the race settles, but a timed-out
+ * adapter call is not cancelled. A rejected or timed-out trial is recorded with
+ * `pass: false`. When a grader is given, each run is graded and pass@k/pass^k computed.
+ * Results go to `outputPath` as JSONL, or to stdout without one. Prompts whose
+ * processing throws are reported on stderr and are absent from the returned array.
+ *
+ * @public
+ */
+export const runTrial = async (config: TrialConfig): Promise<TrialResult[]> => {
+  const {
+    adapter,
+    prompts,
+    grader,
+    k = DEFAULT_K,
+    outputPath,
+    cwd,
+    timeout = DEFAULT_TIMEOUT,
+    concurrency = 1,
+    workspaceDir,
+    progress = false,
+    append = false,
+  } = config
+
+  const writeMutex = outputPath ? createWriteMutex() : undefined
+
+  // Start from an empty output file unless appending
+  if (outputPath && !append) await Bun.write(outputPath, '')
+
+  logProgress(`Running ${prompts.length} prompt(s), k=${k} (${prompts.length * k} total executions)`, progress)
+  if (concurrency > 1) logProgress(`Concurrency: ${concurrency} workers`, progress)
+  if (workspaceDir) logProgress(`Workspace: ${workspaceDir}`, progress)
+
+  const processPrompt = async (promptCase: PromptCase, index: number): Promise<TrialResult> => {
+    logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trial(s)...`, progress)
+
+    const entries: TrialEntry[] = []
+    const effectiveTimeout = promptCase.timeout ?? timeout
+
+    for (let trialNum = 1; trialNum <= k; trialNum++) {
+      const promptCwd = workspaceDir
+        ? await createWorkspaceDir(workspaceDir, `${promptCase.id}-trial-${trialNum}`)
+        : cwd
+      // The timeout is recognised by this instance's identity, not by message text, so an
+      // adapter that throws 'Trial timed out' itself is not flagged as a runner timeout.
+      const timeoutError = new Error('Trial timed out')
+      let timer: ReturnType<typeof setTimeout> | undefined
+      const start = Date.now()
+      try {
+        const adapterResult = await Promise.race([
+          adapter({ prompt: promptCase.input, cwd: promptCwd }),
+          new Promise<never>((_, reject) => {
+            timer = setTimeout(() => reject(timeoutError), effectiveTimeout)
+          }),
+        ]).finally(() => clearTimeout(timer)) // drop the timer so it cannot outlive the trial
+        const duration = Date.now() - start
+
+        const entry: TrialEntry = {
+          trialNum,
+          output: adapterResult.output,
+          duration,
+          ...(adapterResult.trajectory && { trajectory: adapterResult.trajectory }),
+          ...(adapterResult.timing && { timing: adapterResult.timing }),
+          ...(adapterResult.exitCode !== undefined && { exitCode: adapterResult.exitCode }),
+          ...(adapterResult.timedOut && { timedOut: true }),
+        }
+
+        // Grade if grader provided
+        if (grader) {
+          const graderResult = await grader({
+            input: promptCase.input,
+            output: adapterResult.output,
+            hint: promptCase.hint,
+            trajectory: adapterResult.trajectory,
+            metadata: promptCase.metadata,
+            cwd: promptCwd,
+          })
+          entry.pass = graderResult.pass
+          entry.score = graderResult.score
+          entry.reasoning = graderResult.reasoning
+          if (graderResult.outcome) entry.outcome = graderResult.outcome
+        }
+
+        entries.push(entry)
+        const verdict = entry.pass === undefined ? '?' : entry.pass ? 'PASS' : 'FAIL'
+        logProgress(`  Trial ${trialNum}/${k}: ${verdict}`, progress)
+      } catch (error) {
+        const duration = Date.now() - start
+        const isTimeout = error === timeoutError
+
+        entries.push({
+          trialNum,
+          output: '',
+          duration,
+          ...(isTimeout && { timedOut: true }),
+          pass: false,
+          reasoning: `Error: ${error instanceof Error ? error.message : String(error)}`,
+        })
+        logProgress(`  Trial ${trialNum}/${k}: ERROR`, progress)
+      }
+    }
+
+    // Build result
+    const result: TrialResult = {
+      id: promptCase.id,
+      input: promptCase.input,
+      ...(promptCase.hint && { hint: promptCase.hint }),
+      k,
+      trials: entries,
+      ...(promptCase.metadata && { metadata: promptCase.metadata }),
+    }
+
+    // Calculate metrics if grader was used
+    if (grader) {
+      const passes = entries.filter((t) => t.pass).length
+      result.passRate = passes / k
+      result.passAtK = calculatePassAtK(passes, k)
+      result.passExpK = calculatePassExpK(passes, k)
+    }
+
+    // Write result immediately (mutex for concurrent writes)
+    if (outputPath && writeMutex) {
+      await writeMutex.write(async () => {
+        await writeOutput(JSON.stringify(result), outputPath, true)
+      })
+    }
+
+    if (grader && progress) {
+      logProgress(
+        `  => ${promptCase.id}: passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
+        true,
+      )
+    }
+
+    return result
+  }
+
+  const { results, errors } = await runWorkerPool(prompts, processPrompt, { concurrency })
+  // Surface prompts whose processing threw (e.g. workspace creation or an output write failed)
+  for (const { index, error } of errors) {
+    console.error(`Error: prompt ${prompts[index]?.id ?? index} failed: ${error.message}`)
+  }
+
+  // If no outputPath, write all results to stdout
+  if (!outputPath) {
+    for (const result of results) {
+      await writeOutput(JSON.stringify(result))
+    }
+  }
+
+  logProgress(`Done. ${results.length} result(s).`, progress)
+
+  return results
+}
+
+// ============================================================================
+// CLI Schema + Handler
+// ============================================================================
+
+/**
+ * CLI input schema for the trial command.
+ * Counts (`k`, `concurrency`) must be positive integers; `timeout` must be positive.
+ * @public
+ */
+export const TrialInputSchema = z.object({
+  adapterPath: z.string().describe('Path to adapter script (.ts/.js module or executable)'),
+  promptsPath: z.string().optional().describe('Path to prompts.jsonl'),
+  outputPath: z.string().optional().describe('Output file (default: stdout)'),
+  k: z.number().int().positive().optional().default(DEFAULT_K).describe('Trials per prompt'),
+  graderPath: z.string().optional().describe('Path to grader script'),
+  cwd: z.string().optional().describe('Working directory for adapter'),
+  timeout: z.number().positive().optional().describe('Timeout per prompt in ms'),
+  concurrency: z.number().int().positive().optional().default(1).describe('Concurrent workers'),
+  workspaceDir: z.string().optional().describe('Per-prompt workspace isolation base dir'),
+  progress: z.boolean().optional().default(false).describe('Show progress to stderr'),
+  append: z.boolean().optional().default(false).describe('Append to output file'),
+  debug: z.boolean().optional().default(false).describe('Enable debug mode'),
+})
+
+/** CLI output schema (array of TrialResult); runTrial serializes each result separately via writeOutput */
+export const TrialOutputSchema = z.array(TrialResultSchema)
+
+/**
+ * Entry point for the `trials` CLI command.
+ *
+ * @remarks
+ * Steps, in order:
+ * 1. parse `args` against `TrialInputSchema` via `parseCli`
+ * 2. load the adapter from `adapterPath`, and the grader when `graderPath` is set
+ * 3. read prompts from `promptsPath`, or from stdin when no path was given
+ * 4. delegate to `runTrial` with the remaining options
+ *
+ * Prints an error and exits with code 2 when stdin yields no prompts.
+ *
+ * @param args - raw command-line arguments, forwarded to `parseCli`
+ * @public
+ */
+export const trialCli = async (args: string[]): Promise<void> => {
+  const input = await parseCli(args, TrialInputSchema, {
+    name: 'trials',
+    outputSchema: TrialOutputSchema,
+  })
+  const { adapterPath, graderPath, promptsPath, ...runOptions } = input
+
+  // Adapter path → callable (always required)
+  const adapter = await loadAdapter(adapterPath)
+
+  // Grader path → callable, only when a path was supplied
+  const grader = graderPath ? await loadGrader(graderPath) : undefined
+
+  // Prompts: stdin is consulted only when no prompts file was named
+  let prompts: PromptCase[]
+  if (!promptsPath) {
+    const piped = await readStdinPrompts()
+    if (!piped || piped.length === 0) {
+      console.error('Error: promptsPath required or pipe prompts via stdin')
+      process.exit(2)
+    }
+    prompts = piped
+  } else {
+    prompts = await loadPrompts(promptsPath)
+  }
+
+  // runOptions carries the non-path fields: k, outputPath, cwd, timeout, concurrency,
+  // workspaceDir, progress, append, debug
+  await runTrial({
+    adapter,
+    prompts,
+    grader,
+    ...runOptions,
+  })
+}
diff --git a/src/trial.utils.ts b/src/trial.utils.ts
new file mode 100644
index 0000000..630a340
--- /dev/null
+++ b/src/trial.utils.ts
@@ -0,0 +1,444 @@
+/**
+ * Shared utilities for the trial runner.
+ *
+ * @remarks
+ * Consolidates loading, output, worker pool, workspace, and trajectory
+ * utilities.
+ *
+ * @packageDocumentation
+ */
+
+import { appendFile, mkdir } from 'node:fs/promises'
+import type { Adapter, Grader, PromptCase, TrajectoryRichness } from './trial.schemas.ts'
+import { AdapterResultSchema, GraderResultSchema, PromptCaseSchema, type TrajectoryStep } from './trial.schemas.ts'
+
+// ============================================================================
+// Path Resolution
+// ============================================================================
+
+/**
+ * Turn a possibly-relative path into one anchored at `process.cwd()`.
+ *
+ * @remarks
+ * A path beginning with `/` is treated as absolute and returned untouched.
+ *
+ * @public
+ */
+export const resolvePath = (path: string): string => {
+  const isAbsolute = path[0] === '/'
+  return isAbsolute ? path : [process.cwd(), path].join('/')
+}
+
+// ============================================================================
+// Polyglot Loader (adapters + graders)
+// ============================================================================
+
+/** File extensions imported as ES modules; any other file is run as an executable */
+const JS_EXTENSIONS = ['.ts', '.js', '.mjs', '.cjs']
+
+/** Check if a file path ends with one of the JavaScript/TypeScript module extensions */
+const isJsModule = (path: string): boolean => JS_EXTENSIONS.some((ext) => path.endsWith(ext))
+
+/**
+ * Build a function that runs `execPath` as a subprocess on every call.
+ *
+ * @remarks
+ * Input goes to the child's stdin as JSON; stdout is parsed as JSON and checked
+ * with `outputSchema`. Non-zero exit, empty, non-JSON or invalid output throws.
+ *
+ * @internal
+ */
+const createExecWrapper = <TInput, TOutput>(
+  execPath: string,
+  outputSchema: { safeParse: (data: unknown) => { success: boolean; data?: TOutput; error?: { message: string } } },
+): ((input: TInput) => Promise<TOutput>) => {
+  /** Parse stdout text as JSON, quoting the first 100 characters on failure */
+  const parseJson = (text: string): unknown => {
+    try {
+      return JSON.parse(text)
+    } catch {
+      throw new Error(`Output is not valid JSON: ${text.slice(0, 100)}`)
+    }
+  }
+
+  return async (input: TInput): Promise<TOutput> => {
+    const child = Bun.spawn([execPath], {
+      stdin: new TextEncoder().encode(JSON.stringify(input)),
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+
+    const [out, err, code] = await Promise.all([
+      new Response(child.stdout).text(),
+      new Response(child.stderr).text(),
+      child.exited,
+    ])
+
+    if (code !== 0) {
+      throw new Error(`Process exited with code ${code}: ${err.trim() || 'No error output'}`)
+    }
+
+    const payload = out.trim()
+    if (!payload) {
+      throw new Error('Process produced no output')
+    }
+
+    const validation = outputSchema.safeParse(parseJson(payload))
+    if (!validation.success) {
+      throw new Error(`Invalid output: ${validation.error?.message}`)
+    }
+
+    return validation.data as TOutput
+  }
+}
+
+/**
+ * Load a function from either a TS/JS module or an executable script.
+ *
+ * @remarks
+ * How the file is handled depends on its extension:
+ * - `.ts`, `.js`, `.mjs`, `.cjs` → imported as an ES module; the named export is returned
+ * - anything else → wrapped as a subprocess speaking JSON over stdin/stdout
+ *
+ * @param path - Module or executable location (relative or absolute)
+ * @param exportName - Function export to pull from a TS/JS module
+ * @param outputSchema - Schema used to validate what an executable prints
+ * @returns The loaded function
+ * @throws If the file does not exist or the module lacks the named function export
+ *
+ * @public
+ */
+export const loadPolyglot = async <TFn>(
+  path: string,
+  exportName: string,
+  outputSchema: { safeParse: (data: unknown) => { success: boolean; data?: unknown; error?: { message: string } } },
+): Promise<TFn> => {
+  const target = resolvePath(path)
+
+  const exists = await Bun.file(target).exists()
+  if (!exists) {
+    throw new Error(`File not found: ${target}`)
+  }
+
+  if (!isJsModule(target)) {
+    return createExecWrapper(target, outputSchema) as TFn
+  }
+
+  const exported = (await import(target))[exportName]
+  if (typeof exported !== 'function') {
+    throw new Error(`Module must export a '${exportName}' function`)
+  }
+  return exported as TFn
+}
+
+/**
+ * Load an adapter from a file path (`adapt` export for TS/JS modules; `AdapterResultSchema` for executables).
+ *
+ * @public
+ */
+export const loadAdapter = (path: string): Promise<Adapter> => loadPolyglot<Adapter>(path, 'adapt', AdapterResultSchema)
+
+/**
+ * Load a grader from a file path (`grade` export for TS/JS modules; `GraderResultSchema` for executables).
+ *
+ * @public
+ */
+export const loadGrader = (path: string): Promise<Grader> => loadPolyglot<Grader>(path, 'grade', GraderResultSchema)
+
+// ============================================================================
+// JSONL Loading
+// ============================================================================
+
+/**
+ * Load raw JSONL file as parsed JSON objects.
+ *
+ * Blank or whitespace-only lines are skipped; errors cite the 1-based file line.
+ *
+ * @public
+ */
+export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
+  const content = await Bun.file(path).text()
+  return content.split('\n').flatMap((raw, index) => {
+    const line = raw.trim() // tolerates CRLF endings and stray whitespace
+    if (!line) return []
+    try {
+      return [JSON.parse(line) as T]
+    } catch (error) {
+      throw new Error(`Invalid JSON at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
+    }
+  })
+}
+
+/**
+ * Load prompts from a JSONL file with schema validation.
+ *
+ * Blank or whitespace-only lines are skipped; errors cite the 1-based file line.
+ *
+ * @public
+ */
+export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
+  const content = await Bun.file(path).text()
+  return content.split('\n').flatMap((raw, index) => {
+    const line = raw.trim() // tolerates CRLF endings and stray whitespace
+    if (!line) return []
+    try {
+      return [PromptCaseSchema.parse(JSON.parse(line))]
+    } catch (error) {
+      throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
+    }
+  })
+}
+
+/**
+ * Read prompts from stdin as JSONL.
+ *
+ * Blank or whitespace-only lines are skipped; errors cite the 1-based input line.
+ * @returns Parsed prompt cases, or null if stdin is a TTY or holds only whitespace
+ *
+ * @public
+ */
+export const readStdinPrompts = async (): Promise<PromptCase[] | null> => {
+  if (process.stdin.isTTY) {
+    return null
+  }
+
+  const content = await Bun.stdin.text()
+  if (!content.trim()) return null
+
+  return content.split('\n').flatMap((raw, index) => {
+    const line = raw.trim() // tolerates CRLF endings and stray whitespace
+    if (!line) return []
+    try {
+      return [PromptCaseSchema.parse(JSON.parse(line))]
+    } catch (error) {
+      throw new Error(`Invalid stdin prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
+    }
+  })
+}
+
+// ============================================================================
+// Output Utilities
+// ============================================================================
+
+/**
+ * Emit one output line to a file or, when no `outputPath` is given, to stdout.
+ *
+ * @remarks
+ * File output gets a trailing newline and goes to `appendFile` when `append` is truthy, else to `Bun.write`.
+ *
+ * @public
+ */
+export const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
+  if (!outputPath) {
+    console.log(line)
+    return
+  }
+  const payload = `${line}\n`
+  await (append ? appendFile(outputPath, payload) : Bun.write(outputPath, payload))
+}
+
+/**
+ * Print a progress message on stderr, but only when progress output is enabled.
+ *
+ * @public
+ */
+export const logProgress = (message: string, showProgress: boolean): void => {
+  if (!showProgress) return
+
+  console.error(message)
+}
+
+/**
+ * Build a short preview of a prompt input, which may be one string or a list of turns.
+ *
+ * @public
+ */
+export const getInputPreview = (input: string | string[]): string => {
+  if (!Array.isArray(input)) {
+    return input.slice(0, 50)
+  }
+  const opening = (input[0] ?? '').slice(0, 40)
+  return `[${input.length} turns] ${opening}...`
+}
+
+// ============================================================================
+// Worker Pool
+// ============================================================================
+
+/** Progress callback for worker pool, invoked per finished item with its `result` or its `error` */
+export type ProgressCallback<T> = (completed: number, total: number, result?: T, error?: Error) => void
+
+/** Worker pool options: concurrency limit plus an optional progress callback */
+export type WorkerPoolOptions<T> = {
+  concurrency: number
+  onProgress?: ProgressCallback<T>
+}
+
+/** Worker pool result: successful results plus failures tagged with the failing item's index */
+export type WorkerPoolResult<T> = {
+  results: T[]
+  errors: Array<{ index: number; error: Error }>
+}
+
+/**
+ * Execute tasks in parallel with concurrency limit.
+ *
+ * @remarks
+ * Up to `concurrency` workers pull items from a shared cursor until none are
+ * left. Results are collected in completion order; failures are recorded in
+ * `errors` with the item's index and do not stop the remaining work.
+ * - `concurrency` is floored and clamped to at least 1 (`0`, negatives, `NaN` → 1)
+ * - `undefined` entries in `items` are skipped and never passed to `worker`
+ * - Non-`Error` rejections are wrapped as `new Error(String(reason))`
+ * - If `onProgress` throws, no further items are started and the returned promise rejects
+ *
+ * @param items - Work items, each handed to `worker` together with its index
+ * @param worker - Async function run once per item
+ * @param options - Concurrency limit and optional progress callback
+ * @returns Successful results plus per-item errors
+ *
+ * @public
+ */
+export const runWorkerPool = async <TItem, TResult>(
+  items: TItem[],
+  worker: (item: TItem, index: number) => Promise<TResult>,
+  options: WorkerPoolOptions<TResult>,
+): Promise<WorkerPoolResult<TResult>> => {
+  const { concurrency, onProgress } = options
+  const results: TResult[] = []
+  const errors: Array<{ index: number; error: Error }> = []
+
+  // Clamp to a positive integer: 0, negatives and NaN would otherwise start no workers at all
+  const limit = Math.max(1, Math.floor(concurrency) || 1)
+
+  let nextIndex = 0
+  let completed = 0
+
+  /** Claim the next defined item from the shared cursor, or `undefined` when drained */
+  const getNextItem = (): { item: TItem; index: number } | undefined => {
+    while (nextIndex < items.length) {
+      const index = nextIndex++
+      const item = items[index]
+      if (item !== undefined) {
+        return { item, index }
+      }
+    }
+    return undefined
+  }
+
+  /** Record one settled item and notify the progress callback */
+  const settle = (index: number, outcome: { value: TResult } | { error: Error }): void => {
+    completed++
+    try {
+      if ('error' in outcome) {
+        errors.push({ index, error: outcome.error })
+        onProgress?.(completed, items.length, undefined, outcome.error)
+      } else {
+        results.push(outcome.value)
+        onProgress?.(completed, items.length, outcome.value)
+      }
+    } catch (callbackError) {
+      // A broken progress callback aborts the pool: stop handing out work and surface the error
+      nextIndex = items.length
+      throw callbackError
+    }
+  }
+
+  /** Pull and process items one at a time until none are left */
+  const runSingleWorker = async (): Promise<void> => {
+    for (let work = getNextItem(); work; work = getNextItem()) {
+      const { item, index } = work
+      let outcome: { value: TResult } | { error: Error }
+      try {
+        outcome = { value: await worker(item, index) }
+      } catch (err) {
+        outcome = { error: err instanceof Error ? err : new Error(String(err)) }
+      }
+      // Bookkeeping happens outside the try/catch so callback failures are never mistaken for worker failures
+      settle(index, outcome)
+    }
+  }
+
+  const workers = Array.from({ length: Math.min(limit, items.length) }, () => runSingleWorker())
+  await Promise.all(workers)
+
+  return { results, errors }
+}
+
+// ============================================================================
+// Write Mutex
+// ============================================================================
+
+/** Mutex for coordinated file writes: `write` queues `fn` behind the writes queued before it */
+export type WriteMutex = {
+  write: (fn: () => Promise<void>) => Promise<void>
+}
+
+/**
+ * Create a write mutex that serializes asynchronous writes for JSONL output.
+ *
+ * @public
+ */
+export const createWriteMutex = (): WriteMutex => {
+  let tail: Promise<void> = Promise.resolve()
+  // Queue `fn` behind the current tail; it runs whether the previous write succeeded or failed
+  const write = (fn: () => Promise<void>): Promise<void> => {
+    tail = tail.then(fn, fn)
+    return tail
+  }
+  return { write }
+}
+
+// ============================================================================
+// Workspace Directory
+// ============================================================================
+
+/**
+ * Create and return `<baseDir>/prompt-<id>`, where `<>:"/\|?*` in the prompt id become `_`.
+ *
+ * @public
+ */
+export const createWorkspaceDir = async (baseDir: string, promptId: string): Promise<string> => {
+  const safeId = promptId.replace(/[<>:"/\\|?*]/g, '_')
+  const dir = [baseDir, `prompt-${safeId}`].join('/')
+  await mkdir(dir, { recursive: true })
+  return dir
+}
+
+// ============================================================================
+// Trajectory Analysis
+// ============================================================================
+
+/**
+ * Check if any tool calls failed in trajectory, i.e. a `tool_call` step has status `failed`.
+ *
+ * @public
+ */
+export const hasToolErrors = (trajectory: TrajectoryStep[]): boolean =>
+  trajectory.some((step) => step.type === 'tool_call' && step.status === 'failed')
+
+/**
+ * Classify how much detail a trajectory carries.
+ *
+ * @remarks
+ * One pass, stopping at the first `thought` or `tool_call` step (`full`); otherwise
+ * `messages-only` if any `message` step was seen, else `minimal`.
+ *
+ * @public
+ */
+export const detectRichness = (trajectory: TrajectoryStep[]): TrajectoryRichness => {
+  let fallback: TrajectoryRichness = 'minimal'
+
+  for (const { type } of trajectory) {
+    switch (type) {
+      case 'thought':
+      case 'tool_call':
+        return 'full'
+      case 'message':
+        fallback = 'messages-only'
+        break
+    }
+  }
+
+  return fallback
+}
diff --git a/tsconfig.json b/tsconfig.json
index f38543f..edbd949 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -1,5 +1,5 @@
 {
-  "include": ["src", "bin"],
+  "include": ["src"],
   "compilerOptions": {
     // Enable latest features
     "lib": ["ESNext"],

From db9ec8669097d7371ad18be6a736354ff8e2425d Mon Sep 17 00:00:00 2001
From: Edward Irby <e.irby@pm.me>
Date: Tue, 10 Mar 2026 21:52:26 -0700
Subject: [PATCH 2/7] chore: remove obsolete agent-eval-harness and
 headless-adapters skills

These skills taught the old pipeline's concepts (Docker evals,
calibration flow, headless browser adapters) which no longer exist
after the trial-runner replacement.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .agents/skills/agent-eval-harness/SKILL.md    | 937 ------------------
 .../agent-eval-harness/assets/Dockerfile.eval |  25 -
 .../assets/docker-compose.eval.yml            |  19 -
 .../references/calibration.md                 | 146 ---
 .../references/comparison-graders.md          | 473 ---------
 .../references/docker-evals.md                | 184 ----
 .../references/eval-concepts.md               | 229 -----
 .../references/inline-graders.md              | 711 -------------
 .../references/output-formats.md              | 261 -----
 .agents/skills/headless-adapters/SKILL.md     | 144 ---
 .../references/schema-creation-guide.md       | 310 ------
 .../references/troubleshooting-guide.md       | 497 ----------
 12 files changed, 3936 deletions(-)
 delete mode 100644 .agents/skills/agent-eval-harness/SKILL.md
 delete mode 100644 .agents/skills/agent-eval-harness/assets/Dockerfile.eval
 delete mode 100644 .agents/skills/agent-eval-harness/assets/docker-compose.eval.yml
 delete mode 100644 .agents/skills/agent-eval-harness/references/calibration.md
 delete mode 100644 .agents/skills/agent-eval-harness/references/comparison-graders.md
 delete mode 100644 .agents/skills/agent-eval-harness/references/docker-evals.md
 delete mode 100644 .agents/skills/agent-eval-harness/references/eval-concepts.md
 delete mode 100644 .agents/skills/agent-eval-harness/references/inline-graders.md
 delete mode 100644 .agents/skills/agent-eval-harness/references/output-formats.md
 delete mode 100644 .agents/skills/headless-adapters/SKILL.md
 delete mode 100644 .agents/skills/headless-adapters/references/schema-creation-guide.md
 delete mode 100644 .agents/skills/headless-adapters/references/troubleshooting-guide.md

diff --git a/.agents/skills/agent-eval-harness/SKILL.md b/.agents/skills/agent-eval-harness/SKILL.md
deleted file mode 100644
index 1c93876..0000000
--- a/.agents/skills/agent-eval-harness/SKILL.md
+++ /dev/null
@@ -1,937 +0,0 @@
----
-name: agent-eval-harness
-description: CLI tool for capturing agent trajectories. Execute prompts against headless CLI agents via schema-driven adapters, capture full trajectories (tools, thoughts, plans), and output structured JSONL for downstream scoring.
-compatibility: Bun >= 1.2.9
----
-
-# Agent Eval Harness
-
-## Purpose
-
-CLI tool for capturing trajectories from headless CLI agents, optimized for TypeScript/JavaScript projects using Bun.
-
-**The harness captures. You score.**
-
-| Harness Provides | You Provide |
-|------------------|-------------|
-| Prompt execution via headless adapters | Scoring logic (Braintrust, custom scripts) |
-| Full trajectory capture (thoughts, tools, plans) | Pass/fail determination via graders |
-| Structured JSONL output | LLM-as-judge prompts |
-| Reproducible execution environment | CI integration, golden file comparison |
-
-**Use this when:**
-- Capturing trajectories for downstream evaluation
-- Generating training data (SFT/DPO) with full context
-- Building regression test fixtures for agent behavior
-- Comparing agent responses across configurations
-
-## Installation
-
-```bash
-# Run without installing (recommended)
-bunx @plaited/agent-eval-harness capture prompts.jsonl --schema ./claude.json -o results.jsonl
-
-# Or install as project dependency
-bun add @plaited/agent-eval-harness
-```
-
-## Core Principle: Capture Once, Derive Many Views
-
-```mermaid
-flowchart LR
-    Prompts["prompts.jsonl"] --> Capture["capture/trials"]
-    Schema["headless schema"] --> Capture
-    Capture --> Results["results.jsonl (full trajectory)"]
-    Results --> Summarize["summarize"]
-    Results --> Calibrate["calibrate"]
-    Results --> Custom["(your tools)"]
-    Summarize --> Views["summary.jsonl / .md"]
-    Calibrate --> Report["calibration.md"]
-    Custom --> Pipeline["any scoring platform"]
-```
-
-**Single output format:** Full trajectory JSONL (always)
-**No `--format` flag:** Derive views with separate commands
-**Schema exports:** Zod schemas + JSON Schema for any tooling
-
-## Commands
-
-### Core Commands
-
-| Command | Input | Output | Purpose |
-|---------|-------|--------|---------|
-| `capture` | prompts.jsonl + schema | results.jsonl | Trajectory capture (full) |
-| `trials` | prompts.jsonl + schema | trials.jsonl | Multi-run + optional metrics |
-| `summarize` | results.jsonl | summary.jsonl or .md | Derive compact views |
-| `calibrate` | results.jsonl | calibration.md | Sample failures for review |
-| `validate-refs` | prompts.jsonl | validation.jsonl | Check reference solutions |
-| `balance` | prompts.jsonl | balance.json | Analyze test set coverage |
-| `schemas` | (none) | JSON Schema | Export schemas for non-TS users |
-
-### Pipeline Commands (Unix-style)
-
-| Command | Input | Output | Purpose |
-|---------|-------|--------|---------|
-| `run` | prompts.jsonl + schema | raw.jsonl | Execute prompts, raw output |
-| `extract` | raw.jsonl + schema | extracted.jsonl | Parse trajectories |
-| `grade` | extracted.jsonl + grader | graded.jsonl | Apply grader scoring |
-| `format` | results.jsonl | jsonl/markdown/csv | Convert output format |
-| `compare` | multiple results.jsonl | comparison.json | Compare runs (aggregate report) |
-
-All commands support optional `--grader ./grader.ts` for scoring.
-
-## Capture Command
-
-### Basic Usage
-
-```bash
-bunx @plaited/agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]
-```
-
-### Arguments
-
-| Argument/Flag | Description | Default |
-|------|-------------|---------|
-| `prompts.jsonl` | Input file with prompts to execute | Required |
-| `-s, --schema` | Path to headless adapter schema | Required |
-| `-o, --output` | Output file/path | stdout |
-| `-c, --cwd` | Working directory for agent | current |
-| `-t, --timeout` | Request timeout in ms | `60000` |
-| `-j, --concurrency` | Number of concurrent workers | `1` |
-| `--workspace-dir` | Base directory for per-prompt workspace isolation | none |
-| `--progress` | Show progress to stderr | false |
-| `--append` | Append to output file | false |
-| `-g, --grader` | Path to grader module | none |
-| `--debug` | Show detailed CLI output for debugging | false |
-
-### Examples
-
-```bash
-# Basic capture
-bunx @plaited/agent-eval-harness capture prompts.jsonl --schema ./claude.json -o results.jsonl
-
-# Parallel execution (4x faster with 4 workers)
-bunx @plaited/agent-eval-harness capture prompts.jsonl --schema ./claude.json -j 4 -o results.jsonl
-
-# With workspace isolation for code generation tasks
-bunx @plaited/agent-eval-harness capture prompts.jsonl --schema ./claude.json \
-  -j 4 --workspace-dir ./workspaces -o results.jsonl
-
-# Using a local adapter script
-bunx @plaited/agent-eval-harness capture prompts.jsonl bun ./my-adapter.ts -o results.jsonl
-
-# With grader (adds score to each result)
-bunx @plaited/agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grader.ts -o results.jsonl
-```
-
-## Trials Command
-
-Run each prompt multiple times for pass@k/pass^k analysis.
-
-```bash
-# Capture only (no grader)
-bunx @plaited/agent-eval-harness trials prompts.jsonl --schema ./claude.json -k 5 -o trials.jsonl
-
-# With grader (computes pass@k, pass^k)
-bunx @plaited/agent-eval-harness trials prompts.jsonl --schema ./claude.json -k 5 --grader ./grader.ts -o trials.jsonl
-
-# Parallel execution (4 prompts' trials run concurrently)
-bunx @plaited/agent-eval-harness trials prompts.jsonl --schema ./claude.json -k 5 -j 4 -o trials.jsonl
-
-# With workspace isolation (each trial gets its own directory)
-bunx @plaited/agent-eval-harness trials prompts.jsonl --schema ./claude.json -k 5 -j 4 \
-  --workspace-dir ./workspaces -o trials.jsonl
-```
-
-**Parallelization notes:**
-- `-j/--concurrency` parallelizes across prompts (not trials within a prompt)
-- Each prompt's k trials still run sequentially (required for aggregation)
-- With 151 prompts and `-j 4`, you get 4 prompts running trials concurrently
-- `--workspace-dir` creates `{workspace-dir}/prompt-{id}-trial-{n}/` for each trial
-- Progress logging shows aggregate completion (e.g., `12/50 prompts completed`)
-
-**Workspace cleanup:**
-Directories persist after completion for debugging. Clean up manually:
-```bash
-# After capture
-rm -rf ./workspaces
-
-# In CI (add as post-step)
-- run: rm -rf ./workspaces
-  if: always()
-```
-
-### Output
-
-Without grader:
-```jsonl
-{"id":"search-001","input":"Find the CEO","k":5,"trials":[{"trialNum":1,"output":"...","trajectory":[...],"duration":1234},...]}
-```
-
-With grader:
-```jsonl
-{"id":"search-001","input":"Find the CEO","k":5,"passRate":0.8,"passAtK":0.99,"passExpK":0.33,"trials":[{"trialNum":1,"output":"...","pass":true,"score":1.0},...]}
-```
-
-## Summarize Command
-
-Derive compact views from full trajectory results.
-
-```bash
-# Summary JSONL (for jq analysis)
-bunx @plaited/agent-eval-harness summarize results.jsonl -o summary.jsonl
-
-# Markdown (for LLM-as-judge)
-bunx @plaited/agent-eval-harness summarize results.jsonl --markdown -o results.md
-```
-
-## Calibrate Command
-
-Sample failures for grader review. Calibration helps you distinguish between **agent failures** (agent did wrong thing) and **grader bugs** (agent was correct, grader too strict).
-
-```bash
-# Sample failures for human review
-bunx @plaited/agent-eval-harness calibrate results.jsonl --sample 10 -o calibration.md
-
-# Re-score with different grader to compare
-bunx @plaited/agent-eval-harness calibrate results.jsonl --grader ./loose-grader.ts --sample 10 -o comparison.md
-```
-
-See [eval-concepts.md](references/eval-concepts.md#grader-calibration) for why calibration matters.
-
-## Validate-Refs Command
-
-Check that reference solutions pass your grader before evaluating agents.
-
-```bash
-# Validate reference solutions
-bunx @plaited/agent-eval-harness validate-refs prompts.jsonl --grader ./grader.ts -o validation.jsonl
-
-# Check for failures
-cat validation.jsonl | jq 'select(.pass == false)'
-```
-
-### Why Use This?
-
-If your reference solution fails your own grader:
-- The task definition is ambiguous
-- The grader is too strict
-- The hint is wrong
-
-**Fix the eval before evaluating the agent.**
-
-### Input Format
-
-Prompts must include a `reference` field:
-
-```jsonl
-{"id":"test-001","input":"Create a button component","hint":"<button>","reference":"export const Button = () => <button>Click</button>"}
-```
-
-### Output Format
-
-```jsonl
-{"id":"test-001","input":"Create a button component","reference":"export const Button = () => <button>Click</button>","pass":true,"score":1.0,"reasoning":"Contains hint content"}
-```
-
-## Balance Command
-
-Analyze test set coverage to ensure balanced evaluation.
-
-```bash
-# Analyze prompt distribution
-bunx @plaited/agent-eval-harness balance prompts.jsonl -o balance.json
-
-# Pretty print
-bunx @plaited/agent-eval-harness balance prompts.jsonl | jq .
-```
-
-### Why Use This?
-
-An eval with only "make X work" misses "don't break Y". Balance analysis shows:
-
-- **Category distribution** (from `metadata.category`)
-- **Positive/negative case ratio**
-- **Coverage gaps**
-
-### Output Format
-
-```json
-{
-  "totalCases": 50,
-  "categories": [
-    { "name": "ui", "count": 20, "percentage": 40 },
-    { "name": "logic", "count": 15, "percentage": 30 },
-    { "name": "api", "count": 10, "percentage": 20 },
-    { "name": "edge-case", "count": 5, "percentage": 10 }
-  ],
-  "underrepresented": ["edge-case"],
-  "suggestions": ["Consider adding more test cases for: edge-case"]
-}
-```
-
-### Balanced Eval Design
-
-Include both positive and negative cases:
-
-| Type | Example | Purpose |
-|------|---------|---------|
-| Positive | "Add a login button" | Agent should succeed |
-| Negative | "Add a button without breaking tests" | Agent should not break things |
-| Edge case | "Handle empty input gracefully" | Agent should be robust |
-
-See [eval-concepts.md](references/eval-concepts.md#test-set-balance) for more on balanced test sets.
-
-## Pipeline Workflow
-
-The pipeline commands enable Unix-style composition for flexible evaluation workflows.
-
-### Full Pipeline Example
-
-```bash
-# Execute → Extract → Grade → Format in one pipeline
-cat prompts.jsonl | \
-  bunx @plaited/agent-eval-harness run -s claude.json | \
-  bunx @plaited/agent-eval-harness extract -s claude.json | \
-  bunx @plaited/agent-eval-harness grade -g ./grader.ts | \
-  bunx @plaited/agent-eval-harness format -f markdown > report.md
-```
-
-### Run Command
-
-Execute prompts and output raw results. Three modes available:
-
-```bash
-# Schema mode (recommended)
-bunx @plaited/agent-eval-harness run prompts.jsonl --schema claude.json
-
-# Simple mode: {} placeholder substitution
-bunx @plaited/agent-eval-harness run prompts.jsonl --simple "claude -p {} --output-format stream-json"
-
-# Shell mode: $PROMPT environment variable
-bunx @plaited/agent-eval-harness run prompts.jsonl --shell 'claude -p "$PROMPT" --output-format stream-json'
-```
-
-> **⚠️ Security Warning:** The `--simple` and `--shell` modes execute prompts via shell commands. Prompts are escaped but **do not use untrusted prompt content** with these modes. Malicious prompt text could potentially escape the quoting and execute arbitrary commands. Use `--schema` mode (headless adapter) for untrusted inputs.
-
-### Extract Command
-
-Parse raw output into structured trajectories:
-
-```bash
-# From file
-bunx @plaited/agent-eval-harness extract raw.jsonl --schema claude.json -o extracted.jsonl
-
-# Piped from run
-bunx @plaited/agent-eval-harness run prompts.jsonl -s claude.json | \
-  bunx @plaited/agent-eval-harness extract -s claude.json
-```
-
-### Grade Command
-
-Apply grader to extracted results:
-
-```bash
-bunx @plaited/agent-eval-harness grade extracted.jsonl --grader ./grader.ts -o graded.jsonl
-```
-
-### Format Command
-
-Convert results to different output formats:
-
-```bash
-# Markdown report
-bunx @plaited/agent-eval-harness format results.jsonl --style markdown -o report.md
-
-# CSV for spreadsheets
-bunx @plaited/agent-eval-harness format results.jsonl --style csv -o results.csv
-
-# JSONL (pass-through, default)
-bunx @plaited/agent-eval-harness format results.jsonl --style jsonl
-```
-
-### Compare Command
-
-Compare multiple runs of the same prompts. Supports both **CaptureResult** (single-run) and **TrialResult** (multi-run reliability) formats with auto-detection.
-
-```bash
-# Default: auto-detect format, weighted strategy, JSON output
-bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
-
-# Statistical significance strategy
-bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl --strategy statistical -o comparison.json
-
-# Custom weights via environment variables (CaptureResult)
-COMPARE_QUALITY=0.7 COMPARE_LATENCY=0.2 COMPARE_RELIABILITY=0.1 \
-  bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
-
-# Markdown report format
-bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl --format markdown -o report.md
-
-# Custom grader (LLM-as-Judge)
-bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl \
-  --strategy custom --grader ./my-llm-judge.ts -o comparison.json
-
-# With explicit labels
-bunx @plaited/agent-eval-harness compare \
-  --run "with-mcp:results-mcp.jsonl" \
-  --run "vanilla:results-vanilla.jsonl" \
-  -o comparison.json
-```
-
-**Use cases for compare:**
-- Same agent, different MCP servers
-- Same agent, different skills enabled
-- Same agent, different model versions
-- Different agents entirely
-
-### Trials Comparison (pass@k Analysis)
-
-Compare TrialResult files for reliability analysis:
-
-```bash
-# Auto-detect trials format
-bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
-
-# Explicit format (skip auto-detection)
-bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl --input-format trials -o comparison.json
-
-# Custom weights for trials comparison
-COMPARE_CAPABILITY=0.5 COMPARE_RELIABILITY=0.3 COMPARE_CONSISTENCY=0.2 \
-  bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
-```
-
-**Trials metrics:**
-
-| Metric | Description | Formula |
-|--------|-------------|---------|
-| **Capability** (passAtK) | Can solve at least once in K tries | `1 - (1-p)^k` |
-| **Reliability** (passExpK) | Solves consistently every time | `p^k` |
-| **Flakiness** | Gap between capability and reliability | `passAtK - passExpK` |
-| **Quality** (scores) | Aggregate grader scores across trials | avg/median/p25/p75 (only with grader) |
-| **Performance** (latency) | Aggregate trial durations | p50/p90/p99/mean/min/max (always present) |
-
-### Built-in Comparison Strategies
-
-**For CaptureResult (single-run):**
-
-| Strategy | Description | Env Vars |
-|----------|-------------|----------|
-| `weighted` (default) | Quality, latency, reliability | `COMPARE_QUALITY`, `COMPARE_LATENCY`, `COMPARE_RELIABILITY` |
-| `statistical` | Bootstrap for confidence intervals | `COMPARE_BOOTSTRAP_ITERATIONS` |
-| `custom` | Your own grader | `--grader path` |
-
-**For TrialResult (multi-run):**
-
-| Strategy | Description | Env Vars |
-|----------|-------------|----------|
-| `weighted` (default) | Capability, reliability, consistency | `COMPARE_CAPABILITY`, `COMPARE_RELIABILITY`, `COMPARE_CONSISTENCY` |
-| `statistical` | Bootstrap passAtK confidence intervals | `COMPARE_BOOTSTRAP_ITERATIONS` |
-| `custom` | Your own grader | `--grader path` |
-
-### Comparison Report Output
-
-**CaptureResult format** outputs `ComparisonReport`:
-
-```json
-{
-  "meta": { "generatedAt": "...", "runs": ["baseline", "variant"], "promptCount": 100 },
-  "quality": { "baseline": { "avgScore": 0.85, "passRate": 0.82 }, "variant": { ... } },
-  "performance": { "baseline": { "latency": { "p50": 1200, "p90": 3400 } }, ... },
-  "reliability": { "baseline": { "type": "run", "toolErrors": 5, "completionRate": 0.99 }, ... },
-  "headToHead": { "pairwise": [{ "runA": "baseline", "runB": "variant", "aWins": 35, "bWins": 55 }] }
-}
-```
-
-With `--strategy statistical`, quality and performance metrics include 95% confidence intervals:
-
-```json
-{
-  "quality": {
-    "baseline": {
-      "avgScore": 0.85,
-      "passRate": 0.82,
-      "confidenceIntervals": {
-        "avgScore": [0.82, 0.88],
-        "passRate": [0.79, 0.85]
-      }
-    }
-  },
-  "performance": {
-    "baseline": {
-      "latency": { "p50": 1200, "mean": 1350 },
-      "confidenceIntervals": {
-        "latencyMean": [1280, 1420]
-      }
-    }
-  }
-}
-```
-
-**TrialResult format** outputs `TrialsComparisonReport`:
-
-```json
-{
-  "meta": { "generatedAt": "...", "runs": ["claude", "gemini"], "promptCount": 50, "trialsPerPrompt": 5, "inputFormat": "trials" },
-  "capability": { "claude": { "avgPassAtK": 0.92, "medianPassAtK": 0.95 }, "gemini": { "..." : "..." } },
-  "reliability": { "claude": { "type": "trial", "avgPassExpK": 0.78, "medianPassExpK": 0.82 }, "gemini": { "..." : "..." } },
-  "flakiness": { "claude": { "avgFlakiness": 0.14, "flakyPromptCount": 12 }, "gemini": { "..." : "..." } },
-  "quality": { "claude": { "avgScore": 0.85, "medianScore": 0.90, "p25Score": 0.75, "p75Score": 0.95 }, "gemini": { "..." : "..." } },
-  "performance": { "claude": { "latency": { "p50": 1200, "p90": 3400, "p99": 5100, "mean": 1500, "min": 800, "max": 5200 }, "totalDuration": 375000 }, "gemini": { "..." : "..." } },
-  "headToHead": {
-    "capability": [{ "runA": "claude", "runB": "gemini", "aWins": 28, "bWins": 18, "ties": 4 }],
-    "reliability": ["..."],
-    "overall": ["..."]
-  }
-}
-```
-
-**Notes:**
-- `quality` is only present when a grader was used (trials have `score` fields)
-- `performance` is always present (every trial has `duration`)
-
-With `--strategy statistical`, capability, reliability, quality, and performance metrics include 95% confidence intervals:
-
-```json
-{
-  "capability": {
-    "claude": {
-      "avgPassAtK": 0.92,
-      "confidenceIntervals": { "avgPassAtK": [0.88, 0.95] }
-    }
-  },
-  "reliability": {
-    "claude": {
-      "type": "trial",
-      "avgPassExpK": 0.78,
-      "confidenceIntervals": { "avgPassExpK": [0.72, 0.84] }
-    }
-  },
-  "quality": {
-    "claude": {
-      "avgScore": 0.85,
-      "confidenceIntervals": { "avgScore": [0.82, 0.88] }
-    }
-  },
-  "performance": {
-    "claude": {
-      "latency": { "mean": 1500 },
-      "confidenceIntervals": { "latencyMean": [1380, 1620] }
-    }
-  }
-}
-```
-
-See [comparison-graders.md](references/comparison-graders.md) for complete comparison grader documentation including LLM-as-Judge patterns.
-
-### Comparison Grader Interface
-
-**CaptureResult grader:**
-
-```typescript
-import type { ComparisonGrader } from '@plaited/agent-eval-harness/pipeline'
-
-export const grade: ComparisonGrader = async ({ id, input, hint, runs }) => {
-  // runs is Record<string, { output, trajectory?, score?, duration?, toolErrors? }>
-  return {
-    rankings: [
-      { run: 'with-mcp', rank: 1, score: 0.9 },
-      { run: 'vanilla', rank: 2, score: 0.7 },
-    ],
-    reasoning: 'MCP run produced more accurate output'
-  }
-}
-```
-
-**TrialResult grader:**
-
-```typescript
-import type { TrialsComparisonGrader } from '@plaited/agent-eval-harness/pipeline'
-
-export const grade: TrialsComparisonGrader = async ({ id, input, hint, runs }) => {
-  // runs is Record<string, { passAtK?, passExpK?, k, trials }>
-  // Each trial in trials has: { duration, score?, pass?, output, trajectory }
-  return {
-    rankings: [
-      { run: 'claude', rank: 1, score: 0.92 },
-      { run: 'gemini', rank: 2, score: 0.85 },
-    ],
-    reasoning: 'Claude has higher reliability with lower flakiness'
-  }
-}
-```
-
-### Pipeline Workflow Diagram
-
-```mermaid
-flowchart LR
-    Prompts["prompts.jsonl"] --> Run["run"]
-    Schema["headless schema"] --> Run
-    Run --> Raw["raw.jsonl"]
-    Raw --> Extract["extract"]
-    Schema --> Extract
-    Extract --> Extracted["extracted.jsonl"]
-    Extracted --> Grade["grade"]
-    Grader["grader.ts"] --> Grade
-    Grade --> Graded["graded.jsonl"]
-    Graded --> Format["format"]
-    Format --> Output["report.md / .csv / .jsonl"]
-
-    Graded --> Compare["compare"]
-    Results2["other runs..."] --> Compare
-    CompareGrader["compare-grader.ts"] --> Compare
-    Compare --> Comparison["comparison.jsonl"]
-```
-
-## Schemas Command
-
-Export JSON schemas for non-TypeScript tools.
-
-```bash
-# List available schemas
-bunx @plaited/agent-eval-harness schemas
-
-# Export all schemas as JSON
-bunx @plaited/agent-eval-harness schemas --json -o schemas.json
-
-# Export specific schema
-bunx @plaited/agent-eval-harness schemas CaptureResult --json
-bunx @plaited/agent-eval-harness schemas TrialResult --json
-bunx @plaited/agent-eval-harness schemas GraderResult --json
-```
-
-### Available Schemas
-
-| Schema | Description |
-|--------|-------------|
-| `CaptureResult` | Single capture output (id, input, output, trajectory, timing) |
-| `TrialResult` | Multi-run trial output (includes passAtK, passExpK) |
-| `GraderResult` | Grader return value (pass, score, reasoning) |
-| `PromptInput` | Input prompt format |
-| `TrajectoryStep` | Single step in trajectory array |
-| `SummaryResult` | Compact summary format |
-
-### Usage in Other Languages
-
-Export schemas for validation in Python, Go, etc.:
-
-```bash
-# Export all schemas
-bunx @plaited/agent-eval-harness schemas --json -o schemas.json
-
-# Use in Python with jsonschema
-python -c "
-import json
-from jsonschema import validate
-
-with open('schemas.json') as f:
-    schemas = json.load(f)
-
-with open('results.jsonl') as f:
-    for line in f:
-        result = json.loads(line)
-        validate(result, schemas['CaptureResult'])
-        print(f'{result[\"id\"]}: valid')
-"
-```
-
-## Grader Interface
-
-Graders provide semantic pass/fail scoring for captured trajectories. The harness supports graders written in **any language**.
-
-### Git-Based Grading (Recommended for Coding Tasks)
-
-**Grade outcomes, not paths.** Use the optional `cwd` parameter to detect environmental changes with git:
-
-```typescript
-// git-grader.ts
-import type { Grader } from '@plaited/agent-eval-harness/schemas'
-
-export const grade: Grader = async ({ output, hint, cwd }) => {
-  if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' }
-  
-  // Detect file changes
-  const status = await Bun.$`git -C ${cwd} status --porcelain`.text()
-  const filesCreated = status
-    .split('\n')
-    .filter(line => line.startsWith('??'))
-    .map(line => line.slice(3).trim())
-  
-  // Verify tests pass
-  const testResult = await Bun.$`cd ${cwd} && bun test`.nothrow()
-  
-  return {
-    pass: filesCreated.length > 0 && testResult.exitCode === 0,
-    score: testResult.exitCode === 0 ? 1 : 0,
-    reasoning: `Files: ${filesCreated.join(', ')}. Tests: ${testResult.exitCode === 0 ? 'pass' : 'fail'}`,
-    outcome: {  // Optional: structured data for analysis
-      filesCreated,
-      testsPassed: testResult.exitCode === 0,
-      type: 'file_creation_with_tests'
-    }
-  }
-}
-```
-
-See [inline-graders.md](references/inline-graders.md#git-based-outcome-grading) for comprehensive git-based grading patterns.
-
-### Output-Based Grading (General Purpose)
-
-```typescript
-// my-grader.ts
-import type { Grader } from '@plaited/agent-eval-harness/schemas'
-
-export const grade: Grader = async ({ input, output, hint, trajectory }) => {
-  const pass = output.toLowerCase().includes(hint?.toLowerCase() ?? '')
-  return {
-    pass,
-    score: pass ? 1 : 0,
-    reasoning: pass ? 'Contains hint content' : 'Missing hint content'
-  }
-}
-```
-
-**Note:** `input` can be `string` (single turn) or `string[]` (multi-turn). The `hint` field provides grader context (renamed from `expected`).
-
-### Python/Executable Graders
-
-Any executable can be a grader using stdin/stdout JSON protocol:
-
-```python
-#!/usr/bin/env python3
-import json, sys
-
-data = json.load(sys.stdin)
-output = data.get("output", "").lower()
-hint = (data.get("hint") or "").lower()
-
-pass_result = hint in output if hint else True
-print(json.dumps({
-    "pass": pass_result,
-    "score": 1.0 if pass_result else 0.0,
-    "reasoning": "Contains hint" if pass_result else "Missing hint"
-}))
-```
-
-```bash
-chmod +x ./grader.py
-bunx @plaited/agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grader.py -o results.jsonl
-```
-
-See [inline-graders.md](references/inline-graders.md) for complete grader documentation including LLM-as-Judge patterns.
-
-## Input Format
-
-Each line in `prompts.jsonl`:
-
-```jsonl
-{"id":"test-001","input":"Create a button","hint":"should contain <button>"}
-{"id":"test-002","input":["Create a button","Make it blue"],"metadata":{"category":"ui"}}
-```
-
-| Field | Required | Description |
-|-------|----------|-------------|
-| `id` | Yes | Unique identifier |
-| `input` | Yes | Single prompt (string) or conversation turns (string[]) |
-| `hint` | No | Grader context - what to look for (not strict match) |
-| `reference` | No | Reference solution (for validate-refs) |
-| `metadata` | No | Tags, category, difficulty for filtering |
-| `timeout` | No | Override default timeout for this prompt |
-
-**Session behavior:** Each JSONL entry = 1 fresh session
-- `input: string` → 1 session, 1 prompt
-- `input: string[]` → 1 session, N prompts (sequential turns)
-
-## Output Format
-
-Full trajectory JSONL (always):
-
-```jsonl
-{
-  "id": "test-001",
-  "input": "Find the CEO of Anthropic",
-  "output": "The CEO of Anthropic is Dario Amodei.",
-  "hint": "should mention Dario Amodei",
-  "trajectory": [
-    {"type": "thought", "content": "I'll search for this...", "timestamp": 100},
-    {"type": "tool_call", "name": "WebSearch", "status": "completed", "input": {...}, "output": {...}, "duration": 500},
-    {"type": "message", "content": "The CEO of Anthropic is Dario Amodei.", "timestamp": 700}
-  ],
-  "metadata": {
-    "category": "search",
-    "agent": "--schema ./claude.json",
-    "trajectoryRichness": "full",
-    "turnCount": 1
-  },
-  "timing": {
-    "start": 1704067200000,
-    "end": 1704067201234,
-    "firstResponse": 100,
-    "sessionCreation": 234,
-    "total": 1234,
-    "inputTokens": 150,
-    "outputTokens": 85
-  },
-  "toolErrors": false
-}
-```
-
-### Output Fields
-
-| Field | Description |
-|-------|-------------|
-| `input` | Original prompt (string or string[] for multi-turn) |
-| `hint` | Grader context hint (if provided) |
-| `metadata.trajectoryRichness` | `"full"` \| `"messages-only"` \| `"minimal"` |
-| `metadata.turnCount` | Number of conversation turns (1 for string, N for array) |
-| `timing.sessionCreation` | Time to create session (ms) |
-| `timing.total` | Total duration (end - start) |
-| `timing.inputTokens` | Input tokens consumed (if available from adapter) |
-| `timing.outputTokens` | Output tokens generated (if available from adapter) |
-| `toolErrors` | Whether any tool calls failed |
-
-**Note:** `toolErrors` replaces misleading `status: 'passed'|'failed'`. Real pass/fail comes from YOUR grader.
-
-### Trust Boundary
-
-Trajectory data contains content from the agent's execution, including tool results from **external sources** (web searches, file reads, API calls). The `tool_call.input` and `tool_call.output` fields preserve raw values from CLI output without sanitization.
-
-**Graders and analysis scripts should treat trajectory content as untrusted:**
-- Validate `cwd` paths before using in shell commands (see [inline-graders.md](references/inline-graders.md#security-considerations))
-- Do not execute trajectory content as code
-- Use injection-aware prompting when passing trajectory content to LLM-as-judge graders
-
-## Schema Exports
-
-Consumers can import Zod schemas directly:
-
-```typescript
-import { CaptureResultSchema, TrialResultSchema } from '@plaited/agent-eval-harness/schemas'
-
-// Validate external data
-const result = CaptureResultSchema.parse(jsonData)
-
-// Generate JSON Schema (Zod 4 native)
-import { z } from 'zod'
-const jsonSchema = z.toJSONSchema(CaptureResultSchema)
-```
-
-### Discriminated Unions for Reliability Metrics
-
-Reliability metrics include a `type` discriminator for type-safe parsing:
-
-```typescript
-import { z } from 'zod'
-import {
-  ReliabilityMetricsSchema,       // type: 'run'
-  TrialsReliabilityMetricsSchema  // type: 'trial'
-} from '@plaited/agent-eval-harness/schemas'
-
-// Create a unified schema for both metric types
-const UnifiedReliabilitySchema = z.discriminatedUnion('type', [
-  ReliabilityMetricsSchema,
-  TrialsReliabilityMetricsSchema,
-])
-
-// Type-safe parsing with automatic narrowing
-const metrics = UnifiedReliabilitySchema.parse(data)
-if (metrics.type === 'run') {
-  // TypeScript knows: ReliabilityMetrics
-  console.log(metrics.toolErrors, metrics.completionRate)
-} else {
-  // TypeScript knows: TrialsReliabilityMetrics
-  console.log(metrics.avgPassExpK, metrics.medianPassExpK)
-}
-```
-
-Or export JSON schemas for non-TypeScript tools:
-
-```bash
-bunx @plaited/agent-eval-harness schemas --json -o schemas.json
-bunx @plaited/agent-eval-harness schemas CaptureResult --json
-```
-
-## Execution Environment
-
-**Recommendation:** Run the harness in Docker containers for consistent, isolated execution.
-
-```bash
-# Run integration tests via Docker
-docker compose -f docker-compose.test.yml run --rm test
-
-# Or with explicit API keys
-ANTHROPIC_API_KEY=sk-... GEMINI_API_KEY=... docker compose -f docker-compose.test.yml run --rm test
-```
-
-### Docker Requirements
-
-| Requirement | Reason |
-|-------------|--------|
-| **Node.js 24+** | Gemini CLI uses modern JS features (optional chaining) |
-| **Non-root user** | Claude CLI blocks `--dangerously-skip-permissions` as root |
-| **Gemini API key** | Pass `GEMINI_API_KEY` for Gemini CLI |
-
-See [docker-evals.md](references/docker-evals.md) for complete Docker setup guide, debugging tips, and CI integration patterns.
-
-### Multi-turn Conversations
-
-Use `input: string[]` to execute multi-turn conversations within a single session:
-
-```jsonl
-{"id":"context-001","input":["Remember this number: 42","What number did I ask you to remember?"],"hint":"42"}
-{"id":"context-002","input":["My name is Alice","What is my name?"],"hint":"Alice"}
-```
-
-Run with the headless adapter:
-
-```bash
-# Using Claude Code via headless adapter
-bunx @plaited/agent-eval-harness capture multi-turn.jsonl \
-  bunx @plaited/agent-eval-harness headless --schema ./claude-headless.json \
-  -o results.jsonl
-
-# Using Gemini CLI via headless adapter
-GEMINI_API_KEY=... bunx @plaited/agent-eval-harness capture multi-turn.jsonl \
-  bunx @plaited/agent-eval-harness headless --schema ./gemini-headless.json \
-  -o results.jsonl
-```
-
-**Key points:**
-- Each JSONL entry = 1 fresh session
-- `input: string[]` sends sequential turns to the **same session**
-- Works with both `stream` mode (Claude) and `iterative` mode (Gemini)
-- The adapter handles context preservation automatically
-
-## Downstream Integration
-
-The harness outputs standard JSONL that pipes to any tool:
-
-```bash
-# Filter with jq
-cat results.jsonl | jq 'select(.metadata.category == "ui")'
-
-# Count tool usage
-cat results.jsonl | jq -s 'map(.trajectory | map(select(.type == "tool_call")) | length) | add'
-
-# Summarize for quick analysis
-bunx @plaited/agent-eval-harness summarize results.jsonl -o summary.jsonl
-
-# Compare runs with built-in strategies
-bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
-```
-
-## Quick Reference
-
-| Resource | Description |
-|----------|-------------|
-| `bunx @plaited/agent-eval-harness` | CLI help |
-| [output-formats.md](references/output-formats.md) | JSONL schemas, command details |
-| [inline-graders.md](references/inline-graders.md) | Single input/output graders (TypeScript, Python, shell) |
-| [comparison-graders.md](references/comparison-graders.md) | Comparison strategies (weighted, statistical, LLM-as-Judge) |
-| [calibration.md](references/calibration.md) | Grader calibration workflow |
-| [eval-concepts.md](references/eval-concepts.md) | Evaluation concepts (pass@k, pass^k) |
-| [docker-evals.md](references/docker-evals.md) | Docker setup, debugging, CI integration |
-
-## Related
-
-- **[headless-adapters skill](../headless-adapters/SKILL.md)** - Schema-driven adapters for headless CLI agents
diff --git a/.agents/skills/agent-eval-harness/assets/Dockerfile.eval b/.agents/skills/agent-eval-harness/assets/Dockerfile.eval
deleted file mode 100644
index 7719377..0000000
--- a/.agents/skills/agent-eval-harness/assets/Dockerfile.eval
+++ /dev/null
@@ -1,25 +0,0 @@
-# Agent Eval Harness Docker Configuration
-#
-# Example Dockerfile for running agent evaluations in an isolated container.
-# Copy this to your project and customize as needed.
-#
-# Usage:
-#   docker build -f Dockerfile.eval -t agent-eval-harness .
-#   docker run --rm -e ANTHROPIC_API_KEY agent-eval-harness bunx @plaited/agent-eval-harness capture prompts.jsonl --schema ./claude.json
-
-FROM oven/bun:1.2.9
-
-# Install git (required for some agent operations)
-RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-
-# Copy package files first for better layer caching
-COPY package.json bun.lock* ./
-RUN bun install --frozen-lockfile
-
-# Copy source files
-COPY . .
-
-# Default command - override with your harness invocation
-CMD ["bun", "test"]
diff --git a/.agents/skills/agent-eval-harness/assets/docker-compose.eval.yml b/.agents/skills/agent-eval-harness/assets/docker-compose.eval.yml
deleted file mode 100644
index 9b8e4b0..0000000
--- a/.agents/skills/agent-eval-harness/assets/docker-compose.eval.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Agent Eval Harness Docker Compose Configuration
-#
-# Example docker-compose for running agent evaluations.
-# Copy this to your project and customize as needed.
-#
-# Usage:
-#   ANTHROPIC_API_KEY=sk-... docker compose -f docker-compose.eval.yml run --rm agent-eval-harness
-
-services:
-  agent-eval-harness:
-    build:
-      context: .
-      dockerfile: Dockerfile.eval
-    environment:
-      - ANTHROPIC_API_KEY
-    volumes:
-      # Mount output directory to persist results
-      - ./results:/app/results
-    command: ["bunx", "@plaited/agent-eval-harness", "capture", "prompts.jsonl", "--schema", "./claude.json", "-o", "results/output.jsonl"]
diff --git a/.agents/skills/agent-eval-harness/references/calibration.md b/.agents/skills/agent-eval-harness/references/calibration.md
deleted file mode 100644
index ee0f61c..0000000
--- a/.agents/skills/agent-eval-harness/references/calibration.md
+++ /dev/null
@@ -1,146 +0,0 @@
-# Grader Calibration Guide
-
-## Why Calibrate?
-
-When an agent fails a task, there are two possibilities:
-
-1. **The agent failed** - It made mistakes, used wrong tools, produced incorrect output
-2. **The grader failed** - The grader is too strict, has bugs, or doesn't handle valid variations
-
-Calibration helps you distinguish between these cases **before** running expensive trials.
-
-## When to Calibrate
-
-| Situation | Action |
-|-----------|--------|
-| Just wrote a new grader | Calibrate immediately with 10-20 samples |
-| Pass rate dropped unexpectedly | Calibrate to check if grader is too strict |
-| Pass rate seems suspiciously high | Calibrate to check if grader is too lenient |
-| Before running k=10 trials | Calibrate to catch bugs early (save API costs) |
-| Grader uses LLM-as-judge | Calibrate to verify LLM reasoning is sound |
-
-## Calibration Workflow
-
-### Step 1: Run Initial Capture
-
-```bash
-agent-eval-harness capture prompts.jsonl \
-  --schema ./claude-headless.json \
-  --grader ./my-grader.ts \
-  -o results.jsonl
-```
-
-### Step 2: Sample Failures for Review
-
-```bash
-# Sample 10 random failures
-agent-eval-harness calibrate results.jsonl --sample 10 -o calibration.md
-```
-
-This outputs a markdown file with:
-- The prompt input
-- The agent's output
-- Trajectory snippet (first 2 steps, middle, last 2 steps)
-- The grader's score and reasoning
-
-### Step 3: Human Review
-
-For each sampled failure, answer:
-
-| Question | If Yes... |
-|----------|-----------|
-| Did the agent actually fail the task? | Grader is correct ✓ |
-| Is the output correct but grader rejected it? | Grader is too strict → fix grader |
-| Is the grader's reasoning wrong? | Grader has bugs → fix grader |
-| Is the task ambiguous? | Improve prompt or hint |
-
-### Step 4: Fix and Re-run
-
-If you found grader issues:
-
-1. Fix the grader
-2. Re-score existing results (no need to re-run agent):
-   ```bash
-   agent-eval-harness grade results.jsonl --grader ./fixed-grader.ts -o re-scored.jsonl
-   ```
-3. Re-calibrate to verify fix
-
-### Step 5: Validate with Reference Solutions
-
-Before blaming the agent, prove the task is solvable:
-
-```bash
-# Ensure reference solutions pass
-agent-eval-harness validate-refs prompts.jsonl --grader ./my-grader.ts
-```
-
-If references fail, the grader (not the agent) is broken.
-
-## Calibration Metrics
-
-Track these over time:
-
-| Metric | Formula | Target |
-|--------|---------|--------|
-| False Negative Rate | (correct outputs marked fail) / (total fails) | < 5% |
-| False Positive Rate | (wrong outputs marked pass) / (total passes) | < 5% |
-| Grader Agreement | (human agrees with grader) / (samples reviewed) | > 90% |
-
-## Example: Calibrating an LLM-as-Judge Grader
-
-LLM graders are powerful but need extra calibration:
-
-```bash
-# 1. Run with LLM grader
-agent-eval-harness capture prompts.jsonl --grader ./llm-grader.ts -o results.jsonl
-
-# 2. Sample BOTH failures AND passes
-agent-eval-harness calibrate results.jsonl --sample 5 --include-passes -o calibration.md
-
-# 3. For each sample, verify LLM reasoning makes sense
-# - Is the reasoning consistent with the score?
-# - Would a human agree with this reasoning?
-# - Are there edge cases the LLM missed?
-```
-
-## Common Calibration Pitfalls
-
-### 1. Only Reviewing Failures
-
-Always sample some passes too. A grader that passes everything has 100% pass rate but is useless.
-
-### 2. Small Sample Size
-
-10 samples catches ~65% of issues at 10% error rate. For critical evals, sample 30+.
-
-### 3. Confirmation Bias
-
-Don't calibrate with the goal of "proving the grader works." Actively look for failures.
-
-### 4. One-Time Calibration
-
-Re-calibrate when:
-- Prompts change
-- Agent behavior changes
-- Grader logic is updated
-
-## Calibration Checklist
-
-Before trusting your evaluation results:
-
-- [ ] Ran `validate-refs` to ensure references pass
-- [ ] Sampled 10+ failures for manual review
-- [ ] Sampled 5+ passes to check for false positives
-- [ ] False negative rate < 5%
-- [ ] False positive rate < 5%
-- [ ] Grader reasoning makes sense for edge cases
-- [ ] Re-calibrated after any grader changes
-
-## Related Commands
-
-| Command | Purpose |
-|---------|---------|
-| `calibrate` | Sample failures for review |
-| `validate-refs` | Check reference solutions pass |
-| `grade` | Re-score with updated grader |
-| `balance` | Analyze test set coverage |
diff --git a/.agents/skills/agent-eval-harness/references/comparison-graders.md b/.agents/skills/agent-eval-harness/references/comparison-graders.md
deleted file mode 100644
index 1f762e1..0000000
--- a/.agents/skills/agent-eval-harness/references/comparison-graders.md
+++ /dev/null
@@ -1,473 +0,0 @@
-# Comparison Graders
-
-## Overview
-
-The `compare` command supports three grading strategies:
-
-1. **Weighted** (default) - Configurable weights for quality, latency, reliability
-2. **Statistical** - Bootstrap sampling for confidence intervals
-3. **Custom** - Your own logic-based or LLM-as-Judge grader
-
-## Built-in Strategy: Weighted
-
-Scores runs by combining quality, latency, and reliability metrics with configurable weights.
-
-### How It Works
-
-```
-weighted_score = (quality × w_q) + (latency × w_l) + (reliability × w_r)
-```
-
-Where:
-- **quality**: Grader score (0-1) from previous grading step
-- **latency**: Inverse duration (faster = higher, normalized)
-- **reliability**: 1 if no tool errors, 0 otherwise
-
-### Configuration
-
-Default weights: `quality=0.5, latency=0.3, reliability=0.2`
-
-Override via environment variables:
-
-```bash
-COMPARE_QUALITY=0.7 COMPARE_LATENCY=0.2 COMPARE_RELIABILITY=0.1 \
-  agent-eval-harness compare a.jsonl b.jsonl -o comparison.json
-```
-
-### When to Use
-
-- Quick comparisons without custom logic
-- Balancing speed vs correctness tradeoffs
-- Initial exploration before writing custom graders
-
-## Built-in Strategy: Statistical
-
-Uses bootstrap sampling to compute confidence intervals and flag statistically significant differences.
-
-### How It Works
-
-1. Resample scores with replacement (1000 iterations by default)
-2. Compute mean of each resample
-3. Calculate 95% confidence interval from percentiles (2.5th and 97.5th)
-4. Return median of bootstrap means as central estimate
-5. If winner's lower CI > second's upper CI → statistically significant
-
-### Configuration
-
-```bash
-COMPARE_BOOTSTRAP_ITERATIONS=5000 \
-  agent-eval-harness compare a.jsonl b.jsonl --strategy statistical -o comparison.json
-```
-
-### When to Use
-
-- Rigorous A/B testing
-- Publishing results (need significance claims)
-- Small sample sizes where noise matters
-- When you need uncertainty bounds on metrics
-
-### Output
-
-The statistical strategy adds `confidenceIntervals` to quality and performance metrics:
-
-**CaptureResult format:**
-```json
-{
-  "quality": {
-    "run-a": {
-      "avgScore": 0.85,
-      "passRate": 0.90,
-      "confidenceIntervals": {
-        "avgScore": [0.82, 0.88],
-        "passRate": [0.87, 0.93]
-      }
-    }
-  },
-  "performance": {
-    "run-a": {
-      "latency": { "mean": 1200 },
-      "confidenceIntervals": {
-        "latencyMean": [1100, 1300]
-      }
-    }
-  }
-}
-```
-
-**TrialResult format:**
-```json
-{
-  "capability": {
-    "run-a": {
-      "avgPassAtK": 0.92,
-      "confidenceIntervals": { "avgPassAtK": [0.88, 0.95] }
-    }
-  },
-  "reliability": {
-    "run-a": {
-      "type": "trial",
-      "avgPassExpK": 0.78,
-      "confidenceIntervals": { "avgPassExpK": [0.72, 0.84] }
-    }
-  },
-  "quality": {
-    "run-a": {
-      "avgScore": 0.85,
-      "confidenceIntervals": { "avgScore": [0.82, 0.88] }
-    }
-  },
-  "performance": {
-    "run-a": {
-      "latency": { "mean": 1500 },
-      "confidenceIntervals": { "latencyMean": [1380, 1620] }
-    }
-  }
-}
-```
-
-**Markdown output** includes 95% CI columns when using statistical strategy:
-
-```markdown
-## Quality
-| Run | Avg Score | 95% CI | Pass Rate | 95% CI | Pass | Fail |
-|-----|-----------|--------|-----------|--------|------|------|
-| run-a | 0.850 | [0.820, 0.880] | 90.0% | [0.870, 0.930] | 45 | 5 |
-```
-
-The per-prompt reasoning indicates significance:
-
-```json
-{
-  "reasoning": "Winner \"run-a\" is statistically significant (p<0.05, non-overlapping 95% CIs)"
-}
-```
-
-Or:
-
-```json
-{
-  "reasoning": "No statistically significant difference between top runs (overlapping 95% CIs)"
-}
-```
-
-## Custom Graders
-
-For specialized comparison logic or LLM-as-Judge evaluation.
-
-### Grader Interface
-
-```typescript
-import type { ComparisonGrader } from '@plaited/agent-eval-harness/pipeline'
-
-type ComparisonGraderInput = {
-  id: string                    // Prompt identifier
-  input: string | string[]      // Original prompt
-  hint?: string                 // Grader context
-  metadata?: Record<string, unknown> // Optional metadata from prompt
-  runs: Record<string, {
-    output: string              // Agent output
-    trajectory?: TrajectoryStep[]
-    score?: GraderResult        // If previously graded
-    duration?: number           // Total ms
-    toolErrors?: boolean
-  }>
-}
-
-type ComparisonGraderResult = {
-  rankings: Array<{
-    run: string                 // Run label
-    rank: number               // 1 = best
-    score: number              // 0-1
-  }>
-  reasoning?: string           // Explanation
-}
-```
-
-### Building a Custom Grader
-
-**Step 1: Create the grader file**
-
-```typescript
-// my-compare-grader.ts
-import type { ComparisonGrader } from '@plaited/agent-eval-harness/pipeline'
-
-export const grade: ComparisonGrader = async ({ id, input, hint, runs }) => {
-  // Grade each run in isolation first
-  const scored = await Promise.all(
-    Object.entries(runs).map(async ([label, run]) => {
-      const score = await scoreRun(run, hint)
-      return { label, score }
-    })
-  )
-
-  // Sort by score descending
-  const sorted = scored.sort((a, b) => b.score - a.score)
-
-  return {
-    rankings: sorted.map((r, i) => ({
-      run: r.label,
-      rank: i + 1,
-      score: r.score
-    })),
-    reasoning: `Ranked by scoring criteria`
-  }
-}
-
-const scoreRun = async (run: { output: string; toolErrors?: boolean }, hint?: string): Promise<number> => {
-  let score = 0
-
-  // Example: Check if output contains hint
-  if (hint && run.output.toLowerCase().includes(hint.toLowerCase())) {
-    score += 0.5
-  }
-
-  // Example: Penalize tool errors
-  if (run.toolErrors) {
-    score -= 0.2
-  }
-
-  return Math.max(0, Math.min(1, score + 0.5))
-}
-```
-
-**Step 2: Use the grader**
-
-```bash
-agent-eval-harness compare a.jsonl b.jsonl \
-  --strategy custom \
-  --grader ./my-compare-grader.ts \
-  -o comparison.json
-```
-
-### LLM-as-Judge Pattern
-
-For semantic evaluation, integrate an LLM into your grader:
-
-```typescript
-// llm-compare-grader.ts
-import Anthropic from '@anthropic-ai/sdk'
-import type { ComparisonGrader } from '@plaited/agent-eval-harness/pipeline'
-
-const client = new Anthropic()
-
-export const grade: ComparisonGrader = async ({ id, input, hint, runs }) => {
-  // Build prompt for LLM
-  const runDescriptions = Object.entries(runs)
-    .map(([label, run]) => `## ${label}\nOutput: ${run.output}`)
-    .join('\n\n')
-
-  const response = await client.messages.create({
-    model: 'claude-sonnet-4-20250514',
-    max_tokens: 1024,
-    messages: [{
-      role: 'user',
-      content: `Compare these agent runs for the task: "${input}"
-${hint ? `Expected: ${hint}` : ''}
-
-${runDescriptions}
-
-Rank from best to worst. Respond as JSON:
-{"rankings": [{"run": "label", "rank": 1, "score": 0.95}, ...], "reasoning": "..."}`
-    }]
-  })
-
-  const text = response.content[0]?.type === 'text' ? response.content[0].text : ''
-  const json = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] ?? '{}')
-
-  return {
-    rankings: json.rankings ?? [],
-    reasoning: json.reasoning ?? 'LLM comparison complete'
-  }
-}
-```
-
-**Key principle:** Grade each run in isolation, then rank. This produces consistent, reproducible comparisons.
-
-## Tool Usage Analysis
-
-Tool usage is NOT included in standard comparison output because:
-
-1. Different adapters provide different `trajectoryRichness` levels
-2. The `tool_call.name` field often contains tool use IDs, not human-readable names
-3. Adapters with `messages-only` richness don't capture tool calls
-
-### Custom Tool Analysis Grader
-
-For tool analysis, create a custom grader:
-
-```typescript
-import type { ComparisonGrader } from '@plaited/agent-eval-harness/pipeline'
-
-export const grade: ComparisonGrader = async ({ runs }) => {
-  const runAnalysis = Object.entries(runs).map(([label, run]) => {
-    const toolCalls = (run.trajectory ?? []).filter(s => s.type === 'tool_call')
-    return { label, toolCount: toolCalls.length }
-  })
-
-  // Rank by efficiency (fewer calls = better)
-  const sorted = runAnalysis.sort((a, b) => a.toolCount - b.toolCount)
-
-  return {
-    rankings: sorted.map((r, i) => ({
-      run: r.label,
-      rank: i + 1,
-      score: 1 / (1 + r.toolCount / 10)
-    })),
-    reasoning: `Tool counts: ${sorted.map(r => `${r.label}=${r.toolCount}`).join(', ')}`
-  }
-}
-```
-
-### Adapter Format Reference
-
-| Adapter | `trajectoryRichness` | Tool Name Format |
-|---------|---------------------|------------------|
-| claude-headless | `full` | Tool use ID (e.g., `toolu_017...`) |
-| gemini-headless | `full` | Function name |
-| droid | `messages-only` | N/A |
-| Custom | Varies | Check your schema |
-
-## Strategy Selection Guide
-
-| Use Case | Recommended Strategy |
-|----------|---------------------|
-| Quick comparison | `weighted` (default) |
-| A/B test with significance | `statistical` |
-| Semantic quality evaluation | `custom` (LLM-as-Judge) |
-| Complex multi-criteria scoring | `custom` (logic-based) |
-| Tool usage analysis | `custom` (see above) |
-
-## Hybrid Graders (Weighted + LLM)
-
-Layer LLM-as-Judge on top of built-in `weighted` comparison for balanced scoring.
-
-### Architecture
-
-| Component | Weight | Source |
-|-----------|--------|--------|
-| **Weighted score** | 50% | Built-in: quality, latency, reliability |
-| **LLM Judge** | 50% | Semantic quality assessment |
-
-```
-final_score = (weighted_score × 0.5) + (llm_score × 0.5)
-```
-
-**Why 50/50?**
-- Weighted catches objective metrics (speed, errors, prior grading)
-- LLM catches semantic quality (relevance, completeness, accuracy)
-- Neither alone is sufficient for production comparisons
-
-### Available Context
-
-```
-{ id, input, hint?, metadata?, runs: { [label]: { output, trajectory?, score?, duration?, toolErrors? } } }
-```
-
-`hint` and prior `score` (from inline grading) are available for LLM context.
-
-### Implementation (Any Language)
-
-Graders use stdin/stdout JSON protocol. Example patterns:
-
-**TypeScript:**
-```typescript
-import type { ComparisonGrader } from '@plaited/agent-eval-harness/pipeline'
-
-export const grade: ComparisonGrader = async ({ input, hint, runs }) => {
-  const scored = await Promise.all(
-    Object.entries(runs).map(async ([label, run]) => {
-      // 50%: Use prior score from weighted/inline grading
-      const weighted = run.score?.score ?? 0
-
-      // 50%: LLM semantic assessment
-      const llm = await llmJudge(input, run.output, hint)
-
-      return { label, score: (weighted * 0.5) + (llm * 0.5) }
-    })
-  )
-  return { rankings: rank(scored) }
-}
-```
-
-**Python:**
-```python
-#!/usr/bin/env python3
-import json, sys
-
-data = json.load(sys.stdin)
-scored = []
-
-for label, run in data["runs"].items():
-    weighted = run.get("score", {}).get("score", 0)  # Prior grading
-    llm = llm_judge(data["input"], run["output"], data.get("hint"))
-    scored.append({"run": label, "score": (weighted * 0.5) + (llm * 0.5)})
-
-ranked = sorted(scored, key=lambda x: -x["score"])
-print(json.dumps({
-    "rankings": [{"run": r["run"], "rank": i+1, "score": r["score"]}
-                 for i, r in enumerate(ranked)]
-}))
-```
-
-### LLM Judge Component
-
-The LLM evaluates semantic quality (0-1 normalized):
-
-```python
-def llm_judge(input, output, hint=None):
-    prompt = f"""Rate this response 0-100:
-Task: {input}
-{"Expected: " + hint if hint else ""}
-Response: {output}
-
-Criteria: relevance, completeness, accuracy, clarity
-Return only the number."""
-
-    score = int(llm_call(prompt)) / 100
-    return max(0, min(1, score))
-```
-
-### Workflow
-
-1. Run `capture` with inline `--grader` to get per-result scores
-2. Run `compare --strategy custom --grader ./hybrid.py`
-3. Hybrid grader combines prior scores (50%) + LLM judgment (50%)
-
-```bash
-# Step 1: Capture with inline grader
-agent-eval-harness capture prompts.jsonl -s claude.json -g ./grader.ts -o run-a.jsonl
-agent-eval-harness capture prompts.jsonl -s gemini.json -g ./grader.ts -o run-b.jsonl
-
-# Step 2: Compare with hybrid grader
-agent-eval-harness compare run-a.jsonl run-b.jsonl \
-  --strategy custom --grader ./hybrid-compare.py -o comparison.json
-```
-
-### Calibration
-
-1. **Fallback**: Without LLM API key, use weighted-only (score × 1.0)
-2. **Caching**: Cache LLM calls by hash(input + output) to reduce cost
-3. **Threshold**: Adjust pass threshold based on labeled samples
-
-### Trials Variant
-
-For `TrialsComparisonGrader`, combine passAtK metrics with LLM assessment of best trial:
-
-```python
-weighted = (run["passAtK"] * 0.5) + (run["passExpK"] * 0.3) + consistency * 0.2
-llm = llm_judge_best_trial(run["trials"])
-final = (weighted * 0.5) + (llm * 0.5)
-```
-
-The trials comparison report also includes **quality** and **performance** metrics when available:
-
-- **Quality** (optional): `avgScore`, `medianScore`, `p25Score`, `p75Score` — aggregated from `trial.score` across all prompts. Only present when a grader was used during trials capture.
-- **Performance** (always present): `latency` (p50/p90/p99/mean/min/max), `totalDuration` — aggregated from `trial.duration` across all prompts.
-
-With `--strategy statistical`, both include `confidenceIntervals` (`avgScore` CI for quality, `latencyMean` CI for performance).
-
-## Related Documentation
-
-- [inline-graders.md](inline-graders.md) - Single input/output graders
-- [eval-concepts.md](eval-concepts.md) - pass@k, pass^k metrics
-- [calibration.md](calibration.md) - Grader calibration workflow
diff --git a/.agents/skills/agent-eval-harness/references/docker-evals.md b/.agents/skills/agent-eval-harness/references/docker-evals.md
deleted file mode 100644
index 89a2769..0000000
--- a/.agents/skills/agent-eval-harness/references/docker-evals.md
+++ /dev/null
@@ -1,184 +0,0 @@
-# Running Evals in Docker
-
-Docker provides a consistent, isolated environment for running agent evaluations. This guide covers lessons learned from real debugging sessions.
-
-## Why Docker?
-
-| Benefit | Description |
-|---------|-------------|
-| **Reproducibility** | Same environment in CI and local development |
-| **Isolation** | API keys and CLIs don't pollute host system |
-| **CI Integration** | GitHub Actions can run Docker Compose directly |
-| **Multi-CLI Support** | Bundle multiple agent CLIs (Claude, Gemini) in one image |
-
-## Dockerfile Structure
-
-### Key Requirements
-
-```dockerfile
-# Start with Bun for fast TypeScript execution
-FROM oven/bun:1.2.9
-
-# Install Node.js 24+ (required for Gemini CLI's modern JS features)
-RUN apt-get update && apt-get install -y git curl ca-certificates gnupg && \
-    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \
-    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_24.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
-    apt-get update && apt-get install -y nodejs && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install Gemini CLI (npm global is accessible to all users)
-RUN npm install -g @google/gemini-cli
-
-# Create non-root user for Claude CLI
-RUN useradd -m -s /bin/bash testuser
-USER testuser
-WORKDIR /home/testuser
-
-# Install Claude CLI as non-root user
-RUN curl -fsSL https://claude.ai/install.sh | bash
-ENV PATH="/home/testuser/.local/bin:$PATH"
-```
-
-## Common Pitfalls & Solutions
-
-### 1. Claude CLI Refuses to Run as Root
-
-**Symptom:**
-```
---dangerously-skip-permissions cannot be used with root/sudo privileges for security reasons
-```
-
-**Cause:** Claude CLI blocks auto-approve flags when running as root for security.
-
-**Solution:** Create a non-root user and run tests as that user:
-```dockerfile
-RUN useradd -m -s /bin/bash testuser
-USER testuser
-RUN curl -fsSL https://claude.ai/install.sh | bash
-```
-
-### 2. Gemini CLI Syntax Error
-
-**Symptom:**
-```
-SyntaxError: Unexpected token '.'
-```
-
-**Cause:** Gemini CLI uses optional chaining (`?.`) which requires Node.js 14+. The Bun base image includes Node.js 12.
-
-**Solution:** Install Node.js 24 (latest LTS):
-```dockerfile
-RUN apt-get update && apt-get install -y nodejs  # From NodeSource repo
-```
-
-### 3. Global Package Permission Denied
-
-**Symptom:**
-```
-error: Failed to link @google/gemini-cli: EACCES
-```
-
-**Cause:** Bun's global install creates packages in user-specific directories (`~/.bun`).
-
-**Solution:** Use npm for system-wide packages (installs to `/usr/local`):
-```dockerfile
-# As root, before USER switch
-RUN npm install -g @google/gemini-cli
-```
-
-### 4. CLI Not Found in PATH
-
-**Symptom:**
-```
-which gemini  # fails
-```
-
-**Cause:** Non-root user doesn't have `/usr/local/bin` in PATH, or package was installed to root's home directory.
-
-**Solution:** Verify symlinks point to accessible locations:
-```bash
-# Debug inside container
-docker compose run --rm test bash -c 'which gemini && ls -la $(which gemini)'
-```
-
-### 5. Environment Variables Not Passed
-
-**Symptom:** Tests time out silently with no API calls being made.
-
-**Solution:** Pass all required API keys via docker-compose.yml:
-```yaml
-environment:
-  - ANTHROPIC_API_KEY
-  - GEMINI_API_KEY
-```
-
-## Debugging Checklist
-
-When tests fail in Docker, run these checks:
-
-```bash
-# 1. Verify CLI installation and access
-docker compose run --rm test bash -c '
-  echo "=== Node.js ===" && node --version &&
-  echo "=== Bun ===" && bun --version &&
-  echo "=== Claude ===" && which claude && claude --version &&
-  echo "=== Gemini ===" && which gemini && gemini --version
-'
-
-# 2. Verify environment variables
-docker compose run --rm test bash -c '
-  echo "ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:+set}"
-  echo "GEMINI_API_KEY: ${GEMINI_API_KEY:+set}"
-'
-
-# 3. Test CLI directly
-docker compose run --rm test bash -c '
-  gemini -p "Say hello" --output-format stream-json 2>&1 | head -5
-'
-
-# 4. Run as root to isolate permission issues
-docker compose run --rm --user root test bash -c 'whoami && which claude'
-```
-
-## CI Integration (GitHub Actions)
-
-```yaml
-test-integration:
-  runs-on: ubuntu-latest
-  steps:
-    - uses: actions/checkout@v4
-    - name: Run integration tests
-      env:
-        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-        GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
-      run: docker compose -f docker-compose.test.yml run --rm test
-```
-
-## Version Matrix
-
-Tested configurations:
-
-| Component | Version | Notes |
-|-----------|---------|-------|
-| Bun | 1.2.9 | Base image |
-| Node.js | 24.x | Required for Gemini CLI |
-| Claude CLI | 2.1.14+ | Install as non-root |
-| Gemini CLI | 0.25.0+ | Install via npm global |
-
-## Example docker-compose.yml
-
-```yaml
-services:
-  test:
-    build:
-      context: .
-      dockerfile: Dockerfile.test
-    environment:
-      - ANTHROPIC_API_KEY
-      - GEMINI_API_KEY
-```
-
-## Related
-
-- [Execution Environment](../SKILL.md#execution-environment) - Main skill docs
-- [Dockerfile.test](../../../../Dockerfile.test) - Reference implementation
diff --git a/.agents/skills/agent-eval-harness/references/eval-concepts.md b/.agents/skills/agent-eval-harness/references/eval-concepts.md
deleted file mode 100644
index e5d6f70..0000000
--- a/.agents/skills/agent-eval-harness/references/eval-concepts.md
+++ /dev/null
@@ -1,229 +0,0 @@
-# Evaluation Concepts
-
-Core concepts for agent evaluation based on Anthropic's "Demystifying Evals for AI Agents" guidance.
-
-## The Non-Determinism Problem
-
-Agents are inherently non-deterministic. A single run doesn't tell you:
-- **Can** the agent solve this problem? (capability)
-- **Will** it reliably solve it every time? (regression safety)
-
-The `trials` command addresses this by running each prompt multiple times.
-
-## pass@k vs pass^k
-
-Two metrics that answer different questions:
-
-| Metric | Formula | Question | Use Case |
-|--------|---------|----------|----------|
-| **pass@k** | `1 - (1-p)^k` | Can the agent ever do this? | Capability evals |
-| **pass^k** | `p^k` | Will it work every time? | Regression evals |
-
-Where `p` = raw pass rate (passes / k trials)
-
-### Example
-
-Run a prompt 5 times, 3 pass (60% raw pass rate):
-
-| Metric | Calculation | Result | Interpretation |
-|--------|-------------|--------|----------------|
-| **pass@5** | `1 - (0.4)^5` | **99.0%** | Agent can likely do this task |
-| **pass^5** | `(0.6)^5` | **7.8%** | Not reliable for production |
-
-### When to Use Each
-
-**pass@k (Capability Evals):**
-- Exploring what an agent can do
-- Testing new features or prompts
-- Higher k values (5-10) for thorough assessment
-- Accept if pass@k > 90%
-
-**pass^k (Regression Evals):**
-- CI/CD pipelines
-- Known-good tasks that must always work
-- Lower k values (3-5) for efficiency
-- Reject if pass^k < 80%
-
-## Capability vs Regression Evals
-
-| Eval Type | Starting Point | Goal | Metric |
-|-----------|----------------|------|--------|
-| **Capability** | Low pass rates | Find agent's limits | pass@k |
-| **Regression** | ~100% pass rates | Catch degradation | pass^k |
-
-### Capability Eval Workflow
-
-```bash
-# Run many trials to assess capability
-agent-eval-harness trials new-prompts.jsonl bunx agent -k 10 --grader ./grader.ts -o capability.jsonl
-
-# Analyze results
-cat capability.jsonl | jq 'select(.passAtK > 0.9) | {id, passAtK}'
-```
-
-Questions answered:
-- What tasks can the agent handle?
-- Where are the capability boundaries?
-- Which prompts need refinement?
-
-### Regression Eval Workflow
-
-```bash
-# Run fewer trials for known-good tasks
-agent-eval-harness trials regression-suite.jsonl bunx agent -k 3 --grader ./grader.ts -o regression.jsonl
-
-# Fail CI if reliability drops
-cat regression.jsonl | jq -e 'all(.passExpK > 0.8)'
-```
-
-Questions answered:
-- Did the latest change break anything?
-- Is the agent still reliable on known tasks?
-- Which tasks became flaky?
-
-## Grader Calibration
-
-### The Problem
-
-When your eval says an agent "failed", there are two possibilities:
-
-| Reality | What Happened |
-|---------|---------------|
-| Agent failed | The agent did the wrong thing |
-| **Grader failed** | The agent did the right thing, but your grader didn't recognize it |
-
-Without calibration, you can't tell which.
-
-### Why Calibration Matters
-
-Imagine tracking agent performance:
-
-```
-Week 1: 70% pass rate
-Week 2: 65% pass rate  ← "Agent got worse!"
-Week 3: 60% pass rate  ← "Something is broken!"
-```
-
-But what if your grader is rejecting valid solutions?
-
-```
-Week 1: 70% grader pass → 70% actually correct
-Week 2: 65% grader pass → 75% actually correct (grader rejected 10% valid)
-Week 3: 60% grader pass → 80% actually correct (grader rejected 20% valid)
-```
-
-**The agent is improving, but your grader can't see it.**
-
-### Using the Calibrate Command
-
-```bash
-# Sample 10 failures for human review
-agent-eval-harness calibrate results.jsonl --sample 10 -o calibration.md
-```
-
-Review the markdown output and label each sample:
-- **Valid failure** - Agent was actually wrong
-- **Grader bug** - Agent was correct, grader too strict
-- **Ambiguous** - Unclear
-
-### When to Calibrate
-
-| Situation | Calibrate? |
-|-----------|------------|
-| Building a new grader | Yes - validate it works |
-| Pass rate suddenly drops | Yes - is it agent or grader? |
-| Agent uses unexpected approach | Yes - grader might not recognize it |
-| Grader is simple string match | Yes - likely too strict |
-| Well-tested LLM judge | Maybe - periodic spot checks |
-
-### Grader Bug Examples
-
-**Too strict (exact match):**
-```typescript
-// Bad: rejects "Dario Amodei is the CEO" when hint is "Dario Amodei"
-const pass = output === hint
-```
-
-**Better (contains check):**
-```typescript
-// Good: accepts variations
-const pass = output.toLowerCase().includes(hint.toLowerCase())
-```
-
-**Best (semantic match via LLM):**
-```typescript
-// Best: understands meaning, not just text
-const pass = await llmJudge({ output, hint })
-```
-
-## Reference Solutions
-
-### Purpose
-
-Reference solutions prove a task is solvable before blaming the agent.
-
-**Prompt file with reference:**
-```jsonl
-{"id":"test-001","input":"Create a button component","hint":"<button>","reference":"export const Button = () => <button>Click</button>"}
-```
-
-### Validation Workflow
-
-```bash
-# Check that reference solutions pass your grader
-agent-eval-harness validate-refs prompts.jsonl --grader ./grader.ts -o validation.jsonl
-
-# If references fail, your grader or task is broken
-cat validation.jsonl | jq 'select(.pass == false)'
-```
-
-### Why This Matters
-
-If your reference solution fails your own grader:
-- The task definition is ambiguous
-- The grader is too strict
-- The hint is wrong
-
-Fix the eval before evaluating the agent.
-
-## Test Set Balance
-
-### The Problem
-
-An eval with only "make X work" misses "don't break Y".
-
-**Unbalanced:**
-- 50 prompts: "Add feature X"
-- 0 prompts: "Don't break existing feature Y"
-
-### Using the Balance Command
-
-```bash
-agent-eval-harness balance prompts.jsonl -o balance.json
-```
-
-Analyzes:
-- Category distribution (from `metadata.category`)
-- Positive/negative case ratio
-- Coverage gaps
-
-### Balanced Eval Design
-
-Include both positive and negative cases:
-
-| Type | Example | Purpose |
-|------|---------|---------|
-| Positive | "Add a login button" | Agent should succeed |
-| Negative | "Add a button without breaking tests" | Agent should not break things |
-| Edge case | "Handle empty input gracefully" | Agent should be robust |
-
-## Summary
-
-| Concept | Command | Key Insight |
-|---------|---------|-------------|
-| Non-determinism | `trials` | Run multiple times to measure reliability |
-| pass@k | `trials -k N` | Capability: can agent do this? |
-| pass^k | `trials -k N` | Regression: will it always work? |
-| Calibration | `calibrate` | Validate grader, not just agent |
-| Reference validation | `validate-refs` | Prove tasks are solvable |
-| Balance | `balance` | Cover positive + negative cases |
diff --git a/.agents/skills/agent-eval-harness/references/inline-graders.md b/.agents/skills/agent-eval-harness/references/inline-graders.md
deleted file mode 100644
index f504a90..0000000
--- a/.agents/skills/agent-eval-harness/references/inline-graders.md
+++ /dev/null
@@ -1,711 +0,0 @@
-# Inline Graders
-
-Inline graders score individual agent outputs in isolation. Each input/output pair is graded independently, producing a pass/fail result with a score.
-
-## Grader Interface
-
-```typescript
-import type { Grader } from '@plaited/agent-eval-harness/schemas'
-
-type GraderInput = {
-  input: string | string[]      // Original prompt(s)
-  output: string                // Agent output
-  hint?: string                 // Grader context/expectation
-  trajectory?: TrajectoryStep[] // Execution trace
-  metadata?: Record<string, unknown> // Optional metadata from prompt
-  cwd?: string                  // Working directory (for git-based grading)
-}
-
-type GraderResult = {
-  pass: boolean                 // Did it pass?
-  score: number                // 0.0 to 1.0
-  reasoning?: string           // Explanation
-  outcome?: Record<string, unknown> // Structured outcome data (merged onto result)
-}
-```
-
-## Building an Inline Grader
-
-### Step 1: Export the Schema
-
-Get the JSON Schema for validation in any language:
-
-```bash
-agent-eval-harness schemas GraderResult --json -o grader-result.json
-```
-
-### Step 2: Create the Grader
-
-**TypeScript (recommended):**
-
-```typescript
-// my-grader.ts
-import type { Grader } from '@plaited/agent-eval-harness/schemas'
-
-export const grade: Grader = async ({ input, output, hint, trajectory, metadata }) => {
-  // Your scoring logic here
-  const pass = evaluateOutput(output, hint)
-
-  return {
-    pass,
-    score: pass ? 1.0 : 0.0,
-    reasoning: pass ? 'Meets criteria' : 'Does not meet criteria'
-  }
-}
-
-const evaluateOutput = (output: string, hint?: string): boolean => {
-  if (!hint) return true
-  return output.toLowerCase().includes(hint.toLowerCase())
-}
-```
-
-**Python:**
-
-```python
-#!/usr/bin/env python3
-import json
-import sys
-
-data = json.load(sys.stdin)
-output = data.get("output", "").lower()
-hint = (data.get("hint") or "").lower()
-
-pass_result = hint in output if hint else True
-
-print(json.dumps({
-    "pass": pass_result,
-    "score": 1.0 if pass_result else 0.0,
-    "reasoning": "Contains hint" if pass_result else "Missing hint"
-}))
-```
-
-### Step 3: Use the Grader
-
-```bash
-# With capture command
-agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./my-grader.ts -o results.jsonl
-
-# With grade command (pipeline)
-agent-eval-harness grade extracted.jsonl --grader ./my-grader.ts -o graded.jsonl
-
-# With trials command
-agent-eval-harness trials prompts.jsonl --schema ./claude.json -k 5 --grader ./my-grader.ts -o trials.jsonl
-```
-
-## Git-Based Outcome Grading
-
-The most powerful grading pattern for coding agents: use Git to detect actual environmental changes, not just check the agent's output text.
-
-### Why Git-Based Grading?
-
-**Grade outcomes, not paths.** Anthropic's eval framework emphasizes grading final environmental state, not procedural steps. Git provides the perfect oracle:
-
-- **Universal** - Works in any git repo, any language
-- **Precise** - Shows exactly what files changed
-- **Zero config** - No complex outcome schemas needed
-- **Debuggable** - `git diff` shows what happened
-
-### The `cwd` Parameter
-
-Graders receive an optional `cwd` parameter - the working directory where the agent executed:
-
-```typescript
-export const grade: Grader = async ({ input, output, hint, trajectory, metadata, cwd }) => {
-  // cwd is the session's working directory
-  // Use it to run git commands and detect outcomes
-}
-```
-
-### The `outcome` Field
-
-Graders can return an optional `outcome` field with structured data about what changed:
-
-```typescript
-return {
-  pass: true,
-  score: 1.0,
-  reasoning: 'Created Button.tsx with valid syntax',
-  outcome: {  // ← Optional: structured outcome data
-    filesCreated: ['src/components/Button.tsx'],
-    validSyntax: true,
-    type: 'file_creation'
-  }
-}
-```
-
-The harness merges this `outcome` onto the capture result, making it available for downstream analysis.
-
-### Pattern 1: File Creation
-
-**Task:** "Create a button component"
-
-```typescript
-import type { Grader } from '@plaited/agent-eval-harness/schemas'
-
-export const grade: Grader = async ({ output, hint, cwd }) => {
-  if (!cwd) {
-    return { pass: false, score: 0, reasoning: 'No working directory provided' }
-  }
-
-  // Detect what files were created using git
-  const status = await Bun.$`git -C ${cwd} status --porcelain`.text()
-  
-  const filesCreated = status
-    .split('\n')
-    .filter(line => line.startsWith('??'))  // ?? = untracked files
-    .map(line => line.slice(3))
-  
-  const buttonFileCreated = filesCreated.some(f => 
-    f.toLowerCase().includes('button')
-  )
-  
-  // Check if file has valid syntax
-  let validSyntax = true
-  if (buttonFileCreated) {
-    const tscCheck = await Bun.$`cd ${cwd} && npx tsc --noEmit`.nothrow()
-    validSyntax = tscCheck.exitCode === 0
-  }
-  
-  return {
-    pass: buttonFileCreated && validSyntax,
-    score: (buttonFileCreated ? 0.5 : 0) + (validSyntax ? 0.5 : 0),
-    reasoning: `Files created: ${filesCreated.join(', ')}. Valid syntax: ${validSyntax}`,
-    outcome: {  // ← Structured outcome for analysis
-      filesCreated,
-      validSyntax,
-      type: 'file_creation'
-    }
-  }
-}
-```
-
-### Pattern 2: Test Fixing
-
-**Task:** "Fix the failing tests in auth.spec.ts"
-
-```typescript
-export const grade: Grader = async ({ output, cwd }) => {
-  if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' }
-  
-  // Run tests to verify they pass
-  const testResult = await Bun.$`cd ${cwd} && bun test`.nothrow()
-  const testsPassed = testResult.exitCode === 0
-  
-  // Check what files were modified
-  const diff = await Bun.$`git -C ${cwd} diff --name-only`.text()
-  const filesModified = diff.split('\n').filter(Boolean)
-  
-  return {
-    pass: testsPassed,
-    score: testsPassed ? 1 : 0,
-    reasoning: testsPassed 
-      ? `Tests passed. Modified: ${filesModified.join(', ')}`
-      : `Tests failed: ${testResult.stderr.toString().slice(0, 200)}`,
-    outcome: {
-      testsPassed,
-      exitCode: testResult.exitCode,
-      filesModified,
-      type: 'test_execution'
-    }
-  }
-}
-```
-
-### Pattern 3: Non-Breaking Changes
-
-**Task:** "Refactor the authentication flow without breaking existing tests"
-
-```typescript
-export const grade: Grader = async ({ output, cwd }) => {
-  if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' }
-  
-  // Verify tests still pass
-  const testResult = await Bun.$`cd ${cwd} && bun test`.nothrow()
-  const testsPassed = testResult.exitCode === 0
-  
-  // Check what files changed
-  const diff = await Bun.$`git -C ${cwd} diff --name-only`.text()
-  const changedFiles = diff.split('\n').filter(Boolean)
-  
-  // Define critical files that shouldn't be touched
-  const touchedCriticalFiles = changedFiles.some(f => 
-    f.includes('package.json') || 
-    f.includes('tsconfig.json') ||
-    f.includes('.env')
-  )
-  
-  return {
-    pass: testsPassed && !touchedCriticalFiles,
-    score: testsPassed ? (touchedCriticalFiles ? 0.5 : 1) : 0,
-    reasoning: `Tests: ${testsPassed ? 'pass' : 'fail'}. Changed: ${changedFiles.join(', ')}. Critical files touched: ${touchedCriticalFiles}`,
-    outcome: {
-      testsPassed,
-      filesModified: changedFiles,
-      touchedCriticalFiles,
-      type: 'refactoring_safety'
-    }
-  }
-}
-```
-
-### Pattern 4: Code Quality
-
-**Task:** "Add TypeScript types to the API functions"
-
-```typescript
-export const grade: Grader = async ({ output, cwd }) => {
-  if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' }
-  
-  // Check type errors before
-  const beforeErrors = await Bun.$`cd ${cwd} && git stash && npx tsc --noEmit 2>&1 | grep -c "error TS" || echo 0`.text()
-  await Bun.$`cd ${cwd} && git stash pop`.nothrow()
-  
-  // Check type errors after
-  const afterErrors = await Bun.$`cd ${cwd} && npx tsc --noEmit 2>&1 | grep -c "error TS" || echo 0`.text()
-  
-  const errorsBefore = parseInt(beforeErrors.trim())
-  const errorsAfter = parseInt(afterErrors.trim())
-  const improved = errorsAfter < errorsBefore
-  
-  return {
-    pass: improved,
-    score: Math.max(0, (errorsBefore - errorsAfter) / errorsBefore),
-    reasoning: `Type errors: ${errorsBefore} → ${errorsAfter}`,
-    outcome: {
-      errorsBefore,
-      errorsAfter,
-      improved,
-      type: 'code_quality'
-    }
-  }
-}
-```
-
-### Pattern 5: Build Success
-
-**Task:** "Fix the build errors"
-
-```typescript
-export const grade: Grader = async ({ output, cwd }) => {
-  if (!cwd) return { pass: false, score: 0, reasoning: 'No cwd' }
-  
-  // Try to build
-  const buildResult = await Bun.$`cd ${cwd} && bun run build`.nothrow()
-  const buildSucceeded = buildResult.exitCode === 0
-  
-  // Check what was changed
-  const diff = await Bun.$`git -C ${cwd} diff --stat`.text()
-  
-  return {
-    pass: buildSucceeded,
-    score: buildSucceeded ? 1 : 0,
-    reasoning: buildSucceeded 
-      ? `Build succeeded. Changes:\n${diff}`
-      : `Build failed: ${buildResult.stderr.toString().slice(0, 300)}`,
-    outcome: {
-      buildSucceeded,
-      exitCode: buildResult.exitCode,
-      diffStat: diff,
-      type: 'build_verification'
-    }
-  }
-}
-```
-
-### Fallback for Non-Git Repos
-
-Always check if git is available before using git commands:
-
-```typescript
-export const grade: Grader = async ({ output, hint, cwd }) => {
-  // Check if we're in a git repo
-  if (cwd) {
-    const isGit = await Bun.$`git -C ${cwd} rev-parse --git-dir 2>/dev/null`.nothrow()
-    
-    if (isGit.exitCode === 0) {
-      // Use git-based grading
-      const status = await Bun.$`git -C ${cwd} status --porcelain`.text()
-      // ... git-based logic
-    }
-  }
-  
-  // Fall back to output-based grading
-  const pass = hint ? output.toLowerCase().includes(hint.toLowerCase()) : true
-  return {
-    pass,
-    score: pass ? 1 : 0,
-    reasoning: cwd ? 'Git not available, using output matching' : 'No cwd provided'
-  }
-}
-```
-
-### Best Practices for Git-Based Grading
-
-1. **Always check for `cwd`** - It's an optional parameter
-2. **Validate paths for security** - See security notes below
-3. **Use `.nothrow()`** - Don't let failed commands crash the grader
-4. **Grade outcomes, not paths** - Check if tests pass, not which tools were used
-5. **Return structured outcomes** - Makes downstream analysis easier
-6. **Keep repos clean** - Run evals in clean working directories (`git status` should be clean)
-7. **Include reasoning** - Explain what git detected and why it passed/failed
-8. **Handle non-git gracefully** - Provide fallback logic for non-git environments
-
-### Security Considerations
-
-**IMPORTANT:** When using the `cwd` parameter in shell commands, validate paths to prevent command injection.
-
-```typescript
-import { resolve } from 'node:path'
-
-const isValidPath = (path: string): boolean => {
-  // Reject paths with shell metacharacters
-  const dangerousChars = /[;&|`$(){}[\]<>'"\\]/
-  if (dangerousChars.test(path)) {
-    return false
-  }
-  
-  // Reject directory traversal and option injection
-  if (path.includes('..') || path.startsWith('-')) {
-    return false
-  }
-  
-  return true
-}
-
-export const grade: Grader = async ({ cwd }) => {
-  if (!cwd || !isValidPath(cwd)) {
-    return { pass: false, score: 0, reasoning: 'Invalid path' }
-  }
-  
-  // Normalize path to prevent traversal
-  const safeCwd = resolve(cwd)
-  
-  // Now safe to use in shell commands
-  const result = await Bun.$`git -C ${safeCwd} status --porcelain`.text()
-  // ...
-}
-```
-
-**Trust boundary:** The `cwd` parameter typically comes from trusted sources (`process.cwd()`, CLI `--cwd` flag). If accepting paths from untrusted sources (e.g., JSONL metadata), always validate before using in shell commands.
-
-### Git Status Detection Scope
-
-The examples above detect:
-- **Untracked files** (`??`) - New files not yet staged
-- **Modified files** (`M` or ` M`) - Changed tracked files
-
-Not included in basic examples:
-- **Staged files** (`A`) - Files added to index
-- **Renamed files** (`R`) - Files moved/renamed
-- **Deleted files** (`D`) - Files removed
-- **Copied files** (`C`) - Files duplicated
-
-For comprehensive detection, parse all `git status --porcelain` codes. See `git status --help` for complete format specification.
-
-### Performance Note
-
-**Git-based grading has higher latency than output-based grading** because each grader invocation spawns multiple git processes (typically 2-3 per evaluation). For large evaluation batches:
-
-- Output-based grading: ~1-5ms per evaluation
-- Git-based grading: ~50-200ms per evaluation (depending on repo size)
-
-Use git-based grading when environmental outcomes matter more than speed. For high-throughput scenarios, consider batching or caching strategies.
-
-### Outcome Field Benefits
-
-When graders return the `outcome` field, it's merged onto the capture result:
-
-```jsonl
-{
-  "id": "create-button",
-  "input": "Create a button component",
-  "output": "I created Button.tsx with a reusable button component.",
-  "trajectory": [...],
-  "pass": true,
-  "score": 1,
-  "reasoning": "Files created: src/Button.tsx. Valid syntax: true",
-  "outcome": {
-    "filesCreated": ["src/Button.tsx"],
-    "validSyntax": true,
-    "type": "file_creation"
-  }
-}
-```
-
-This enables powerful downstream analysis:
-
-```bash
-# Find all test-fixing tasks
-cat results.jsonl | jq 'select(.outcome.type == "test_execution")'
-
-# Calculate test pass rate
-cat results.jsonl | jq -s 'map(select(.outcome.testsPassed)) | length'
-
-# Identify refactoring tasks that touched critical files
-cat results.jsonl | jq 'select(.outcome.touchedCriticalFiles == true)'
-```
-
-## Grading Patterns
-
-### Hint-Based Matching
-
-Simple pattern for checking if output contains expected content:
-
-```typescript
-export const grade: Grader = async ({ output, hint }) => {
-  if (!hint) {
-    return { pass: true, score: 1.0, reasoning: 'No hint provided' }
-  }
-
-  const contains = output.toLowerCase().includes(hint.toLowerCase())
-  return {
-    pass: contains,
-    score: contains ? 1.0 : 0.0,
-    reasoning: contains ? 'Output contains hint' : 'Output missing hint'
-  }
-}
-```
-
-### Multi-Criteria Scoring
-
-Score based on multiple independent criteria:
-
-```typescript
-export const grade: Grader = async ({ output, hint, trajectory }) => {
-  let score = 0
-  const reasons: string[] = []
-
-  // Criterion 1: Contains hint
-  if (hint && output.toLowerCase().includes(hint.toLowerCase())) {
-    score += 0.4
-    reasons.push('Contains expected content')
-  }
-
-  // Criterion 2: No tool errors
-  const hasErrors = trajectory?.some(s =>
-    s.type === 'tool_call' && s.status === 'failed'
-  )
-  if (!hasErrors) {
-    score += 0.3
-    reasons.push('No tool errors')
-  }
-
-  // Criterion 3: Efficient execution
-  const toolCount = trajectory?.filter(s => s.type === 'tool_call').length ?? 0
-  if (toolCount <= 5) {
-    score += 0.3
-    reasons.push(`Efficient (${toolCount} tools)`)
-  }
-
-  return {
-    pass: score >= 0.7,
-    score,
-    reasoning: reasons.join('; ') || 'Failed all criteria'
-  }
-}
-```
-
-### Metadata-Based Grading
-
-Use metadata for category-specific scoring logic:
-
-```typescript
-export const grade: Grader = async ({ output, hint, metadata }) => {
-  const category = (metadata?.category as string) ?? 'general'
-  const difficulty = (metadata?.difficulty as string) ?? 'medium'
-
-  // Apply different criteria based on category
-  if (category === 'code') {
-    // Code tasks require syntax validation
-    const hasCodeBlock = /```[\s\S]*?```/.test(output)
-    if (!hasCodeBlock) {
-      return { pass: false, score: 0.0, reasoning: 'Code category requires code block' }
-    }
-  } else if (category === 'web-search') {
-    // Web search tasks require sources
-    const hasSources = /source:/i.test(output) || /https?:\/\//.test(output)
-    if (!hasSources) {
-      return { pass: false, score: 0.5, reasoning: 'Web search should cite sources' }
-    }
-  }
-
-  // Adjust score threshold by difficulty
-  const baseScore = hint ? (output.toLowerCase().includes(hint.toLowerCase()) ? 1.0 : 0.0) : 1.0
-  const threshold = difficulty === 'hard' ? 0.9 : difficulty === 'easy' ? 0.6 : 0.7
-
-  return {
-    pass: baseScore >= threshold,
-    score: baseScore,
-    reasoning: `Category: ${category}, Difficulty: ${difficulty}`
-  }
-}
-```
-
-### Trajectory-Based Grading
-
-Analyze the execution path, not just the output:
-
-```typescript
-export const grade: Grader = async ({ trajectory }) => {
-  const toolCalls = trajectory?.filter(s => s.type === 'tool_call') ?? []
-
-  // Check for required tool usage
-  const usedWrite = toolCalls.some(t => t.name === 'Write')
-  const usedRead = toolCalls.some(t => t.name === 'Read')
-
-  if (!usedWrite || !usedRead) {
-    return {
-      pass: false,
-      score: 0.0,
-      reasoning: `Missing required tools: ${!usedWrite ? 'Write' : ''} ${!usedRead ? 'Read' : ''}`
-    }
-  }
-
-  return {
-    pass: true,
-    score: 1.0,
-    reasoning: 'Used required Read and Write tools'
-  }
-}
-```
-
-### LLM-as-Judge
-
-Use an LLM for semantic evaluation:
-
-```typescript
-import Anthropic from '@anthropic-ai/sdk'
-import type { Grader } from '@plaited/agent-eval-harness/schemas'
-
-const client = new Anthropic()
-
-export const grade: Grader = async ({ input, output, hint }) => {
-  const response = await client.messages.create({
-    model: 'claude-sonnet-4-20250514',
-    max_tokens: 256,
-    messages: [{
-      role: 'user',
-      content: `Evaluate this agent output.
-
-Task: ${Array.isArray(input) ? input.join(' → ') : input}
-${hint ? `Expected: ${hint}` : ''}
-
-Agent output:
-${output}
-
-Did the agent correctly complete the task? Respond as JSON only:
-{"pass": true/false, "score": 0.0-1.0, "reasoning": "brief explanation"}`
-    }]
-  })
-
-  const text = response.content[0]?.type === 'text' ? response.content[0].text : ''
-
-  try {
-    return JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] ?? '{}')
-  } catch {
-    return { pass: false, score: 0, reasoning: 'Failed to parse LLM response' }
-  }
-}
-```
-
-## Detection Logic
-
-The harness determines grader type by file extension:
-
-| Extension | Treatment |
-|-----------|-----------|
-| `.ts`, `.js`, `.mjs`, `.cjs` | Import as ES module |
-| Everything else (`.py`, `.sh`, etc.) | Execute as subprocess |
-
-## Executable Protocol
-
-For non-JavaScript graders, use stdin/stdout JSON:
-
-**Input (stdin):**
-```json
-{
-  "input": "Find the CEO of Anthropic",
-  "output": "The CEO of Anthropic is Dario Amodei.",
-  "hint": "Dario Amodei",
-  "trajectory": [...],
-  "metadata": {"category": "web-search", "difficulty": "easy"},
-  "cwd": "/path/to/working/directory"
-}
-```
-
-**Output (stdout):**
-```json
-{
-  "pass": true,
-  "score": 1.0,
-  "reasoning": "Output contains expected name",
-  "outcome": {
-    "method": "semantic_match",
-    "confidence": 0.95
-  }
-}
-```
-
-**Exit codes:**
-- `0` = Success (result parsed from stdout)
-- Non-zero = Error (stderr used for error message)
-
-## Testing Graders
-
-Test independently before using with the harness:
-
-```bash
-# TypeScript
-echo '{"input":"test","output":"hello world","hint":"world"}' | bun run ./my-grader.ts
-
-# Python
-echo '{"input":"test","output":"hello world","hint":"world"}' | ./grader.py
-
-# Shell
-echo '{"input":"test","output":"hello world","hint":"world"}' | ./grader.sh
-```
-
-## Commands That Support Inline Graders
-
-| Command | Flag | Purpose |
-|---------|------|---------|
-| `capture` | `--grader` | Add score to each result |
-| `trials` | `--grader` | Compute pass@k, pass^k metrics |
-| `grade` | `--grader` | Score extracted results (pipeline) |
-| `calibrate` | `--grader` | Re-score samples with different grader |
-| `validate-refs` | `--grader` | Check reference solutions |
-
-## Best Practices
-
-### Grade Outcomes, Not Paths
-
-**✅ Do: Grade final environmental state**
-- Did the tests pass?
-- Was the file created with valid syntax?
-- Is the answer semantically correct?
-- Does the build succeed?
-
-**❌ Don't: Grade procedural steps**
-- Don't require specific tool usage ("must use WebSearch")
-- Don't enforce reasoning patterns ("must think step-by-step")
-- Don't mandate particular approaches ("must read file before editing")
-
-**Why this matters:** Agents should be free to find novel solutions. If the outcome is correct, the path doesn't matter. Use git-based grading to check environmental changes, not trajectory inspection to enforce procedures.
-
-### Other Best Practices
-
-1. **Grade in isolation** - Each input/output should be scored independently
-2. **Deterministic scoring** - Same input should always produce same score
-3. **Always return valid JSON** - Use `JSON.stringify()` or `json.dumps()`
-4. **Handle missing fields** - `hint`, `trajectory`, and `cwd` may be undefined
-5. **Include reasoning** - Helps debug failures during calibration
-6. **Test independently** - Validate grader before running full eval
-7. **Keep graders simple** - Complex logic is hard to debug and calibrate
-8. **Use git for outcomes** - Let git detect file changes instead of parsing output text
-9. **Return structured outcomes** - Makes downstream analysis and aggregation easier
-
-## Related Documentation
-
-- [comparison-graders.md](comparison-graders.md) - Multi-run comparison graders
-- [calibration.md](calibration.md) - Grader calibration workflow
-- [eval-concepts.md](eval-concepts.md) - pass@k, pass^k metrics
diff --git a/.agents/skills/agent-eval-harness/references/output-formats.md b/.agents/skills/agent-eval-harness/references/output-formats.md
deleted file mode 100644
index 325b907..0000000
--- a/.agents/skills/agent-eval-harness/references/output-formats.md
+++ /dev/null
@@ -1,261 +0,0 @@
-# Output Formats
-
-The harness uses a "capture once, derive many views" approach. The `capture` command produces full trajectory JSONL, and derived views are created with separate commands.
-
-## Capture Output (Full Trajectory)
-
-The `capture` command always outputs full trajectory JSONL:
-
-```bash
-agent-eval-harness capture prompts.jsonl --schema ./claude-headless.json -o results.jsonl
-```
-
-### Schema
-
-```typescript
-type CaptureResult = {
-  id: string                      // Prompt identifier
-  input: string | string[]        // Single prompt or multi-turn conversation
-  output: string                  // Final agent response
-  hint?: string                   // Grader context (if provided in prompt)
-  trajectory: TrajectoryStep[]    // Full execution trajectory
-  metadata: Record<string, unknown>  // Prompt metadata
-  timing: {
-    start: number                 // Unix timestamp (ms)
-    end: number                   // Unix timestamp (ms)
-    firstResponse?: number        // Time to first response (ms)
-    sessionCreation: number       // Time to create session (ms)
-    total: number                 // Total duration (end - start, ms)
-    inputTokens?: number          // Input tokens consumed (if available)
-    outputTokens?: number         // Output tokens generated (if available)
-  }
-  toolErrors: boolean             // Whether any tool calls failed
-  errors?: string[]               // Error messages (if any)
-  score?: GraderResult            // Grader score (if grader was provided)
-}
-
-type TrajectoryStep =
-  | { type: 'thought'; content: string; timestamp: number; stepId?: string }
-  | { type: 'message'; content: string; timestamp: number; stepId?: string }
-  | {
-      type: 'tool_call'
-      name: string              // Tool title
-      status: string            // pending, in_progress, completed, failed
-      input?: unknown           // Raw input parameters
-      output?: unknown          // Raw output
-      duration?: number         // Execution time (ms)
-      timestamp: number
-      stepId?: string
-    }
-  | { type: 'plan'; entries: unknown[]; timestamp: number; stepId?: string }
-
-type GraderResult = {
-  pass: boolean
-  score: number                 // 0.0 to 1.0
-  reasoning?: string
-}
-```
-
-### Example Output
-
-```jsonl
-{"id":"test-001","input":"Create a primary button","output":"I created the button in src/button.tsx","trajectory":[{"type":"thought","content":"I'll create a styled button template","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"..."},"output":"File written","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"toolErrors":false}
-```
-
-## Summary Format
-
-The `summarize` command derives compact JSONL from full trajectory:
-
-```bash
-agent-eval-harness summarize results.jsonl -o summary.jsonl
-```
-
-### Schema
-
-```typescript
-type SummaryResult = {
-  id: string                    // Prompt identifier
-  input: string                 // Original prompt text
-  output: string                // Final agent response
-  toolCalls: string[]           // List of tool names used
-  duration: number              // Total execution time (ms)
-}
-```
-
-### Example Output
-
-```jsonl
-{"id":"test-001","input":"Create a primary button","output":"I created the button in src/button.tsx","toolCalls":["Write"],"duration":1234}
-{"id":"test-002","input":"Fix the TypeScript error","output":"I fixed the type error...","toolCalls":["Read","Edit"],"duration":2567}
-```
-
-### Analysis with jq
-
-```bash
-# Calculate average duration
-cat summary.jsonl | jq -s 'map(.duration) | add / length'
-
-# Count tool usage
-cat summary.jsonl | jq -s 'map(.toolCalls) | flatten | group_by(.) | map({tool: .[0], count: length})'
-
-# Filter by output content
-cat summary.jsonl | jq 'select(.output | contains("error"))'
-```
-
-## Markdown Format
-
-The `summarize` command can also produce markdown for LLM-as-judge workflows:
-
-```bash
-agent-eval-harness summarize results.jsonl --markdown -o results.md
-```
-
-### Structure
-
-```markdown
-## Evaluation Record: <id>
-
-**Input:** <original prompt>
-
-**Trajectory:**
-1. [THOUGHT] <truncated content> [->stepId]
-2. [TOOL:<name>] -> <status> (<duration>ms) [->stepId]
-   File: <path> (<size> chars)
-   ```<ext>
-   <head lines>
-
-   // ... N lines omitted ...
-
-   <tail lines>
-   ```
-3. [PLAN] <plan summary> [->stepId]
-4. [MESSAGE] <truncated content> [->stepId]
-
-**Output:** <truncated final output>
-**Metadata:** category=ui, agent=claude-headless, ...
-**Tool Errors:** false
-**Duration:** <ms>ms
-
----
-```
-
-**Step ID Format:** `<prompt-id>-step-<N>` (e.g., `test-001-step-2`)
-
-**Truncation Rules:**
-- Thought/message content: First 100 characters
-- Output: First 200 characters
-- Code preview: Head (8 lines) + tail (4 lines) for files > 12 lines
-
-## Trials Output
-
-The `trials` command produces per-prompt trial results:
-
-```bash
-agent-eval-harness trials prompts.jsonl --schema ./claude-headless.json -k 5 --grader ./grader.ts -o trials.jsonl
-```
-
-### Schema
-
-```typescript
-type TrialResult = {
-  id: string                    // Prompt identifier
-  input: string                 // Original prompt text
-  hint?: string                 // Grader context (if provided)
-  k: number                     // Number of trials
-  passRate?: number             // passes / k (with grader only)
-  passAtK?: number              // 1 - (1-passRate)^k (with grader only)
-  passExpK?: number             // passRate^k (with grader only)
-  trials: TrialEntry[]          // Individual trial results
-}
-
-type TrialEntry = {
-  trialNum: number              // Trial number (1-indexed)
-  output: string                // Agent output for this trial
-  trajectory: TrajectoryStep[]  // Full trajectory for this trial
-  duration: number              // Duration in milliseconds
-  pass?: boolean                // Pass/fail (if grader provided)
-  score?: number                // Numeric score (if grader provided)
-  reasoning?: string            // Grader reasoning (if grader provided)
-}
-```
-
-### Example (Without Grader)
-
-```jsonl
-{"id":"search-001","input":"Find the CEO of Anthropic","k":5,"trials":[{"trialNum":1,"output":"Dario Amodei...","trajectory":[...],"duration":1234},{"trialNum":2,"output":"The CEO is Dario...","trajectory":[...],"duration":1100},...]}
-```
-
-### Example (With Grader)
-
-```jsonl
-{"id":"search-001","input":"Find the CEO of Anthropic","k":5,"passRate":0.8,"passAtK":0.9997,"passExpK":0.3277,"trials":[{"trialNum":1,"output":"Dario Amodei...","pass":true,"score":1.0,"duration":1234},{"trialNum":2,"output":"I don't know...","pass":false,"score":0.0,"reasoning":"Missing hint content","duration":1100},...]}
-```
-
-## Step-Level Retrieval Pattern
-
-For step-specific analysis, use the step IDs in the trajectory:
-
-```typescript
-// Load results
-const results = (await Bun.file('results.jsonl').text())
-  .trim()
-  .split('\n')
-  .map(line => JSON.parse(line))
-
-// Build step index
-const stepIndex = new Map<string, unknown>()
-for (const result of results) {
-  for (const step of result.trajectory) {
-    stepIndex.set(step.stepId, step)
-  }
-}
-
-// Retrieve specific step by ID
-const stepId = 'test-001-step-2'  // From markdown [->stepId]
-const fullStep = stepIndex.get(stepId)
-console.log(fullStep.input)  // Complete tool input
-```
-
-## toolErrors Field
-
-The `toolErrors` field indicates whether any tool calls failed during execution:
-
-| `toolErrors` | Meaning |
-|--------------|---------|
-| `false` | All tool calls completed successfully |
-| `true` | One or more tool calls had `status: 'failed'` |
-
-**Note:** `toolErrors` only indicates tool-level failures. For semantic pass/fail (did the agent accomplish the task?), use a grader:
-
-```bash
-agent-eval-harness capture prompts.jsonl --schema ./claude-headless.json --grader ./grader.ts -o results.jsonl
-```
-
-## Input Format
-
-All commands accept the same JSONL input:
-
-```jsonl
-{"id":"test-001","input":"Create a primary button","hint":"should contain <button>","metadata":{"category":"ui"}}
-```
-
-| Field | Required | Description |
-|-------|----------|-------------|
-| `id` | Yes | Unique identifier |
-| `input` | Yes | Prompt text for the agent |
-| `hint` | No | Grader context - what to look for |
-| `reference` | No | Reference solution (for validate-refs) |
-| `metadata` | No | Tags, category, difficulty for filtering |
-| `timeout` | No | Override default timeout for this prompt |
-
-## Streaming Behavior
-
-All commands stream output line-by-line as results complete:
-
-```bash
-# Watch results in real-time
-agent-eval-harness capture prompts.jsonl --schema ./claude-headless.json --progress -o results.jsonl &
-tail -f results.jsonl
-```
-
-Use `--append` to continue interrupted runs without overwriting previous results.
diff --git a/.agents/skills/headless-adapters/SKILL.md b/.agents/skills/headless-adapters/SKILL.md
deleted file mode 100644
index 21392de..0000000
--- a/.agents/skills/headless-adapters/SKILL.md
+++ /dev/null
@@ -1,144 +0,0 @@
----
-name: headless-adapters
-description: Discover, create, and validate headless adapters for agent integration. Includes scaffolding tools and schema-driven compliance testing.
-compatibility: Bun >= 1.2.9
----
-
-# Headless Adapters
-
-## Purpose
-
-Schema-driven adapter for headless CLI agents. **No code required** - just define a JSON schema describing how to interact with the CLI.
-
-| Use Case | Tool |
-|----------|------|
-| Wrap headless CLI agent | `headless` command |
-| Create new schemas | [Schema Creation Guide](references/schema-creation-guide.md) |
-
-## Quick Start
-
-1. **Create a schema** for your CLI agent using the [Schema Creation Guide](references/schema-creation-guide.md)
-2. **Run the adapter:**
-   ```bash
-   bunx @plaited/agent-eval-harness headless --schema ./my-agent-headless.json
-   ```
-
-Any CLI agent that outputs JSON can be wrapped — no agent-specific code required.
-
-## CLI Commands
-
-### headless
-
-Schema-driven adapter for ANY headless CLI agent.
-
-```bash
-bunx @plaited/agent-eval-harness headless --schema <path>
-```
-
-**Options:**
-| Flag | Description | Required |
-|------|-------------|----------|
-| `-s, --schema` | Path to adapter schema (JSON) | Yes |
-
-**Schema Format:**
-
-```json
-{
-  "version": 1,
-  "name": "my-agent",
-  "command": ["my-agent-cli"],
-  "sessionMode": "stream",
-  "prompt": { "flag": "-p" },
-  "output": { "flag": "--output-format", "value": "stream-json" },
-  "autoApprove": ["--allow-all"],
-  "outputEvents": [
-    {
-      "match": { "path": "$.type", "value": "message" },
-      "emitAs": "message",
-      "extract": { "content": "$.text" }
-    },
-    {
-      "match": { "path": "$.type", "value": "tool_use" },
-      "emitAs": "tool_call",
-      "extract": { "title": "$.name", "status": "'pending'", "input": "$.input" }
-    },
-    {
-      "match": { "path": "$.type", "value": "tool_result" },
-      "emitAs": "tool_call",
-      "extract": { "title": "$.name", "status": "'completed'", "output": "$.content" }
-    }
-  ],
-  "result": {
-    "matchPath": "$.type",
-    "matchValue": "result",
-    "contentPath": "$.content"
-  }
-}
-```
-
-**Session Modes:**
-
-| Mode | Description | Use When |
-|------|-------------|----------|
-| `stream` | Keep process alive, multi-turn via stdin | CLI supports session resume |
-| `iterative` | New process per turn, accumulate history | CLI is stateless |
-
-## Creating a Schema
-
-1. Run the CLI's `--help` to identify prompt, output format, and auto-approve flags
-2. Capture sample JSON output from the CLI
-3. Map JSONPath patterns to output events (including `input`/`output` for tool calls)
-4. Create the schema file
-5. Test with `headless` command
-
-See [Schema Creation Guide](references/schema-creation-guide.md) for the complete workflow.
-
-## Security Considerations
-
-### Trust Boundary: CLI Output is Untrusted
-
-The headless adapter parses JSON output from CLI agents. This output may contain content from **external sources** (web searches, file reads, API responses) that flows into trajectory data:
-
-```
-CLI Agent → JSON stdout → JSONPath extraction → ParsedUpdate → TrajectoryStep
-```
-
-Trajectory fields — especially `tool_call.input` and `tool_call.output` — should be treated as **untrusted content** by downstream consumers (graders, LLM-as-judge, analysis scripts). Do not:
-- Execute trajectory content as code
-- Use trajectory content in unsanitized shell commands
-- Pass trajectory content to LLMs without injection-aware prompting
-
-### autoApprove Flags
-
-The `autoApprove` field bypasses the CLI agent's safety confirmation prompts. Use the **least permissive** flags your evaluation requires:
-
-| Risk Level | Example | When to Use |
-|------------|---------|-------------|
-| **High** | `["--dangerously-skip-permissions"]` | Only in isolated containers (Docker, CI) |
-| **Medium** | `["--allowedTools", "Read,Write,Glob"]` | Scoped to specific tools |
-| **Low** | `["--auto-approve", "read-only"]` | Read-only evaluations |
-
-**Never run high-risk autoApprove flags outside isolated environments.** Use `--workspace-dir` or Docker for evaluations that modify the filesystem.
-
-## Troubleshooting
-
-### Common Issues
-
-| Issue | Likely Cause | Solution |
-|-------|--------------|----------|
-| Tool calls not captured | JSONPath not iterating arrays | Use `[*]` wildcard syntax - [see guide](references/troubleshooting-guide.md#tool-calls-not-appearing) |
-| Tool input/output missing | Extract config missing `input`/`output` fields | Add `input`/`output` paths - [see guide](references/troubleshooting-guide.md#tool-input-output-missing) |
-| "unexpected argument" error | Stdin mode misconfigured | Use `stdin: true` - [see guide](references/troubleshooting-guide.md#stdin-mode-issues) |
-| 401 Authentication errors | API key not properly configured | Set the correct API key environment variable for your agent |
-| Timeout on prompt | JSONPath not matching | Capture raw CLI output, verify paths - [see guide](references/troubleshooting-guide.md#jsonpath-debugging) |
-| Empty responses | Content extraction failing | Check extract paths - [see guide](references/troubleshooting-guide.md#output-event-matching) |
-
-**Complete troubleshooting documentation:** [Troubleshooting Guide](references/troubleshooting-guide.md)
-
-## External Resources
-
-- **AgentSkills Spec**: [agentskills.io](https://agentskills.io)
-
-## Related
-
-- **[agent-eval-harness skill](../agent-eval-harness/SKILL.md)** - Running evaluations against adapters
diff --git a/.agents/skills/headless-adapters/references/schema-creation-guide.md b/.agents/skills/headless-adapters/references/schema-creation-guide.md
deleted file mode 100644
index 3e16486..0000000
--- a/.agents/skills/headless-adapters/references/schema-creation-guide.md
+++ /dev/null
@@ -1,310 +0,0 @@
-# Schema Creation Guide
-
-Step-by-step workflow for creating headless adapter schemas for CLI coding agents.
-
-## Overview
-
-The headless adapter transforms any CLI agent with JSON output into a protocol-compatible adapter. You just need a schema file describing how to interact with the CLI.
-
-## Workflow
-
-```mermaid
-flowchart TD
-    A["1. Explore CLI Help"] --> B["2. Identify Output Format"]
-    B --> C["3. Capture Sample Output"]
-    C --> D["4. Map JSONPath Patterns"]
-    D --> E["5. Create Schema File"]
-    E --> F["6-7. Test with Headless, then Debug Mode"]
-```
-
-### Step 1: Explore CLI Help
-
-Start by examining the CLI's options for non-interactive execution:
-
-```bash
-# Main help
-<agent> --help
-
-# Subcommand help (if applicable)
-<agent> exec --help
-<agent> run --help
-```
-
-**Key flags to identify:**
-
-| Purpose | Common Flags | Notes |
-|---------|--------------|-------|
-| Prompt input | `-p`, `--prompt`, `--message`, positional | How to pass the prompt |
-| Output format | `--output-format`, `-o`, `--json` | JSON streaming mode |
-| Auto-approve | `--auto`, `--skip-permissions`, `--force` | Non-interactive mode |
-| Working directory | `--cwd`, `--directory`, `-C` | Project context |
-| Session resume | `--resume`, `--session-id`, `-s` | Multi-turn support |
-
-### Step 2: Identify Output Format
-
-Most modern agents support JSON streaming. Look for:
-
-- `stream-json` - Newline-delimited JSON objects
-- `json` - Single JSON response or NDJSON
-- `stream-jsonrpc` - JSON-RPC framing
-
-**Example from Droid CLI:**
-```bash
-droid exec --help
-# Options:
-#   -o, --output-format <format>  Output format (default: "text")
-#   ...
-```
-
-### Step 3: Capture Sample Output
-
-Run a simple prompt and capture the JSON structure:
-
-```bash
-# Capture raw output
-AGENT_API_KEY=... <agent> exec -o stream-json "Say hello" > output.jsonl
-
-# Or pipe to jq for formatting
-AGENT_API_KEY=... <agent> exec -o stream-json "Say hello" | jq -c '.'
-```
-
-**Expected patterns:**
-
-```json
-{"type": "message", "content": "Hello!"}
-{"type": "tool_use", "name": "Read", "input": {...}}
-{"type": "tool_result", "name": "Read", "content": "..."}
-{"type": "result", "content": "Task completed"}
-```
-
-### Step 4: Map JSONPath Patterns
-
-Analyze the output to create event mappings:
-
-| JSON Event | Event Type | Extract Fields |
-|------------|---------------|----------------|
-| `{"type": "message", ...}` | `message` | `$.content` |
-| `{"type": "tool_use", ...}` | `tool_call` | `$.name` (title), `"pending"` (status), `$.input` (input) |
-| `{"type": "tool_result", ...}` | `tool_call` | `$.name` (title), `"completed"` (status), `$.content` (output) |
-| `{"type": "result", ...}` | result detection | `$.content` |
-
-**Supported JSONPath syntax:**
-
-| Pattern | Description |
-|---------|-------------|
-| `$.field` | Top-level field |
-| `$.nested.field` | Nested field access |
-| `$.array[0].field` | Array index access |
-| `$.array[*]` | Array wildcard (iterate all items) |
-| `$.array[*].field` | Array wildcard with field access |
-| `'literal'` | Static string value |
-
-**Unsupported JSONPath syntax** (silently returns `undefined`):
-
-| Pattern | Issue | Use Instead |
-|---------|-------|-------------|
-| `$.array[?(@.type=='x')]` | Filter expressions not supported | Wildcard match `[*]` + item-relative extract |
-| `$..field` | Recursive descent not supported | Explicit path to field |
-| `$.array[0,1]` | Multi-index not supported | Separate mappings or `[*]` |
-
-#### Extract Fields Reference
-
-The `extract` object maps JSONPath expressions to `ParsedUpdate` fields. These five fields are consumed by the output parser:
-
-| Field | Type Coercion | Purpose | Used By |
-|-------|---------------|---------|---------|
-| `content` | `→ string` | Main text content | message, thought |
-| `title` | `→ string` | Tool name / identifier | tool_call |
-| `status` | `→ string` | Tool status (`'pending'`, `'completed'`) | tool_call |
-| `input` | preserved as object | Tool input arguments | tool_call |
-| `output` | preserved as object | Tool result content | tool_call (on completion) |
-
-**Key difference:** `input` and `output` preserve native types (objects, arrays, strings) for downstream grader inspection. The other three fields coerce values to strings.
-
-Additional string-valued fields (e.g., `toolName`, `mcpServer`) are allowed and preserved during schema validation but are not consumed by the parser.
-
-#### Array Wildcard Matching
-
-When tool events are nested in arrays, use `[*]` in the **match** path. Extract paths are then **relative to the matched item**, not the root event:
-
-```json
-{
-  "match": { "path": "$.message.content[*].type", "value": "tool_use" },
-  "emitAs": "tool_call",
-  "extract": {
-    "title": "$.name",
-    "input": "$.input"
-  }
-}
-```
-
-Given `{"message": {"content": [{"type": "tool_use", "name": "Read", "input": {...}}]}}`:
-- Match iterates `content[]`, finds item with `type == "tool_use"`
-- Extract evaluates `$.name` against that **item** → `"Read"`
-- Extract evaluates `$.input` against that **item** → `{...}` (preserved as object)
-
-### Step 5: Create Schema File
-
-Create a new schema file for your agent using the template below. Adapt the command, flags, and JSONPath patterns to match your CLI's output:
-
-```json
-{
-  "version": 1,
-  "name": "my-agent-headless",
-  "command": ["my-agent", "exec"],
-  "sessionMode": "stream",
-  "prompt": { "flag": "-p" },
-  "output": { "flag": "-o", "value": "stream-json" },
-  "autoApprove": ["--auto", "high"],
-  "cwdFlag": "--cwd",
-  "resume": { "flag": "-s", "sessionIdPath": "$.session_id" },
-  "outputEvents": [
-    {
-      "match": { "path": "$.type", "value": "message" },
-      "emitAs": "message",
-      "extract": { "content": "$.content" }
-    },
-    {
-      "match": { "path": "$.type", "value": "tool_use" },
-      "emitAs": "tool_call",
-      "extract": { "title": "$.name", "status": "'pending'", "input": "$.input" }
-    },
-    {
-      "match": { "path": "$.type", "value": "tool_result" },
-      "emitAs": "tool_call",
-      "extract": { "title": "$.name", "status": "'completed'", "output": "$.content" }
-    }
-  ],
-  "result": {
-    "matchPath": "$.type",
-    "matchValue": "result",
-    "contentPath": "$.content"
-  }
-}
-```
-
-### Step 6: Test with Headless
-
-Run the headless adapter with your schema:
-
-```bash
-# Test the adapter
-AGENT_API_KEY=... bunx @plaited/agent-eval-harness headless --schema ./my-agent-headless.json
-```
-
-### Step 7: Test with Debug Mode
-
-Use debug mode to verify JSONPath extraction:
-
-```bash
-AGENT_API_KEY=... bunx @plaited/agent-eval-harness headless --schema ./my-agent-headless.json --debug
-```
-
-Debug mode shows:
-- Raw CLI output lines
-- JSONPath match attempts
-- Extracted values for each event
-
-## Schema Field Reference
-
-### Required Fields
-
-| Field | Type | Description |
-|-------|------|-------------|
-| `version` | `1` | Schema version (always 1) |
-| `name` | string | Unique schema identifier |
-| `command` | string[] | CLI command and subcommands |
-| `sessionMode` | `"stream"` \| `"iterative"` | Process lifecycle mode |
-| `prompt` | object | How to pass prompt text |
-| `output` | object | Output format flags |
-| `outputEvents` | array | Event mapping rules |
-| `result` | object | Final result detection |
-
-### Optional Fields
-
-| Field | Type | Description |
-|-------|------|-------------|
-| `autoApprove` | string[] | Flags for non-interactive mode (see [Security](#security-autoapprove)) |
-| `cwdFlag` | string | Working directory flag |
-| `resume` | object | Session resume configuration |
-| `historyTemplate` | string | Template for iterative mode |
-
-### Session Modes
-
-| Mode | When to Use |
-|------|-------------|
-| `stream` | CLI keeps process alive for multi-turn via stdin |
-| `iterative` | CLI is stateless; new process per turn |
-
-### Security: autoApprove {#security-autoapprove}
-
-The `autoApprove` field bypasses the agent's safety confirmation prompts. Choose the **least permissive** option:
-
-| Approach | Example | Risk |
-|----------|---------|------|
-| Scoped tools | `["--allowedTools", "Read,Write"]` | Low — only named tools |
-| Read-only | `["--auto-approve", "read-only"]` | Low — no writes |
-| Full bypass | `["--dangerously-skip-permissions"]` | High — all tools, no confirmation |
-
-**Full bypass flags should only be used in isolated environments** (Docker containers, ephemeral CI runners, `--workspace-dir` sandboxes). Never run them against production filesystems or shared workspaces.
-
-## CLI Documentation Links
-
-> **7 of 8 agents compatible.** The headless adapter requires JSON streaming output.
-
-| Agent | JSON Output Flag | Prompt Flag | CLI Documentation |
-|-------|------------------|-------------|-------------------|
-| Amp | `--stream-json` | `-x` | [ampcode.com/manual#cli](https://ampcode.com/manual#cli) |
-| Codex | `--json` | positional | [developers.openai.com/codex/cli](https://developers.openai.com/codex/cli/) |
-| Cursor | `--output-format stream-json --print` | `-p` | [cursor.com/docs/cli/reference/output-format](https://cursor.com/docs/cli/reference/output-format) |
-| Droid | `-o stream-json` | positional | [docs.factory.ai/cli/droid-exec/overview](https://docs.factory.ai/cli/droid-exec/overview) |
-| Goose | `--output-format stream-json` | `-t` | [block.github.io/goose/.../goose-cli-commands](https://block.github.io/goose/docs/guides/goose-cli-commands/) |
-| Letta | `--output-format stream-json` | `-p` | [docs.letta.com/letta-code/cli-reference](https://docs.letta.com/letta-code/cli-reference/) |
-| OpenCode | `--format json` | positional | [opencode.ai/docs/cli](https://opencode.ai/docs/cli/) |
-
-**Not yet compatible:** [Copilot CLI](https://docs.github.com/en/copilot/concepts/agents/about-copilot-cli) (no JSON output)
-
-## Troubleshooting
-
-### Common Issues
-
-| Issue | Solution |
-|-------|----------|
-| Timeout waiting for result | Check `result.matchPath/matchValue` matches actual output |
-| No updates received | Verify `outputEvents` patterns match JSON structure |
-| Process exits immediately | Check `command` includes all required subcommands |
-| Authentication errors | Verify API key environment variable is set |
-
-### Debugging Tips
-
-**1. Capture raw CLI output to analyze:**
-```bash
-# Capture raw CLI output to analyze
-<agent> exec -o stream-json "Test prompt" 2>&1 | tee raw-output.jsonl
-
-# Pretty-print for analysis
-cat raw-output.jsonl | jq '.'
-```
-
-**2. Debug headless adapter directly:**
-```bash
-# Test initialize and session creation
-printf '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":1}}\n{"jsonrpc":"2.0","id":2,"method":"session/new","params":{}}\n' | \
-  bunx @plaited/agent-eval-harness headless --schema ./my-schema.json 2>&1
-```
-
-**3. Common JSONPath issues:**
-
-| Path | Issue | Fix |
-|------|-------|-----|
-| `$.content[0].text` | Array indexing not working | Verify JSONPath impl supports `[0]` |
-| `$.message.content` | Returns object, not string | Add `[0].text` for array access |
-| `$.result` | No match | Check actual JSON has `type: "result"` |
-
-**4. Test JSONPath extraction manually:**
-```bash
-# Parse actual agent output
-echo '{"type":"assistant","message":{"content":[{"type":"text","text":"Hello"}]}}' | \
-  jq '.message.content[0].text'
-# Expected: "Hello"
-```
diff --git a/.agents/skills/headless-adapters/references/troubleshooting-guide.md b/.agents/skills/headless-adapters/references/troubleshooting-guide.md
deleted file mode 100644
index 4f592b8..0000000
--- a/.agents/skills/headless-adapters/references/troubleshooting-guide.md
+++ /dev/null
@@ -1,497 +0,0 @@
-# Troubleshooting Guide for Headless Adapters
-
-This guide documents common issues encountered when creating headless adapter schemas, based on real debugging sessions.
-
-## Table of Contents
-
-1. [Tool Calls Not Appearing in Trajectories](#tool-calls-not-appearing)
-2. [Stdin Mode Issues](#stdin-mode-issues)
-3. [Tool Input/Output Not Captured](#tool-input-output-missing)
-4. [JSONPath Debugging](#jsonpath-debugging)
-5. [Output Event Matching](#output-event-matching)
-
----
-
-## Tool Calls Not Appearing in Trajectories {#tool-calls-not-appearing}
-
-### Symptom
-
-- Trajectories show `"trajectoryRichness": "messages-only"`
-- Zero tool_call events in captured output
-- Agent responses suggest tool usage (long response times, mentions of tools)
-- Result output includes information that clearly required external tool calls
-
-### Root Cause
-
-Tool calls are often nested inside arrays in the CLI's JSON output, but the schema's `outputEvents` mappings only check single JSONPath locations without array iteration.
-
-**Example:** Claude Code emits tool calls inside `$.message.content[]` arrays:
-```json
-{
-  "type": "assistant",
-  "message": {
-    "content": [
-      {"type": "text", "text": "I'll search for that..."},
-      {"type": "tool_use", "name": "WebSearch", "input": {...}},
-      {"type": "tool_result", "tool_use_id": "...", "content": "..."}
-    ]
-  }
-}
-```
-
-### Solution
-
-Use wildcard `[*]` syntax in JSONPath expressions to iterate over array items:
-
-```json
-{
-  "outputEvents": [
-    {
-      "match": { "path": "$.message.content[*].type", "value": "tool_use" },
-      "emitAs": "tool_call",
-      "extract": { "title": "$.name", "status": "'pending'", "input": "$.input" }
-    },
-    {
-      "match": { "path": "$.message.content[*].type", "value": "tool_result" },
-      "emitAs": "tool_call",
-      "extract": { "title": "$.tool_use_id", "status": "'completed'", "output": "$.content" }
-    }
-  ]
-}
-```
-
-### Debugging Steps
-
-1. **Capture raw CLI output:**
-   ```bash
-   <agent> -p "Use a tool" --output-format stream-json 2>&1 | tee raw-output.jsonl
-   ```
-
-2. **Examine JSON structure:**
-   ```bash
-   cat raw-output.jsonl | jq '.' | less
-   ```
-
-3. **Look for tool-related fields:**
-   ```bash
-   cat raw-output.jsonl | jq 'paths | select(.[-1] | tostring | test("tool|call|use"))'
-   ```
-
-4. **Test JSONPath extraction:**
-   ```bash
-   cat raw-output.jsonl | jq '.message.content[] | select(.type == "tool_use")'
-   ```
-
-5. **Update schema with correct paths** and test with `headless --debug`
-
----
-
-## Stdin Mode Issues {#stdin-mode-issues}
-
-### Symptom
-
-- CLI returns error: `error: unexpected argument '' found`
-- Adapter works when prompt is passed via flag, fails with stdin
-- Command construction looks correct but CLI rejects it
-
-### Root Cause
-
-When using stdin mode (where CLI reads prompt from stdin via `-` or similar), the headless adapter was incorrectly adding an empty string as a positional argument, resulting in commands like:
-
-```bash
-codex exec --json - ""  # ❌ Empty string causes error
-```
-
-Instead of:
-
-```bash
-echo "prompt" | codex exec --json -  # ✅ Correct
-```
-
-### Solution
-
-Use the `stdin: true` field in the prompt configuration:
-
-```json
-{
-  "command": ["codex", "exec", "--json", "-"],
-  "prompt": {
-    "stdin": true
-  }
-}
-```
-
-This tells the adapter to:
-1. Not add the prompt text to the command arguments
-2. Use `stdin: 'pipe'` when spawning the process
-3. Write the prompt to the process's stdin stream
-
-### Important Notes
-
-- **The `-` marker** (or equivalent) must be in the `command` array, not added automatically
-- **Empty `flag: ""`** is different from `stdin: true`:
-  - `flag: ""` = positional argument (appends prompt to command args)
-  - `stdin: true` = write to stdin (no prompt in command args)
-
-### When to Use Stdin Mode
-
-Use `stdin: true` when:
-- CLI documentation shows `-` for stdin (e.g., `codex exec -`)
-- CLI accepts prompts via pipe: `echo "prompt" | cli-command`
-- CLI has `--stdin` or similar flag expecting piped input
-
-**Do not use** when:
-- CLI expects prompt as positional argument: `cli-command "prompt text"`
-- CLI uses flag for prompt: `cli-command -p "prompt text"`
-
-### Debugging Steps
-
-1. **Test CLI manually with stdin:**
-   ```bash
-   echo "Say hello" | <agent> <flags> -
-   ```
-
-2. **Verify no trailing arguments:**
-   ```bash
-   # Should work:
-   echo "test" | codex exec --json -
-
-   # Will fail:
-   echo "test" | codex exec --json - ""
-   ```
-
-3. **Check process spawn in adapter:**
-   - Enable verbose mode: `headless --debug --verbose`
-   - Look for command construction in output
-
-4. **Update schema:**
-   - Add `"stdin": true` to prompt config
-   - Remove empty `flag` field if present
-   - Ensure `-` is in command array
-
----
-
-## JSONPath Debugging {#jsonpath-debugging}
-
-### Common JSONPath Patterns
-
-#### Nested Fields
-```json
-{
-  "message": {
-    "content": "Hello"
-  }
-}
-```
-**Path:** `$.message.content`
-
-#### Array Index
-```json
-{
-  "items": [
-    {"text": "First"},
-    {"text": "Second"}
-  ]
-}
-```
-**Path:** `$.items[0].text` → "First"
-
-#### Array Wildcard
-```json
-{
-  "items": [
-    {"type": "tool", "name": "Read"},
-    {"type": "tool", "name": "Write"}
-  ]
-}
-```
-**Path:** `$.items[*].type` → Returns array of items where you can check `type`
-
-#### Nested Array Access
-```json
-{
-  "message": {
-    "content": [
-      {"type": "text"},
-      {"type": "tool_use", "name": "Search"}
-    ]
-  }
-}
-```
-**Path:** `$.message.content[*].type` → Iterate over content array
-
-#### Literal Values
-Sometimes you need to return a fixed value:
-```json
-{
-  "extract": {
-    "status": "'pending'"  // Single quotes = literal string
-  }
-}
-```
-
-### Testing JSONPath Expressions
-
-Use `jq` to test paths against real CLI output:
-
-```bash
-# Test basic path
-cat output.jsonl | jq '.message.content'
-
-# Test array access
-cat output.jsonl | jq '.message.content[0]'
-
-# Test wildcard iteration
-cat output.jsonl | jq '.message.content[] | select(.type == "tool_use")'
-
-# Test extraction
-cat output.jsonl | jq '.message.content[] | select(.type == "tool_use") | .name'
-```
-
-### Common Mistakes
-
-❌ **Missing `$` prefix:**
-```json
-"path": "type"  // Wrong
-"path": "$.type"  // Correct
-```
-
-❌ **Wrong array syntax:**
-```json
-"path": "$.items.*"  // Wrong
-"path": "$.items[*]"  // Correct
-```
-
-❌ **Nested wildcard without intermediate property:**
-```json
-"path": "$.[*].type"  // Wrong (missing property name)
-"path": "$.content[*].type"  // Correct
-```
-
-❌ **Trying to use jq-specific syntax:**
-```json
-"path": "$.items[] | select(.active)"  // Wrong (jq syntax)
-"path": "$.items[*]"  // Correct (JSONPath syntax)
-```
-
-❌ **Using JSONPath filter expressions:**
-```json
-"extract": {
-  "title": "$.content[?(@.type=='tool_use')].name"  // Wrong (filter not supported)
-}
-```
-Filter expressions `[?()]` are not supported and silently return `undefined`. Use wildcard matching at the `match` level instead, then simple item-relative paths in `extract`:
-```json
-{
-  "match": { "path": "$.content[*].type", "value": "tool_use" },
-  "extract": { "title": "$.name" }
-}
-```
-
----
-
-## Tool Input/Output Not Captured {#tool-input-output-missing}
-
-### Symptom
-
-- Trajectory `tool_call` steps have `name` and `status` but no `input` or `output`
-- Graders can't inspect what arguments were passed to tools
-- Graders can't inspect what results tools returned
-- Step-level analysis is limited to tool name only
-
-### Root Cause
-
-The schema's `extract` configuration for tool events only maps `title` and `status`, omitting `input` and `output` fields:
-
-```json
-{
-  "match": { "path": "$.type", "value": "tool_use" },
-  "emitAs": "tool_call",
-  "extract": { "title": "$.name", "status": "'pending'" }
-}
-```
-
-### Solution
-
-Add `input` to the tool start event and `output` to the tool completion event:
-
-```json
-{
-  "match": { "path": "$.type", "value": "tool_use" },
-  "emitAs": "tool_call",
-  "extract": { "title": "$.name", "status": "'pending'", "input": "$.input" }
-},
-{
-  "match": { "path": "$.type", "value": "tool_result" },
-  "emitAs": "tool_call",
-  "extract": { "title": "$.name", "status": "'completed'", "output": "$.content" }
-}
-```
-
-**Key details:**
-- `input` and `output` preserve native types (objects, arrays) — they are **not** coerced to string
-- The JSONPath varies by CLI: Claude uses `$.input`/`$.content`, Codex uses `$.item.arguments`/`$.item.result`
-- Both fields are optional: schemas without them continue to work, tool calls just lack argument/result data
-
-### Debugging Steps
-
-1. **Capture raw output and identify input/output fields:**
-   ```bash
-   <agent> -p "Read a file" --output-format stream-json 2>&1 | \
-     jq 'select(.type == "tool_use" or .type == "tool_result")'
-   ```
-
-2. **Map the correct JSONPaths** for your agent's tool events
-
-3. **Add `input`/`output`** to the extract config and test with `headless --debug`
-
----
-
-## Output Event Matching {#output-event-matching}
-
-### Understanding Match Logic
-
-Output events use a two-step process:
-1. **Match:** Find JSON lines that match a pattern
-2. **Extract:** Pull specific fields from matched lines
-
-```json
-{
-  "match": { "path": "$.type", "value": "message" },
-  "emitAs": "message",
-  "extract": { "content": "$.text" }
-}
-```
-
-This means:
-- Check if `$.type` equals `"message"`
-- If yes, emit a session `message` update
-- Extract content from `$.text`
-
-### Wildcard Matching
-
-When using array wildcards `[*]`, the match checks each array item:
-
-```json
-{
-  "match": { "path": "$.items[*].type", "value": "tool_use" },
-  "emitAs": "tool_call",
-  "extract": { "title": "$.name" }
-}
-```
-
-This means:
-- Iterate over `$.items[]` array
-- For each item where `type == "tool_use"`
-- Emit a `tool_call` update
-- Extract title from `$.name` (relative to that item)
-
-### Extract Paths are Relative
-
-**Important:** Extract paths are relative to the matched object, not the root!
-
-```json
-{
-  "type": "assistant",
-  "message": {
-    "content": [
-      {"type": "tool_use", "name": "Read", "input": {...}}
-    ]
-  }
-}
-```
-
-```json
-{
-  "match": { "path": "$.message.content[*].type", "value": "tool_use" },
-  "extract": {
-    "title": "$.name",   // ✅ Relative to matched item
-    "input": "$.input"   // ✅ Relative to matched item, preserved as object
-  }
-}
-```
-
-NOT:
-```json
-{
-  "extract": {
-    "title": "$.message.content[0].name"  // ❌ This won't work
-  }
-}
-```
-
-### Result Events
-
-The `result` configuration marks when the agent is done:
-
-```json
-{
-  "result": {
-    "matchPath": "$.type",
-    "matchValue": "completed",
-    "contentPath": "$.summary"
-  }
-}
-```
-
-- **matchPath + matchValue:** Identify the completion event
-- **contentPath:** Extract final output (can be any field, not necessarily the full response)
-
-**Common patterns:**
-
-```json
-// Match specific type
-{
-  "matchPath": "$.type",
-  "matchValue": "turn.completed"
-}
-
-// Match any non-null value
-{
-  "matchPath": "$.status",
-  "matchValue": "*"
-}
-
-// Extract token stats as result
-{
-  "contentPath": "$.usage.output_tokens"
-}
-
-// Extract nothing (just signal completion)
-{
-  "contentPath": "$.type"
-}
-```
-
-### Debugging Match Issues
-
-1. **Check if events are being matched at all:**
-   ```bash
-   # Run headless --debug with verbose mode
-   bunx @plaited/agent-eval-harness headless --schema schema.json --debug
-   ```
-
-2. **Verify JSON structure matches your paths:**
-   ```bash
-   cat raw-output.jsonl | jq 'select(.type == "your-expected-type")'
-   ```
-
-3. **Test extraction paths:**
-   ```bash
-   cat raw-output.jsonl | jq 'select(.type == "tool_use") | .name'
-   ```
-
-4. **Common issues:**
-   - Path doesn't exist: Returns `undefined`, no update emitted
-   - Wrong array syntax: Match fails, no updates
-   - Extract path points to wrong object: Empty or wrong content in updates
-
-### Match Debugging Checklist
-
-- [ ] Raw CLI output contains the events you're trying to match
-- [ ] `matchPath` points to an existing field
-- [ ] `matchValue` exactly matches the field value (case-sensitive)
-- [ ] For wildcards `[*]`, the array exists and isn't empty
-- [ ] Extract paths are relative to the matched object
-- [ ] Result event is actually emitted by the CLI (not just inferred)
-

From 5d45878f2f875dfff19ea521b9a962693ddac230 Mon Sep 17 00:00:00 2001
From: Edward Irby <e.irby@pm.me>
Date: Tue, 10 Mar 2026 22:10:36 -0700
Subject: [PATCH 3/7] chore: remove stale .gemini config

Gemini Code Assist MCP server config from the old pipeline.
No longer relevant after the trial-runner replacement.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .gemini/settings.json | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 .gemini/settings.json

diff --git a/.gemini/settings.json b/.gemini/settings.json
deleted file mode 100644
index 424179b..0000000
--- a/.gemini/settings.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "mcpServers": {
-    "agent-skills-spec": {
-      "httpUrl": "https://agentskills.io/mcp",
-      "trust": true
-    },
-    "bun-docs": {
-      "httpUrl": "https://bun.com/docs/mcp",
-      "trust": true
-    },
-    "agent-client-protocol": {
-      "httpUrl": "https://agentclientprotocol.com/mcp",
-      "trust": true
-    }
-  }
-}

From 854390c8116e9094d6a257a73a00d117e16147ae Mon Sep 17 00:00:00 2001
From: Edward Irby <e.irby@pm.me>
Date: Tue, 10 Mar 2026 22:11:05 -0700
Subject: [PATCH 4/7] chore: remove stale bun-test-wrapper script

Old integration test wrapper for the previous pipeline's
integration_tests/ directory which no longer exists.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 scripts/bun-test-wrapper.sh | 27 ---------------------------
 1 file changed, 27 deletions(-)
 delete mode 100755 scripts/bun-test-wrapper.sh

diff --git a/scripts/bun-test-wrapper.sh b/scripts/bun-test-wrapper.sh
deleted file mode 100755
index 86c63aa..0000000
--- a/scripts/bun-test-wrapper.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Wrapper for bun test that handles Bun's post-test cleanup crash
-# See: https://github.com/oven-sh/bun/issues/23643
-#
-# Bun 1.3.x has a known bug where the test runner crashes during cleanup
-# after all tests complete successfully. This wrapper catches that crash
-# (exit code 133 = SIGTRAP) and exits cleanly if tests actually passed.
-
-# Create temp file for output
-tmpfile=$(mktemp)
-trap "rm -f $tmpfile" EXIT
-
-# Run integration tests with output to both terminal and file
-bun test ./**/integration_tests/*.spec.ts 2>&1 | tee "$tmpfile"
-exit_code=${PIPESTATUS[0]}
-
-# Check if tests passed (look for "X pass" and "0 fail" in output)
-if grep -q " pass" "$tmpfile" && grep -q "0 fail" "$tmpfile"; then
-  # Tests passed - exit 0 even if Bun crashed during cleanup
-  if [ $exit_code -eq 133 ]; then
-    echo ""
-    echo "Note: Bun crashed during cleanup (known bug), but all tests passed."
-    exit 0
-  fi
-fi
-
-exit $exit_code

From c2ced8a946a42c1b439efd60591c9c57b37647ef Mon Sep 17 00:00:00 2001
From: Edward Irby <e.irby@pm.me>
Date: Tue, 10 Mar 2026 22:12:02 -0700
Subject: [PATCH 5/7] chore: remove stale Docker test infrastructure and
 simplify CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

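Delete docker-compose.test.yml, Dockerfile.test, and the
test-integration CI job, all of which referenced integration_tests/,
which no longer exists. Also drop the `changes` path-filter job, whose
only consumer was test-integration, and rename test-pr to test. CI now
runs a single test job with check + test.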

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/ci.yml | 35 +--------------------------------
 Dockerfile.test          | 42 ----------------------------------------
 docker-compose.test.yml  | 22 ---------------------
 3 files changed, 1 insertion(+), 98 deletions(-)
 delete mode 100644 Dockerfile.test
 delete mode 100644 docker-compose.test.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ea55cde..d4dc7e1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -10,23 +10,7 @@ permissions:
   contents: read
 
 jobs:
-  # Detect which paths changed to conditionally run expensive jobs
-  changes:
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read
-    outputs:
-      src: ${{ steps.filter.outputs.src }}
-    steps:
-      - uses: actions/checkout@v4
-      - uses: dorny/paths-filter@v3
-        id: filter
-        with:
-          filters: |
-            src:
-              - 'src/**'
-
-  test-pr:
+  test:
     runs-on: ubuntu-latest
 
     steps:
@@ -40,20 +24,3 @@ jobs:
         run: bun run check
       - name: Run test
         run: bun run test
-
-  # Required GitHub Secrets:
-  #   - ANTHROPIC_API_KEY: API key for Claude Code integration tests
-  #   - GEMINI_API_KEY: API key for Gemini CLI integration tests
-  test-integration:
-    needs: changes
-    if: ${{ needs.changes.outputs.src == 'true' }}
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Run integration tests
-        env:
-          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
-        run: docker compose -f docker-compose.test.yml run --rm test
diff --git a/Dockerfile.test b/Dockerfile.test
deleted file mode 100644
index 5021d5b..0000000
--- a/Dockerfile.test
+++ /dev/null
@@ -1,42 +0,0 @@
-# Dockerfile for integration tests requiring Docker environment
-#
-# Integration tests require:
-# - External API keys (Anthropic, Gemini)
-# - Global CLI tools (claude, gemini)
-
-FROM oven/bun:1.2.9
-
-# Install git, curl, and Node.js 22 (required for Gemini CLI)
-RUN apt-get update && apt-get install -y git curl ca-certificates gnupg && \
-    mkdir -p /etc/apt/keyrings && \
-    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \
-    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_24.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
-    apt-get update && apt-get install -y nodejs && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install Gemini CLI globally (accessible to all users)
-RUN npm install -g @google/gemini-cli
-
-# Create non-root user (Claude CLI blocks --dangerously-skip-permissions as root)
-RUN useradd -m -s /bin/bash testuser
-
-# Switch to testuser for Claude CLI installation
-USER testuser
-WORKDIR /home/testuser
-
-# Install Claude CLI as testuser (required for --dangerously-skip-permissions)
-RUN curl -fsSL https://claude.ai/install.sh | bash
-
-# Add Claude CLI to PATH
-ENV PATH="/home/testuser/.local/bin:$PATH"
-
-WORKDIR /home/testuser/app
-
-# Copy source (ownership set to testuser)
-COPY --chown=testuser:testuser . .
-
-# Install dependencies
-RUN bun install --frozen-lockfile
-
-# Run integration tests (wrapper handles Bun cleanup crash workaround)
-CMD ["bash", "scripts/bun-test-wrapper.sh"]
diff --git a/docker-compose.test.yml b/docker-compose.test.yml
deleted file mode 100644
index ef5def7..0000000
--- a/docker-compose.test.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-# Docker Compose for integration tests requiring Docker environment
-#
-# Integration tests live in integration_tests/ directories and require:
-# - External API keys (Anthropic, Gemini)
-# - Global CLI tools (claude, gemini)
-#
-# Usage:
-#   Local:  ANTHROPIC_API_KEY=sk-... bun run test:integration
-#   CI:     Uses secrets from GitHub Actions
-
-services:
-  test:
-    build:
-      context: .
-      dockerfile: Dockerfile.test
-    environment:
-      # Passes through from host environment (works in CI and local)
-      - ANTHROPIC_API_KEY
-      - GEMINI_API_KEY
-    # Optional: mount src for live code debugging
-    # volumes:
-    #   - ./src:/app/src:ro

From e4a15248b4784fd26a9c732c64d97ffe811d76f7 Mon Sep 17 00:00:00 2001
From: Edward Irby <e.irby@pm.me>
Date: Tue, 10 Mar 2026 22:13:04 -0700
Subject: [PATCH 6/7] chore: remove stale .env.example

API keys were for the deleted Docker integration tests.
The trial runner delegates credential management to adapters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 .env.example

diff --git a/.env.example b/.env.example
deleted file mode 100644
index c2c924e..0000000
--- a/.env.example
+++ /dev/null
@@ -1,2 +0,0 @@
-ANTHROPIC_API_KEY=<anthropic api key>
-GEMINI_API_KEY=<gemini api key>

From 16e594e30223783d0e4005d58322d2d63474b401 Mon Sep 17 00:00:00 2001
From: Edward Irby <e.irby@pm.me>
Date: Tue, 10 Mar 2026 22:17:55 -0700
Subject: [PATCH 7/7] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?=
 =?UTF-8?q?update=20AGENTS.md,=20add=20CLI=20tests,=20verify=20exports?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AGENTS.md rewritten to reflect new trial runner architecture:
updated structure diagram, skills table, capabilities, commands.
Removed stale Docker test references and old skill names.

Added src/tests/cli.spec.ts with coverage for:
- CLI routing (trials/compare/calibrate/unknown/no-command)
- parseCli meta flags (--help, -h, --schema input)
- Input validation (valid JSON, invalid JSON, missing input)
- Export contract verification (runTrial, schemas)

All 65 tests pass. Types clean.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 AGENTS.md             | 181 ++++++++++++++++----------------------
 src/tests/cli.spec.ts | 198 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 273 insertions(+), 106 deletions(-)
 create mode 100644 src/tests/cli.spec.ts

diff --git a/AGENTS.md b/AGENTS.md
index a3e47cc..b39347c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -4,29 +4,35 @@ Agent guidance for this repository.
 
 ## Overview
 
-CLI tool capturing agent trajectories from headless CLI agents. Executes prompts, captures tools/thoughts/plans, outputs JSONL for evaluation.
+General-purpose eval harness for running trials against CLI agents. Executes prompts via adapter scripts, captures trajectories (thoughts, tool calls, messages), grades outputs, and writes JSONL results.
 
 ## Capabilities
 
 - **Multi-turn**: `input: string | string[]` executes sequentially in same session
 - **Isolation**: Fresh session per JSONL entry
-- **Parallelization**: `-j N` runs N prompts concurrently via worker pool
-- **Workspace isolation**: `--workspace-dir` creates per-prompt directories
-- **MCP auto-discovery**: No explicit `--mcp-server` flag needed
-- **Headless adapter**: Schema-driven JSON wrapper for any CLI agent
+- **Parallelization**: Concurrency control via worker pool
+- **Workspace isolation**: Creates per-prompt directories for adapter execution
+- **Polyglot adapters**: TS/JS modules (import `adapt` function) or executable scripts (stdin/stdout JSON protocol)
+- **Polyglot graders**: TS/JS modules (import `grade` function) or executable scripts
+- **pass@k metrics**: Multiple trials per prompt with statistical aggregation
 
 ## Structure
 
 ```
 src/
-├── harness/        # Core capture engine
-├── headless/       # Headless adapter implementation
-├── pipeline/       # Unix-style pipeline commands
-└── schemas/        # Zod schemas + types
-
-.agents/skills/     # AI agent skills (symlinked to .claude/, .cursor/)
-├── agent-eval-harness/
-└── headless-adapters/
+├── cli.ts             # Unified CLI entry point (trials/compare/calibrate)
+├── cli.utils.ts       # Shared CLI parsing utilities
+├── trial.ts           # Trial runner library + CLI handler
+├── trial.schemas.ts   # Zod schemas (single source of truth)
+├── trial.utils.ts     # Loaders, worker pool, trajectory analysis
+├── trial.constants.ts # Default timeout, default k
+└── tests/
+    └── trial.spec.ts  # Trial runner tests (CLI tests: cli.spec.ts)
+
+.agents/skills/
+├── trial-runner/      # Running trials with adapters
+├── trial-adapters/    # Writing adapter scripts
+└── compare-trials/    # Statistical comparison of trial results
 ```
 
 ## Commands
@@ -36,53 +42,47 @@ src/
 | `bun install` | Setup (requires bun >= v1.2.9) |
 | `bun run check` | Type/lint/format check |
 | `bun run check:write` | Auto-fix lint/format |
-| `bun test` | Unit tests |
+| `bun test src/` | Unit tests |
+
+## CLI
 
-**Docker integration tests:**
 ```bash
-ANTHROPIC_API_KEY=sk-... GEMINI_API_KEY=... \
-  docker compose -f docker-compose.test.yml run --rm test
+bunx @plaited/agent-eval-harness trials '{"adapterPath": "./adapter.ts", "promptsPath": "./prompts.jsonl", "k": 3}'
 ```
 
-## Skills
+| Subcommand | Status | Purpose |
+|------------|--------|---------|
+| `trials` | Implemented | Run trials against an adapter with optional grading |
+| `compare` | Stub | Statistical comparison of trial results |
+| `calibrate` | Stub | Grader calibration |
 
-| Skill | Commands | Use Case |
-|-------|----------|----------|
-| **agent-eval-harness** | `capture`, `trials`, `summarize`, `calibrate`, `validate-refs`, `balance`, `schemas`, `run`, `extract`, `grade`, `format`, `compare` | Trajectory capture, training data, regression tests, A/B comparison |
-| **headless-adapters** | `headless` | Find/create/validate adapter schemas |
+## Package Exports
 
-**Install:** `npx skills add plaited/agent-eval-harness` or `bunx skills add plaited/agent-eval-harness`
+| Import Path | What It Exports |
+|------------|----------------|
+| `@plaited/agent-eval-harness` | `runTrial`, `calculatePassAtK`, `calculatePassExpK`, `trialCli` |
+| `@plaited/agent-eval-harness/schemas` | All Zod schemas and types (`Grader`, `Adapter`, `TrajectoryStep`, etc.) |
+
+## Skills
+
+| Skill | Use Case |
+|-------|----------|
+| **trial-runner** | Running trials with adapters, interpreting results |
+| **trial-adapters** | Writing adapter scripts for different agents |
+| **compare-trials** | Statistical comparison of trial result sets |
 
 ## Constraints
 
 - **Bun required**: >= v1.2.9
-- **ES2024**: Uses `Promise.withResolvers()` and modern APIs
+- **ES2024**: Uses modern APIs
 
 ## Verification
 
 **Before commit:**
 - `bun run check` passes
-- `bun test` passes (unit tests)
+- `bun test src/` passes
 - No `--no-verify` on git commits
 
-**Skill validation:**
-```bash
-bunx @plaited/development-skills validate-skill .agents/skills/<name>
-```
-
-## Workflow
-
-1. **Plan first**: Use TodoWrite for multi-step tasks
-2. **Read before edit**: Verify current code before proposing changes
-3. **Verify incrementally**: Run checks after each change
-4. **No over-engineering**: Only requested changes
-
-Development rules in `.agents/rules/` - reference via @.agents/rules/[name].md in CLAUDE.md
-
-## Learnings
-
-*Dated entries from actual issues encountered will appear here*
-
 <!-- PLAITED-RULES-START -->
 
 ## Rules
@@ -95,12 +95,14 @@ Development rules in `.agents/rules/` - reference via @.agents/rules/[name].md i
 - `Bun.file(path).exists()` not `fs.existsSync()`
 - `Bun.file(path).text()` not `readFileSync()`
 - `Bun.write(path, data)` not `writeFileSync()`
-*Verify:* `grep 'from .node:fs' src/`  
+*Verify:* `grep 'from .node:fs' src/`
 *Fix:* Replace with Bun.file/Bun.write
 
+**When Node.js OK:** `appendFile` (no Bun async append equivalent), `mkdir` with `{ recursive: true }`, `node:path` utilities
+
 **Shell commands:**
 - `Bun.$\`cmd\`` not `child_process.spawn()`
-*Verify:* `grep 'child_process' src/`  
+*Verify:* `grep 'child_process' src/`
 *Fix:* Replace with Bun.$ template literal
 
 **Path resolution:**
@@ -113,8 +115,6 @@ Development rules in `.agents/rules/` - reference via @.agents/rules/[name].md i
 - `Bun.which(cmd)` to check if command exists
 - `Bun.$\`bun add pkg\`` for package management
 
-**When Node.js OK:** readline (interactive input), node:path utilities, APIs without Bun equivalents
-
 **Docs:** https://bun.sh/docs
 
 
@@ -122,9 +122,9 @@ Development rules in `.agents/rules/` - reference via @.agents/rules/[name].md i
 
 ## Git Commits
 
-**Conventional commits** - `feat:`, `fix:`, `refactor:`, `docs:`, `chore:`, `test:`  
-**Multi-line messages** - Use for detailed context  
-**Never --no-verify** - Fix the issue, don't bypass hooks  
+**Conventional commits** - `feat:`, `fix:`, `refactor:`, `docs:`, `chore:`, `test:`
+**Multi-line messages** - Use for detailed context
+**Never --no-verify** - Fix the issue, don't bypass hooks
 *Verify:* Check git log format
 
 ## GitHub CLI
@@ -145,7 +145,7 @@ gh api repos/<owner>/<repo>/pulls/<n>/comments
 
 **PR checklist:**
 - [ ] Human reviewer comments
-- [ ] AI code review comments  
+- [ ] AI code review comments
 - [ ] Security alerts (ReDoS, injection)
 - [ ] Code quality comments
 - [ ] Inline suggestions
@@ -162,12 +162,12 @@ gh api repos/<owner>/<repo>/pulls/<n>/comments
 
 # Module Organization
 
-**No index.ts** - Never use index files, they create implicit magic  
-*Verify:* `find . -name 'index.ts'`  
+**No index.ts** - Never use index files, they create implicit magic
+*Verify:* `find . -name 'index.ts'`
 *Fix:* Rename to feature name: `feature/index.ts` → `feature.ts` at parent level
 
-**Explicit .ts extensions** - `import { x } from './file.ts'` not `'./file'`  
-*Verify:* `grep "from '\./.*[^s]'" src/` (imports without .ts)  
+**Explicit .ts extensions** - `import { x } from './file.ts'` not `'./file'`
+*Verify:* `grep "from '\./.*[^s]'" src/` (imports without .ts)
 *Fix:* Add `.ts` extension
 
 **Re-export at boundaries** - Parent `feature.ts` re-exports from `feature/feature.ts`
@@ -179,7 +179,7 @@ graph TD
     B --> D[feature.ts]
     B --> E[tests/]
     E --> F[feature.spec.ts]
-    
+
     C -.Re-exports.-> D
 ```
 
@@ -189,41 +189,37 @@ graph TD
 - `feature.constants.ts` - Constants, error codes
 - `feature.ts` - Main implementation
 
-**Direct imports** - Import from specific files, not through re-exports within module  
-*Verify:* Check for circular imports  
+**Direct imports** - Import from specific files, not through re-exports within module
+*Verify:* Check for circular imports
 *Fix:* Import directly: `from './feature.types.ts'` not `from './feature.ts'`
 
 
 # Testing
 
-**Use test not it** - `test('description', ...)` instead of `it('...')`  
-*Verify:* `grep '\bit(' src/**/*.spec.ts`  
+**Use test not it** - `test('description', ...)` instead of `it('...')`
+*Verify:* `grep '\bit(' src/**/*.spec.ts`
 *Fix:* Replace `it(` with `test(`
 
-**No conditional assertions** - Never `if (x) expect(x.value)`  
-*Verify:* `grep 'if.*expect\|&&.*expect' src/**/*.spec.ts`  
+**No conditional assertions** - Never `if (x) expect(x.value)`
+*Verify:* `grep 'if.*expect\|&&.*expect' src/**/*.spec.ts`
 *Fix:* Assert condition first: `expect(x).toBeDefined(); expect(x.value)...`
 
-**Test both branches** - Try/catch, conditionals, fallbacks need both paths tested  
-*Verify:* Review test coverage for error paths  
+**Test both branches** - Try/catch, conditionals, fallbacks need both paths tested
+*Verify:* Review test coverage for error paths
 *Fix:* Add test for catch block, else branch, fallback case
 
-**Use real dependencies** - Prefer installed packages over mocks when testing module resolution  
-*Verify:* Review test imports for fake paths  
+**Use real dependencies** - Prefer installed packages over mocks when testing module resolution
+*Verify:* Review test imports for fake paths
 *Fix:* Use actual package like `typescript`
 
-**Organize with describe** - Group related tests in `describe('feature', () => {...})`  
-*Verify:* Check for flat test structure  
+**Organize with describe** - Group related tests in `describe('feature', () => {...})`
+*Verify:* Check for flat test structure
 *Fix:* Add describe blocks by category (happy path, edge cases, errors)
 
-**Coverage checklist** - Happy path, edge cases, error paths, real integrations  
+**Coverage checklist** - Happy path, edge cases, error paths, real integrations
 *Verify:* Review test file completeness
 
-**Docker tests** - `*.docker.ts` for external APIs, run via docker-compose  
-*Verify:* Check if test needs API key or external service  
-*Fix:* Rename to `.docker.ts`, update CI gating
-
-**Run:** `bun test` before commit
+**Run:** `bun test src/` before commit
 
 
 # Accuracy
@@ -239,12 +235,6 @@ graph TD
 - Present issue to user for resolution
 - Never invent solutions
 
-**TypeScript verification** - Use LSP tools for type-aware analysis:
-- `lsp-find` - Search symbols across workspace
-- `lsp-refs` - Find all usages before modifying
-- `lsp-hover` - Verify type signatures
-- `lsp-analyze` - Batch analysis of file structure
-
 **Dynamic exploration:**
 - Read tool for direct file verification
 - Grep/Glob for content and pattern searches
@@ -256,8 +246,6 @@ graph TD
 - Code review: Read files before commenting
 - Patterns: Confirm examples reflect actual usage
 
-See rules/testing.md for verification in test contexts.
-
 
 # Skill Activation
 
@@ -272,16 +260,6 @@ See rules/testing.md for verification in test contexts.
 *Verify:* Did you check available skills before starting implementation?
 *Fix:* Pause, evaluate skills, activate relevant ones, then continue
 
-**Example:**
-```
-- code-patterns: NO - not writing code
-- git-workflow: YES - need commit conventions
-- documentation: YES - writing README
-
-> Skill(git-workflow)
-> Skill(documentation)
-```
-
 **Activation before implementation** - Evaluating skills without calling `Skill()` provides no benefit
 *Verify:* Check that `Skill()` was called for each YES evaluation
 *Fix:* Call `Skill(skill-name)` for skipped activations
@@ -306,16 +284,16 @@ See rules/testing.md for verification in test contexts.
  */
 ```
 
-**No @example** - Tests are living examples  
-**Use @internal** - Mark non-public APIs  
-**Mermaid only** - No ASCII box-drawing diagrams  
+**No @example** - Tests are living examples
+**Use @internal** - Mark non-public APIs
+**Mermaid only** - No ASCII box-drawing diagrams
 *Verify:* `grep '[┌│└─]' *.md`
 
 
 # Core Conventions
 
 **Type over interface** - `type User = {` instead of `interface User {`
-*Verify:* `lsp-find interface` or `grep 'interface [A-Z]' src/`
+*Verify:* `grep 'interface [A-Z]' src/`
 *Fix:* Replace `interface X {` with `type X = {`
 
 **No any types** - Use `unknown` with type guards
@@ -332,29 +310,20 @@ See rules/testing.md for verification in test contexts.
 
 **Object params >2 args** - `fn({ a, b, c }: { ... })` not `fn(a, b, c)`
 *Exception:* CLI entry points take `args: string[]`
-*Verify:* Review function signatures with `lsp-hover`
+*Verify:* Review function signatures
 
 **Private fields** - Use `#field` (ES2022) not `private field` (TypeScript)
 *Verify:* `grep 'private \w' src/`
 *Fix:* Replace `private x` with `#x`
 
 **JSON imports** - `import x from 'file.json' with { type: 'json' }`
-*Verify:* `grep "from.*\.json['\"]" src/` (check for missing `with`)
-*Fix:* Add `with { type: 'json' }`
 
 **@ts-ignore needs description** - `// @ts-ignore - reason here`
-*Verify:* `grep '@ts-ignore' src/` (check for missing comment)
 
 **Short-circuit/ternary OK** - `condition && doSomething()` is acceptable
 
-**Empty interface extending single** - `interface Custom extends Base {}` is OK for branded types
-
 **Mermaid diagrams only** - No ASCII box-drawing in markdown
-*Verify:* `grep '[┌│└─]' *.md`
 
 **No @example in TSDoc** - Tests are living examples
 
-**AgentSkills validation** - `bunx @plaited/development-skills validate-skill <path>`
-
-
 <!-- PLAITED-RULES-END -->
diff --git a/src/tests/cli.spec.ts b/src/tests/cli.spec.ts
new file mode 100644
index 0000000..3b38894
--- /dev/null
+++ b/src/tests/cli.spec.ts
@@ -0,0 +1,198 @@
+/**
+ * Tests for CLI entry point and utilities.
+ *
+ * @remarks
+ * Covers: CLI routing (trials/compare/calibrate/unknown),
+ * parseCli meta flags (--help, --schema), and input validation.
+ */
+
+import { describe, expect, test } from 'bun:test'
+import * as z from 'zod'
+
+// ============================================================================
+// CLI Entry Point Routing
+// ============================================================================
+
+describe('CLI entry point', () => {
+  // Absolute path to the CLI script, resolved relative to this spec file
+  const cliPath = `${import.meta.dir}/../cli.ts`
+
+  /**
+   * Spawns the CLI entry point with `args` and waits for it to finish.
+   *
+   * @remarks
+   * stdout and stderr are read concurrently with `proc.exited`, so both
+   * pipes are being consumed while the test waits for the child to exit.
+   * stdout is captured even though the current tests only assert on
+   * stderr and the exit code.
+   * Mirrors the `runParseCli` helper used by the parseCli tests below.
+   *
+   * @param args - Arguments placed after the script path on the command line
+   * @returns Captured stdout and stderr text plus the process exit code
+   *
+   * @internal
+   */
+  const runCli = async (args: string[]) => {
+    const proc = Bun.spawn(['bun', cliPath, ...args], {
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    const [stdout, stderr, exitCode] = await Promise.all([
+      new Response(proc.stdout).text(),
+      new Response(proc.stderr).text(),
+      proc.exited,
+    ])
+    return { stdout, stderr, exitCode }
+  }
+
+  // `--help` is passed after the `trials` command; the run is expected to
+  // succeed with the usage text written to stderr.
+  test('trials command routes to trialCli', async () => {
+    const { stderr, exitCode } = await runCli(['trials', '--help'])
+    expect(exitCode).toBe(0)
+    expect(stderr).toContain('Usage:')
+  })
+
+  // `compare` is a stub for now: it must fail with exit code 1 and say so
+  // on stderr.
+  test('compare command exits with error (stub)', async () => {
+    const { stderr, exitCode } = await runCli(['compare'])
+    expect(exitCode).toBe(1)
+    expect(stderr).toContain('not yet implemented')
+  })
+
+  // `calibrate` is a stub for now, with the same expected failure as
+  // `compare`.
+  test('calibrate command exits with error (stub)', async () => {
+    const { stderr, exitCode } = await runCli(['calibrate'])
+    expect(exitCode).toBe(1)
+    expect(stderr).toContain('not yet implemented')
+  })
+
+  // An unrecognised command name must be echoed back in the error, along
+  // with the list of available commands.
+  test('unknown command exits with error', async () => {
+    const { stderr, exitCode } = await runCli(['bogus'])
+    expect(exitCode).toBe(1)
+    expect(stderr).toContain('Unknown command: bogus')
+    expect(stderr).toContain('Available commands')
+  })
+
+  // Omitting the command entirely is expected to be reported the same
+  // way as an unrecognised name: exit code 1 with an "Unknown command"
+  // message on stderr.
+  test('no command exits with error', async () => {
+    const { stderr, exitCode } = await runCli([])
+    expect(exitCode).toBe(1)
+    expect(stderr).toContain('Unknown command')
+  })
+})
+
+// ============================================================================
+// parseCli
+// ============================================================================
+
+describe('parseCli', () => {
+  /**
+   * Runs `parseCli` in a child `bun -e` process and captures its output.
+   * A subprocess is required because parseCli calls process.exit() on meta flags.
+   */
+  const runParseCli = async (args: string[]) => {
+    // JSON.stringify yields a valid string literal even if the path contains quotes or backslashes
+    const script = `
+      import { parseCli } from ${JSON.stringify(`${import.meta.dir}/../cli.utils.ts`)}
+      import * as z from 'zod'
+      const schema = z.object({ name: z.string(), count: z.number().default(1) })
+      const result = await parseCli(${JSON.stringify(args)}, schema, { name: 'test-cmd' })
+      console.log(JSON.stringify(result))
+    `
+    const proc = Bun.spawn(['bun', '-e', script], { stdout: 'pipe', stderr: 'pipe' })
+    const [stdout, stderr, exitCode] = await Promise.all([
+      new Response(proc.stdout).text(),
+      new Response(proc.stderr).text(),
+      proc.exited,
+    ])
+    return { stdout: stdout.trim(), stderr: stderr.trim(), exitCode }
+  }
+
+  test('--help exits 0 with usage', async () => {
+    const { stderr, exitCode } = await runParseCli(['--help'])
+    expect(exitCode).toBe(0)
+    expect(stderr).toContain('Usage:')
+    expect(stderr).toContain('test-cmd')
+  })
+
+  test('-h exits 0 with usage', async () => {
+    const { stderr, exitCode } = await runParseCli(['-h'])
+    expect(exitCode).toBe(0)
+    expect(stderr).toContain('Usage:')
+  })
+
+  test('--schema input exits 0 with JSON Schema', async () => {
+    const { stdout, exitCode } = await runParseCli(['--schema', 'input'])
+    expect(exitCode).toBe(0)
+    const schema = JSON.parse(stdout)
+    expect(schema.type).toBe('object')
+    expect(schema.properties).toBeDefined()
+    expect(schema.properties.name).toBeDefined()
+  })
+
+  test('parses positional JSON arg', async () => {
+    const { stdout, exitCode } = await runParseCli(['{"name": "hello"}'])
+    expect(exitCode).toBe(0)
+    const result = JSON.parse(stdout)
+    expect(result.name).toBe('hello')
+    expect(result.count).toBe(1) // default applied
+  })
+
+  test('invalid JSON exits 2', async () => {
+    const { stderr, exitCode } = await runParseCli(['not-json'])
+    expect(exitCode).toBe(2)
+    expect(stderr).toContain('Invalid JSON')
+  })
+
+  test('schema validation failure exits 2', async () => {
+    const { exitCode } = await runParseCli(['{"wrong": "field"}'])
+    expect(exitCode).toBe(2)
+  })
+
+  test('no input exits 2', async () => {
+    const { stderr, exitCode } = await runParseCli([])
+    expect(exitCode).toBe(2)
+    expect(stderr).toContain('Usage:')
+  })
+})
+
+// ============================================================================
+// Export Contract
+// ============================================================================
+
+describe('export contract', () => {
+  test('@plaited/agent-eval-harness exports runTrial', async () => {
+    const { runTrial, calculatePassAtK, calculatePassExpK, trialCli } = await import('../trial.ts')
+    for (const exported of [runTrial, calculatePassAtK, calculatePassExpK, trialCli]) {
+      expect(typeof exported).toBe('function')
+    }
+  })
+
+  test('@plaited/agent-eval-harness/schemas exports Grader type and schemas', async () => {
+    const schemaModule = await import('../trial.schemas.ts')
+    // Presence check only: each named schema export must be defined
+    const expectedExports = [
+      'GraderResultSchema',
+      'TrajectoryStepSchema',
+      'TrialResultSchema',
+      'TrialEntrySchema',
+      'PromptCaseSchema',
+      'AdapterResultSchema',
+      'AdapterInputSchema',
+    ] as const
+    for (const exportName of expectedExports) {
+      expect(schemaModule[exportName]).toBeDefined()
+    }
+
+    // Parse a minimal valid grader result and check the parsed value
+    const parsed = schemaModule.GraderResultSchema.parse({ pass: true, score: 1.0, reasoning: 'test' })
+    expect(parsed.pass).toBe(true)
+  })
+})