diff --git a/.github/PULL_REQUEST_TEMPLATE/tier5-queries.md b/.github/PULL_REQUEST_TEMPLATE/tier5-queries.md new file mode 100644 index 00000000..c7504359 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/tier5-queries.md @@ -0,0 +1,39 @@ + + +## Summary + +Submitting **N** Tier 5.5 queries for BrainBench. + +- Author handle: `@your-handle` +- File location: `eval/external-authors/your-handle/queries.json` +- Queries authored fresh (not copy-pasted from a model output) +- Slugs verified against `eval/data/world-v1/` (via `bun run eval:world:view`) + +## Checklist + +- [ ] `bun run eval:query:validate eval/external-authors/your-handle/queries.json` passes +- [ ] At least 20 queries +- [ ] Each query has either `gold.relevant` (with real slugs) or `gold.expected_abstention: true` +- [ ] Temporal queries have `as_of_date` set (`corpus-end` | `per-source` | ISO-8601) +- [ ] Phrasing is varied (not all the same template) +- [ ] `author` field matches my handle + +## Phrasing variety (optional self-audit) + +Tick the styles represented in your batch: + +- [ ] Full sentence questions +- [ ] Fragment-style ("crypto founder Goldman Sachs background") +- [ ] Comparison ("X vs Y") +- [ ] Follow-up ("And who else...") +- [ ] Imperative ("Pull up Alice Davis") +- [ ] Trait-based ("the demanding engineering leader") +- [ ] Abstention bait (answer is "not in corpus") + +## Notes to reviewer + +Anything worth flagging — ambiguous cases, corpus gaps you found, specific +phrasings you were uncertain about. diff --git a/.github/workflows/eval-tests.yml b/.github/workflows/eval-tests.yml new file mode 100644 index 00000000..0e8f5f96 --- /dev/null +++ b/.github/workflows/eval-tests.yml @@ -0,0 +1,40 @@ +name: Eval tests + +on: + push: + branches: [master] + paths: + - 'eval/**' + - 'src/core/link-extraction.ts' + - 'src/core/search/**' + pull_request: + branches: [master] + paths: + - 'eval/**' + - 'src/core/link-extraction.ts' + - 'src/core/search/**' + +permissions: + contents: read + +jobs: + eval-tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2 + with: + bun-version: latest + - run: bun install + + # Validate the built-in Tier 5 + 5.5 query set. + - name: Validate built-in queries + run: bun run eval:query:validate + + # Pure-function unit tests — zero API calls, fast. + - name: Run eval unit tests + run: bun run test:eval + + # Smoke-test the world.html renderer against the committed corpus. + - name: Render world.html + run: bun run eval:world:render diff --git a/.gitignore b/.gitignore index 44799ebc..2b0e83ef 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ supabase/.temp/ .claude/skills/ .idea eval/reports/ +eval/data/world-v1/world.html diff --git a/docs/benchmarks/2026-04-19-brainbench-multi-adapter.md b/docs/benchmarks/2026-04-19-brainbench-multi-adapter.md new file mode 100644 index 00000000..2b835400 --- /dev/null +++ b/docs/benchmarks/2026-04-19-brainbench-multi-adapter.md @@ -0,0 +1,99 @@ +# BrainBench — multi-adapter side-by-side (2026-04-19) + +**Branch:** `garrytan/gbrain-evals` +**Commit:** `b81373d` +**Engine:** PGLite (in-memory) +**Corpus:** `eval/data/world-v1/` (240 rich-prose fictional pages, committed) +**Runner:** `bun run eval:run` (N=5, page-order shuffled per run, seeded LCG) +**Wall time:** ~11.5 min + +## Headline + +| Adapter | Runs | Queries | P@5 | R@5 | Correct in top-5 (run 1) | +|------------------|------|---------|--------------|--------------|--------------------------| +| **gbrain-after** | 5 | 145 | **49.1%** ±0 | **97.9%** ±0 | **248 / 261** | +| hybrid-nograph | 5 | 145 | 17.8% | 65.1% | 129 / 261 | +| ripgrep-bm25 | 5 | 145 | 17.1% | 62.4% | 124 / 261 | +| vector-only | 5 | 145 | 10.8% | 40.7% | 78 / 261 | + +Stddev = 0 across all adapters this run — every adapter is deterministic over +page ordering. That's the correct signal for the shipped code (non-zero would +surface an order-dependent tie-break bug). + +### Deltas vs gbrain-after + +- hybrid-nograph: P@5 **−31.4 pts**, R@5 **−32.9 pts**, correct-in-top-5 **−119** +- ripgrep-bm25: P@5 **−32.0 pts**, R@5 **−35.5 pts**, correct-in-top-5 **−124** +- vector-only: P@5 **−38.4 pts**, R@5 **−57.2 pts**, correct-in-top-5 **−170** + +### Per-adapter wall time (5 runs) + +| Adapter | Time | Per run | Notes | +|----------------|---------|---------|------------------------------------------| +| gbrain-after | 7.4s | ~1.5s | PGLite + extract (graph) + grep fallback | +| hybrid-nograph | 555.1s | ~111s | Re-embeds 240 pages every run | +| ripgrep-bm25 | 0.1s | ~20ms | Pure in-memory term matching | +| vector-only | 131.8s | ~26s | Embeds once, cosine per query | + +## What this confirms + +The graph layer is doing the work. + +`hybrid-nograph` is gbrain's own hybrid retrieval stack with the graph disabled — +same embedder, same chunking, same RRF, same codebase. It lands at 17.8% P@5, +barely a point above classic BM25. Add typed-edge traversal back in and P@5 +jumps to 49.1%. That's **+31.4 points from the graph alone**, holding everything +else constant. + +Vector-only is the worst on these relational queries. Cosine similarity over +bio prose doesn't know that "Carol Wilson" appearing in a paragraph about +Anchor means she's employed there — it ranks by semantic neighborhood, which +puts other engineering people at other startups ahead of actual coworkers. +40.7% R@5 is the floor. + +## Reproducibility + +```sh +# From a clean checkout at commit b81373d +export OPENAI_API_KEY=sk-proj-... # embedding-based adapters need this +bun install +bun run eval:run +``` + +Deterministic adapters (`gbrain-after`, `ripgrep-bm25`, `vector-only`) match +this scorecard byte-for-byte. `hybrid-nograph` matches within tolerance bands +(N=5 smooths embedding nondeterminism). + +For faster iteration: `BRAINBENCH_N=1 bun run eval:run:dev` (one run per adapter, +~2 min total). + +## Methodology + +- **Corpus:** 240 Opus-generated fictional biographical pages — 80 people, 80 + companies, 50 meetings, 30 concepts. Committed at + `eval/data/world-v1/`, zero private data, no regen needed. +- **Gold:** 145 relational queries derived from each page's `_facts` metadata + — "Who attended X?", "Who works at X?", "Who invested in X?", "Who advises X?" + No `_facts` ever cross the adapter boundary; adapters see raw prose only + (enforced structurally in `Adapter.init`). +- **Metrics:** mean P@5 and R@5. Top-5 is what agents actually read in ranked + results. +- **N=5 runs per adapter**, page ingestion order shuffled with a per-run seed + (`shuffleSeeded`, LCG). Stddev surfaces order-dependent bugs. Zero stddev on + deterministic adapters is the expected-correct signal. +- **Temporal queries** (none in this 145-query set) require explicit + `as_of_date`, validated at query-authoring time. + +## Notes + +- This is a reproduction of the multi-adapter scorecard shipped with the + eval harness at `b81373d`. Numbers match the README table exactly for + `gbrain-after`, `ripgrep-bm25`, `vector-only` (deterministic) and are within + tolerance for `hybrid-nograph` (embedder nondeterminism). +- `bun run eval:run` exits with code 99 at the very end despite printing the + full scorecard cleanly. Tracked separately; the metrics above are all from + the completed run. +- For the BEFORE/AFTER PR #188 evaluation (graph layer vs no graph layer on + an earlier commit), see `2026-04-18-brainbench-v1.md`. This file is the + neutrality scorecard — gbrain compared to external baselines anyone could + reimplement. diff --git a/docs/benchmarks/2026-04-19-brainbench-v0_11-vs-v0_12.md b/docs/benchmarks/2026-04-19-brainbench-v0_11-vs-v0_12.md new file mode 100644 index 00000000..3bcc3c3f --- /dev/null +++ b/docs/benchmarks/2026-04-19-brainbench-v0_11-vs-v0_12.md @@ -0,0 +1,133 @@ +# BrainBench — gbrain v0.11.1 vs v0.12.1 (2026-04-19) + +Historical regression comparison. Same harness, same corpus, same 145 queries +— only the gbrain `src/` tree varies. Answers the question "did the v0.12 work +make retrieval better, or are the external-adapter numbers the whole story?" + +**Short answer:** v0.12.1 moves gbrain-after from **P@5 22.1% → 49.1%** and +**R@5 54.6% → 97.9%** on identical inputs. The v0.12 extract upgrades alone +explain most of the multi-adapter gap. + +## Setup + +| Slot | SHA | Dated | Version label | +|-----------------|-----------|--------------|----------------| +| BEFORE | `d861336` | 2026-04-18 | v0.11.1 (Minions + canonical migration) | +| AFTER (HEAD) | `b81373d` | 2026-04-19 | v0.12.1 base + eval harness Phase 3 | + +Method: +1. `git worktree add ../gbrain-eval-v0.11 d861336` — old `src/` tree in isolation +2. Copy current `eval/` (harness, corpus, queries) into the worktree so both + runs score the identical benchmark +3. Patch the worktree's `gbrain-after` adapter to call `getLinks`/`getBacklinks` + (v0.11 graph API) with the same linkType filter + direction semantics as + `traversePaths` (v0.12). Same ranking logic, different underlying primitives. +4. Run `bun eval/runner/multi-adapter.ts --adapter=gbrain-after` at N=5 on both. + +The external baselines (`ripgrep-bm25`, `vector-only`) share no code with +gbrain's `src/`, so their numbers are invariant across the two SHAs. Included +below for context only. + +## Headline + +| Adapter (config) | BEFORE v0.11.1 | AFTER v0.12.1 | Δ | +|-------------------------|----------------|----------------|---------------| +| **gbrain-after — P@5** | 22.1% | **49.1%** | **+27.0 pts** | +| **gbrain-after — R@5** | 54.6% | **97.9%** | **+43.3 pts** | +| Correct in top-5 (run 1)| 99 / 261 | **248 / 261** | **+149** | +| hybrid-nograph — P@5 | 17.8% | 17.8% | — | +| hybrid-nograph — R@5 | 65.1% | 65.1% | — | + +Stddev = 0 on both versions — both adapter codepaths are deterministic over +ingestion order. The entire movement is on `gbrain-after`; `hybrid-nograph` +holds flat because v0.12 didn't change `hybridSearch`, chunking, or embedding. + +## Where the gain came from + +`runExtract` is the hinge. Same 240 raw pages in, very different graph out: + +| What got extracted | v0.11.1 | v0.12.1 | Δ | +|--------------------------|-------------|-------------|------------| +| Pages with extractable links | 124 / 240 | 240 / 240 | +116 pages | +| Typed links created | 136 | 499 | **×3.7** | +| Timeline entries created | 27 | 2,208 | **×82** | + +Three shipped fixes account for the jump (all on master between the two SHAs): +1. **`inferLinkType` rewrite** (PR #188 five-part patch) — `invested_in`, + `works_at`, `founded`, `advises` regexes extended to the narrative verbs + Opus-generated prose actually uses ("led the Series A", "early investor", + "the founder", "joined as partner"). Context window 80 → 240 chars. +2. **Auto-link on `put_page`** (v0.12.0) — typed edges get extracted on every + write instead of only when the user runs `extract` manually. +3. **Timeline extraction in `extract --source db`** (v0.12.0) — walks the + whole brain, pulls dated lines into structured entries. v0.11 only did + this on filesystem sync, so DB-only ingestion paths (like this benchmark) + saw almost no timeline data. + +## What this means for the multi-adapter scorecard + +The April-18 multi-adapter scorecard shows gbrain-after beating +hybrid-nograph by 31 points P@5. This comparison explains the shape of that +gap: on v0.11.1 the same architecture only beats hybrid-nograph by **4.3 +points P@5** (22.1% vs 17.8%). The 27-point extra lift came from v0.12's +extract quality, not the graph layer being present vs absent. + +That's a useful refinement of the "the graph layer does the work" claim from +the April-18 benchmark. Sharper version: + +> **The typed-edge graph + high-quality extraction together do the work.** +> Either piece alone only moves the needle a few points. Both pieces in +> combination account for the +31 P@5 gap. + +## Reproducibility + +```sh +# From providence/, current HEAD (b81373d) +bun run eval:run --adapter=gbrain-after +# gbrain-after N=5: P@5 49.1% ±0, R@5 97.9% ±0 + +# Historical side +git worktree add -f ../gbrain-eval-v0.11 d861336 +cp -r eval ../gbrain-eval-v0.11/ +ln -s $PWD/node_modules ../gbrain-eval-v0.11/node_modules +# Patch: gbrain-after uses getLinks/getBacklinks instead of traversePaths +# (v0.11 doesn't have traversePaths). Same direction + linkType filter +# semantics, different primitive. See the perl one-liner in the +# session commit message for the exact diff. +cd ../gbrain-eval-v0.11 +bun eval/runner/multi-adapter.ts --adapter=gbrain-after +# gbrain-after N=5: P@5 22.1% ±0, R@5 54.6% ±0 +``` + +## Methodology notes + +- The v0.11 shim swaps `traversePaths(seed, {depth:1, direction, linkType})` + for `getLinks(seed)` / `getBacklinks(seed)` filtered in-memory by + `link_type`. At depth=1 this is semantically identical; it would diverge if + the query asked for depth>=2 (none here do). So the reported delta is + attributable to gbrain's extraction + storage, not to differences in how + the adapter interprets the graph at query time. +- External baselines would be identical on both SHAs by construction. + Re-running them adds no signal. If we later add a baseline that shares + gbrain code (a hybrid variant, say), we'd need to re-run it on both sides. +- `pages with extractable links` is the count `extract --source db` logs + after walking the brain. On v0.11 the filtering was narrower, so only + 124/240 pages contributed any typed edge. On v0.12 every page contributes + at least one. +- The exit-99 fix on the multi-adapter runner (teardown of PGLite engines) + was applied to both sides before running, so neither run spuriously + returns a failing status to CI. + +## What this does not test + +- **Other retrieval-adjacent v0.12 work** (sync quality, publish, lint, + integrations). BrainBench is scoped to retrieval. Per-feature tests still + live in `test/`. +- **Real prose beyond the 240-page fictional corpus.** The extract regex + wins on Opus-generated biographical prose. Real brain pages have their + own vocabulary quirks — future work is corpus diversity (tracked in + `eval/README.md` three-contributor-paths section). +- **Wall-clock or token cost.** v0.12's extract is slightly slower + (auto-link on every put_page, 2,208 timeline entries vs 27), but we + haven't benchmarked the difference. If that ever matters for autopilot, + it needs a separate pass. diff --git a/eval/CONTRIBUTING.md b/eval/CONTRIBUTING.md new file mode 100644 index 00000000..43439c30 --- /dev/null +++ b/eval/CONTRIBUTING.md @@ -0,0 +1,150 @@ +# Contributing to BrainBench + +Three contribution paths. Each has a separate workflow. + +## 1. Write Tier 5.5 externally-authored queries + +Tier 5.5 exists to neutralize the "gbrain wrote its own exam" critique. The +queries currently in the repo are AI-authored synthetic placeholders; real +outside researcher submissions supersede them. + +### Workflow + +```sh +# Step 1. Understand the canonical world. +bun run eval:world:view +# Browser opens. Click through entities. Note down what's real. + +# Step 2. Scaffold a query. +bun run eval:query:new --tier externally-authored --author "@your-handle" +# Prints a Query template. Save to a file. + +# Step 3. Edit the template. +# - Replace text with your actual question +# - Replace gold.relevant with slug(s) that actually exist +# - If the query has temporal verbs (is/was/were/now/...), set as_of_date +# to "corpus-end", "per-source", or ISO-8601 +# - Fill in tags + +# Step 4. Validate before submitting. +bun run eval:query:validate path/to/your-queries.json + +# Step 5. Submit a PR. +# File location: eval/external-authors//queries.json +# PR template: .github/PULL_REQUEST_TEMPLATE/tier5-queries.md +``` + +### Query-authoring guidelines + +- **Write like you'd naturally ask.** Don't adapt your voice to an "AI + benchmark style." Fragments, typos, comparisons, follow-ups, imperatives + — all welcome. Variety is the value. +- **Gold must be real slugs.** Every slug in `gold.relevant` must exist in + `eval/data/world-v1/`. The validator checks format; you verify existence. +- **Abstention is a valid answer.** If your query has no answer in the + corpus (e.g. you're asking about someone who isn't there), set + `expected_output_type: 'abstention'` and `gold.expected_abstention: true`. +- **Temporal queries need `as_of_date`.** The validator will reject + "Where is Sarah now?" without it. Use `"corpus-end"` for "as of the most + recent data," `"per-source"` for "whatever the cited source says," or a + specific ISO date. +- **Partial answers are OK** if you flag them via `known_failure_modes`. + +### Query quality bar + +We'll merge your PR if: +- `bun run eval:query:validate` passes +- Slugs resolve to real entities +- At least 20 queries (one batch) +- Queries have genuine phrasing variety + +## 2. Submit an external adapter + +The `Adapter` interface is `eval/runner/types.ts`. Three methods: + +```typescript +interface Adapter { + readonly name: string; + init(rawPages: Page[], config: AdapterConfig): Promise; + query(q: Query, state: BrainState): Promise; + snapshot?(state: BrainState): Promise; +} +``` + +### Workflow + +```sh +# Step 1. Create your adapter file. +# eval/runner/adapters/my-adapter.ts + +# Step 2. Write it. +# - import types from '../types.ts' +# - export class MyAdapter implements Adapter { ... } +# - BrainState is opaque to the runner. Internal shape is yours. +# - `rawPages: Page[]` is all you get. Never read from gold/ — the +# runner doesn't give you that path on purpose. + +# Step 3. Write a unit test. +# eval/runner/adapters/my-adapter.test.ts +# Cover at minimum: init, query, deterministic tie-break. + +# Step 4. Wire into multi-adapter.ts. +# import { MyAdapter } from './adapters/my-adapter.ts'; +# const allAdapters: Adapter[] = [ +# ...existing, +# new MyAdapter(), +# ]; + +# Step 5. Test locally. +bun run test:eval +bun run eval:run:dev --adapter=my-adapter + +# Step 6. Open a PR. +``` + +### Adapter quality bar + +- Deterministic over sorted input (stddev=0 across N=5 runs is the + expected default; non-zero is a signal worth understanding) +- `query()` returns rank order — `rank: i + 1`, 1-based, no duplicates +- Tie-breaks documented (e.g. "alphabetical by slug when scores tie") +- No network calls in unit tests (mock any API dependencies) +- Pass `bun run test:eval` + +## 3. Reproduce / verify a published scorecard + +```sh +# Step 1. Check the scorecard's commit hash. +# Reports in docs/benchmarks/ include the gbrain version + commit. + +# Step 2. Pin the same commit. +git checkout + +# Step 3. Run the full benchmark. +bun run eval:run + +# Step 4. Compare to the published scorecard. +# For deterministic adapters, numbers should match exactly. +# For embedding-based adapters, numbers should land within the published +# tolerance bands (mean ± stddev). +``` + +If your numbers drift outside tolerance, file an issue with: +- Your `bun --version` +- Your `uname -sr` +- Your OpenAI model ID (for embedding-model drift) +- A diff of the scorecard + +## Code style + +- Match existing gbrain patterns (hand-rolled where appropriate, no new + deps unless genuinely needed) +- Bun's built-in test runner (`bun:test`), not jest/vitest +- No em dashes in prose (`—`, `–`); use parentheses or sentences +- Commit messages: `feat(eval):`, `fix(eval):`, `docs(eval):`, `test(eval):` + +## Contributors + +See `eval/CREDITS.md` for the full list. All Tier 5.5 external-author +submissions credited there + in the scorecard. Synthetic placeholders are +labeled `synthetic-outsider-v1`. diff --git a/eval/CREDITS.md b/eval/CREDITS.md new file mode 100644 index 00000000..b2832435 --- /dev/null +++ b/eval/CREDITS.md @@ -0,0 +1,47 @@ +# BrainBench credits + +## Core team + +- **garrytan** — BrainBench v1 + v1.1 architecture, adapter interface, + extraction regex residuals (v0.10.5), multi-axis type-accuracy runner +- **Claude Opus 4.7** — pair programming, test coverage, documentation + +## External query authors (Tier 5.5) + +No human external authors yet. The Tier 5.5 query set currently comprises +50 synthetic queries labeled `author: "synthetic-outsider-v1"` as a +placeholder. Real submissions via `eval/external-authors//queries.json` +PRs supersede synthetic entries. + +**Want to be credited here?** See `eval/CONTRIBUTING.md`. + +## External adapters + +No third-party adapters yet. The shipping adapter set: + +- `gbrain-after` — gbrain v0.10.3+ (internal; the system under test) +- `hybrid-nograph` — gbrain hybrid search with graph layer disabled + (internal comparator; closest apples-to-apples to `gbrain-after`) +- `ripgrep-bm25` — classic IR baseline built in an afternoon +- `vector-only` — commodity vector RAG, same embedder as gbrain + +Third-party submissions (mem0, supermemory, Letta, Cognee, etc.) via +`eval/runner/adapters/.ts` PRs. See `eval/CONTRIBUTING.md` for +the adapter interface and submission flow. + +## Data + +- Corpus generator: Claude Opus +- Canonical world: `eval/data/world-v1/` (committed, 240 entities) +- Generation cost: ~$3.14 USD (one-time) + +## Inspiration + +- **SWE-bench** — taught us that a benchmark's credibility comes from real + baselines, not from the authoring team saying nice things about their + own stack +- **Codex** — cold-read critique that "this isn't a standard, it's an + internal test" drove the Phase 2 external-baselines work that became + the headline of this PR +- **MTEB** — embedding-model reproducibility card pattern; we copy the + "pin every version in every scorecard" discipline diff --git a/eval/README.md b/eval/README.md new file mode 100644 index 00000000..6113e759 --- /dev/null +++ b/eval/README.md @@ -0,0 +1,126 @@ +# BrainBench + +Public benchmark for personal knowledge brain agent stacks. Ships 4 adapter +configurations scored side-by-side on a 240-page rich-prose fictional corpus +(`twin-amara`). Measures retrieval, extraction quality, and per-link-type +accuracy. + +**What this answers:** "Does the knowledge graph layer do useful work, or is +gbrain just a thin wrapper over vector+keyword hybrid?" Headline: gbrain +beats the closest external baseline (hybrid-without-graph, same embedder, +same chunking) by **+31 points P@5**. The graph layer is load-bearing. + +## 5-minute quickstart + +```sh +# 1. Run the full benchmark (4 adapters × 5 runs, ~15 min wall clock) +bun run eval:run + +# 2. Fast iteration (N=1 single run) +bun run eval:run:dev + +# 3. Just the type-accuracy report +bun run eval:type-accuracy + +# 4. Explore the canonical world (contributor-facing UI) +bun run eval:world:view +``` + +## What's in the box + +``` +eval/ +├── data/world-v1/ Canonical world (committed). 240 sharded JSON files. +│ One file per entity + _ledger.json metadata. +├── generators/ +│ ├── gen.ts Opus-backed corpus generator (run once; output cached) +│ ├── world.ts World-schema scaffolder +│ └── world-html.ts World explorer HTML renderer (XSS-safe) +├── runner/ +│ ├── multi-adapter.ts 4-adapter side-by-side scorer (N=5) +│ ├── type-accuracy.ts Per-link-type accuracy vs gold from _facts +│ ├── before-after.ts Original v1 BEFORE/AFTER retrieval run +│ ├── types.ts Adapter, Page, Query, RankedDoc interfaces +│ ├── adapters/ +│ │ ├── ripgrep-bm25.ts EXT-1: classic IR baseline (BM25 over grep hits) +│ │ ├── vector-only.ts EXT-2: pure cosine similarity, same embedder +│ │ └── hybrid-nograph.ts EXT-3: gbrain hybrid with graph disabled +│ └── queries/ +│ ├── tier5-fuzzy.ts 30 vague-recall queries (hand-authored) +│ ├── tier5_5-synthetic.ts 50 synthetic outsider queries (AI-authored, labeled) +│ ├── validator.ts Query schema enforcement (temporal as_of_date rule) +│ └── index.ts Aggregator + validateAll() +├── cli/ +│ ├── world-view.ts Render + open world.html +│ ├── query-validate.ts Validate a Query[] file +│ └── query-new.ts Scaffold a Query template +└── reports/ Benchmark scorecards (gitignored) +``` + +## Three contributor paths + +### Path 1: Reproduce a published scorecard + +```sh +# 1. Check out the specific gbrain commit referenced in the scorecard +git checkout +# 2. Run the full benchmark +bun run eval:run +# 3. Compare your numbers to the scorecard. Deterministic adapters should +# match exactly. Embedding-based adapters should land within tolerance bands. +``` + +### Path 2: Submit a new external adapter + +See `CONTRIBUTING.md` for the adapter submission flow. Short version: + +1. Implement `eval/runner/adapters/.ts` conforming to the + `Adapter` interface in `eval/runner/types.ts`. +2. Add a unit test file alongside. +3. Wire your adapter into `eval/runner/multi-adapter.ts` (one line). +4. `bun run eval:run:dev` to verify. +5. Open a PR. + +### Path 3: Write Tier 5.5 externally-authored queries + +The T5.5 queries currently in the repo are AI-authored (`author: +"synthetic-outsider-v1"`) as a placeholder. Real outside researchers should: + +1. `bun run eval:world:view` to understand the canonical world +2. `bun run eval:query:new --tier externally-authored --author "@your-handle"` +3. Edit the scaffolded template with a real query + gold slugs +4. `bun run eval:query:validate path/to/your.json` +5. Submit via `eval/external-authors//queries.json` in a PR + +See `CONTRIBUTING.md` for the query-submission template. + +## Methodology one-pager + +- **Corpus:** 240 Opus-generated fictional biographical pages. Fixed, + committed, zero private data. Reproducibility baseline for any run. +- **Gold:** Each page's `_facts` metadata defines canonical relationships. + The scorer never shows `_facts` to the adapters — **raw pages only** + cross the ingestion boundary (structural enforcement in `Adapter.init`). +- **Metrics:** P@5 and R@5 on relational queries (145 canonical from + `_facts`, 80 tier-5 + tier-5.5). Type accuracy on extracted edges + (`eval/runner/type-accuracy.ts`). +- **N=5 runs per adapter** with page-order shuffle (seeded LCG; runs are + reproducible). Stddev surfaces order-dependent adapter bugs. Deterministic + adapters correctly show stddev=0. +- **Temporal queries** require explicit `as_of_date` (validated at query + authoring time; rejected at load if a temporal verb is present without it). + +## Adapter scorecard (most recent, N=5) + +See `docs/benchmarks/2026-04-18-brainbench-v1.md` for the full report. +Quick summary from `bun run eval:run`: + +| Adapter | P@5 | R@5 | +|-----------------|--------|--------| +| gbrain-after | 49.1% | 97.9% | +| hybrid-nograph | 17.8% | 65.1% | +| ripgrep-bm25 | 17.1% | 62.4% | +| vector-only | 10.8% | 40.7% | + +The graph layer beats vector+keyword hybrid on relational queries by ~31 +points; hybrid-without-graph barely edges BM25. That's the story. diff --git a/eval/RUNBOOK.md b/eval/RUNBOOK.md new file mode 100644 index 00000000..6216d2da --- /dev/null +++ b/eval/RUNBOOK.md @@ -0,0 +1,161 @@ +# BrainBench runbook + +Operational troubleshooting for the most common failures. One fix per entry. + +## Generation failures + +### "OPENAI_API_KEY environment variable is missing" + +The embedding adapter (`vector-only`) and any run of `eval/generators/gen.ts` +calls the OpenAI API. You need an API key. + +```sh +export OPENAI_API_KEY=sk-proj-... +# or source from a dotenv file +source ~/.zshrc # if the key is in your shell profile +bun run eval:run +``` + +### "ANTHROPIC_API_KEY environment variable is missing" + +Only needed if you regenerate the corpus (`eval/generators/gen.ts`). If +you're using the committed `eval/data/world-v1/` shards, you don't need it. + +### `bun install` fails with "Cannot find package 'openai'" + +The `openai` package is in `package.json` dependencies. Run `bun install` +to fetch it. This shouldn't happen post-clone if you followed the normal +setup; see CLAUDE.md troubleshooting. + +## Runner failures + +### `multi-adapter.ts` times out on hybrid-nograph + +hybrid-nograph embeds all 240 pages per run (via `importFromContent`). At +N=5, that's 5 re-embeddings. Typical wall clock: ~10 minutes. + +If you're iterating, use the dev mode: +```sh +BRAINBENCH_N=1 bun run eval:run:dev +``` + +Or skip embedding-based adapters for focused runs: +```sh +bun run eval:run -- --adapter=gbrain-after +bun run eval:run -- --adapter=ripgrep-bm25 +``` + +### "hybrid-nograph returned P@5 0.0%" + +Likely the adapter is calling `hybridSearch()` on an engine that doesn't +have chunks/embeddings populated. This shouldn't happen with current code +— `importFromContent` populates them. If it does happen: + +1. Check the adapter uses `importFromContent(engine, slug, content)`, + not bare `engine.putPage(...)`. The latter skips chunking. +2. Check `auto_link` is OFF (the adapter sets it, but if someone edits + the engine's default, verify). + +### "ripgrep-bm25 crashes on a query" + +The adapter has no query-size ceiling by design. If a specific query crashes, +run it in isolation: + +```sh +# Drop other adapters temporarily and bisect the query list. +bun run eval:run -- --adapter=ripgrep-bm25 +``` + +## Query validation failures + +### `validateAll()` fails with "temporal verb detected; as_of_date required" + +The query text matches the temporal verb regex. Pick one: + +1. **The query is actually temporal.** Add `as_of_date: 'corpus-end' | + 'per-source' | '2024-01-15'` (ISO-8601). +2. **The query isn't really temporal.** Rephrase to avoid the trigger verb. + "Where is Sarah working?" → "Sarah's current employer" (adjective-form + doesn't trigger). +3. **Edge case bug in the regex.** File an issue; the regex lives at + `eval/runner/queries/validator.ts:TEMPORAL_VERBS`. + +### `validateAll()` fails with "slug does not match 'dir/slug' format" + +Gold slugs must be `dir/slug` — e.g. `people/alice-chen`, not just +`alice-chen` or `people/Alice Chen`. Lowercase, hyphens, no spaces. + +### `validateAll()` fails with "duplicate id in batch" + +Two queries share an `id`. Renumber. Convention: +- Tier 5 (fuzzy): `q5-NNNN` +- Tier 5.5 (externally-authored): `q55-NNNN` +- Scaffolder default: `q-` (via `eval:query:new`) + +## World.html rendering + +### "world.html doesn't open automatically" + +`eval:world:view` tries `open` (macOS), `xdg-open` (Linux), `start` +(Windows). If none work: + +```sh +bun run eval:world:render # generate only +# then open manually in your browser +open eval/data/world-v1/world.html # or xdg-open, start, etc. +``` + +### "world.html looks weird / broken" + +Regenerate from scratch — shard files might have drifted since last render: + +```sh +rm eval/data/world-v1/world.html +bun run eval:world:view +``` + +### "I see unescaped HTML in world.html" + +That's a security regression. Open an issue IMMEDIATELY with the specific +entity slug. Every string should route through `escapeHtml()` in +`eval/generators/world-html.ts`. + +## Dataset regeneration (advanced) + +Don't regenerate unless you know why. The committed corpus is the stable +baseline everyone benchmarks against. Regenerating produces a DIFFERENT +dataset (Opus isn't byte-deterministic), which becomes a new version. + +If you need to regenerate (e.g. for a v1.2 dataset): + +```sh +# Clean slate +rm -rf eval/data/world-v1 +# Regenerate (~$3 Opus cost, 30 min) +bun eval/generators/gen.ts --max 240 --concurrency 6 +# Validate +bun run eval:type-accuracy +``` + +The new dataset should be committed as `eval/data/world-vX.Y/` with a +new ledger. Don't overwrite `world-v1/` — that's the reproducibility baseline. + +## CI failures + +### `bun run test:eval` fails on a fresh checkout + +```sh +bun install # fetch openai (+ deps) +bun run test:eval # retry +``` + +If tests still fail, bisect: + +```sh +bun test eval/runner/queries/validator.test.ts # pure functions +bun test eval/runner/adapters/ripgrep-bm25.test.ts # pure functions +bun test eval/runner/adapters/vector-only.test.ts # pure functions (cosine math only) +bun test eval/generators/world-html.test.ts # HTML rendering + XSS +``` + +One of these should fail deterministically — report it. diff --git a/eval/cli/query-new.ts b/eval/cli/query-new.ts new file mode 100644 index 00000000..273d0869 --- /dev/null +++ b/eval/cli/query-new.ts @@ -0,0 +1,85 @@ +#!/usr/bin/env bun +/** + * eval:query:new — scaffold a new Tier 5.5 query from template. + * + * Prints a well-formed Query JSON block that passes `eval:query:validate`. + * Contributors copy-paste into their own query file (or a PR against + * eval/external-authors//queries.json). + * + * Usage: + * bun run eval:query:new # default tier 5.5 + * bun run eval:query:new --tier fuzzy + * bun run eval:query:new --tier externally-authored --author "@alice" + * bun run eval:query:new --id q-custom-0042 + */ + +import type { Query, Tier } from '../runner/types.ts'; + +function printHelp() { + console.log(`eval:query:new — scaffold a Query template + +USAGE + bun run eval:query:new scaffold default (tier 5.5) + bun run eval:query:new --tier easy specify tier + bun run eval:query:new --id q-0001 specify id + bun run eval:query:new --author "@alice-researcher" external author + +OPTIONS + --tier easy | medium | hard | adversarial | fuzzy | externally-authored + --id Query ID (default: q-) + --author Author handle (required for tier=externally-authored) + +OUTPUT + Prints a JSON object that passes eval:query:validate. Copy into + your query file, fill in gold.relevant (or expected_abstention for + abstention queries), and iterate. +`); +} + +function getArg(name: string, fallback?: string): string | undefined { + const prefix = `--${name}=`; + for (const a of process.argv.slice(2)) { + if (a.startsWith(prefix)) return a.slice(prefix.length); + if (a === `--${name}`) { + const next = process.argv[process.argv.indexOf(a) + 1]; + if (next && !next.startsWith('--')) return next; + } + } + return fallback; +} + +function main() { + if (process.argv.includes('--help') || process.argv.includes('-h')) { + printHelp(); + return; + } + + const tier = (getArg('tier') ?? 'externally-authored') as Tier; + const id = getArg('id') ?? `q-${Date.now().toString().slice(-6)}`; + const author = getArg('author'); + + const template: Query = { + id, + tier, + text: 'REPLACE with your query text (a question or search fragment)', + expected_output_type: 'cited-source-pages', + gold: { + relevant: ['REPLACE/with-real-slug', 'REPLACE/with-another-slug-if-needed'], + }, + tags: ['REPLACE-with-tier-or-theme-tags'], + }; + + if (tier === 'externally-authored') { + template.author = author ?? 'REPLACE-with-your-handle'; + } + + // If the query text contains a temporal verb, the validator will require + // as_of_date. Leave a helpful placeholder. + template.as_of_date = 'REPLACE if temporal ("corpus-end" | "per-source" | YYYY-MM-DD); else delete this field'; + + console.log(JSON.stringify(template, null, 2)); + console.log(`\n// Next: save as a JSON file, run 'bun run eval:query:validate '`); + console.log(`// For Tier 5.5: submit a PR to eval/external-authors/${author ?? ''}/queries.json`); +} + +main(); diff --git a/eval/cli/query-validate.ts b/eval/cli/query-validate.ts new file mode 100644 index 00000000..0e7b1ad1 --- /dev/null +++ b/eval/cli/query-validate.ts @@ -0,0 +1,79 @@ +#!/usr/bin/env bun +/** + * eval:query:validate — validate a query file (or the built-in Tier 5/5.5 set). + * + * Usage: + * bun run eval:query:validate # validate all built-in T5+T5.5 + * bun run eval:query:validate path/to/file.ts # validate a file that exports Query[] + * bun run eval:query:validate --help + * + * Exit code 0 if all queries pass, 1 otherwise. Suitable for CI. + */ + +import { readFileSync } from 'fs'; +import { validateAll, validateQuerySet, formatIssues } from '../runner/queries/index.ts'; +import type { Query } from '../runner/types.ts'; + +function printHelp() { + console.log(`eval:query:validate — validate a Query set + +USAGE + bun run eval:query:validate validate all built-in T5 + T5.5 queries + bun run eval:query:validate validate a JSON file containing Query[] + +VALIDATOR CHECKS + - id, text, tier, expected_output_type present + - Temporal verbs (is/was/were/current/now/at the time/during/as of/when did) + require as_of_date ("corpus-end" | "per-source" | ISO-8601) + - cited-source-pages requires non-empty gold.relevant with valid slug format + - abstention requires gold.expected_abstention === true + - externally-authored (Tier 5.5) requires author field + - Duplicate IDs caught at batch level + +EXIT CODES + 0 all queries valid + 1 one or more queries failed validation +`); +} + +async function main() { + const args = process.argv.slice(2); + if (args.includes('--help') || args.includes('-h')) { + printHelp(); + return; + } + + const filePath = args[0]; + + if (!filePath) { + // Validate built-in T5 + T5.5 sets + const result = validateAll(); + console.log(result.report); + process.exit(result.ok ? 0 : 1); + } + + // Validate a file — supports JSON with { queries: Query[] } or Query[] + let queries: Query[] = []; + try { + const raw = readFileSync(filePath, 'utf-8'); + const parsed = JSON.parse(raw); + queries = Array.isArray(parsed) ? parsed : (parsed.queries ?? []); + } catch (e) { + console.error(`Error reading ${filePath}: ${(e as Error).message}`); + process.exit(1); + } + + if (queries.length === 0) { + console.error(`No queries found in ${filePath}. Expected JSON Query[] or { queries: Query[] }.`); + process.exit(1); + } + + const result = validateQuerySet(queries); + console.log(formatIssues(result)); + process.exit(result.ok ? 0 : 1); +} + +main().catch(e => { + console.error(e); + process.exit(1); +}); diff --git a/eval/cli/world-view.ts b/eval/cli/world-view.ts new file mode 100644 index 00000000..3db3e9f4 --- /dev/null +++ b/eval/cli/world-view.ts @@ -0,0 +1,64 @@ +#!/usr/bin/env bun +/** + * eval:world:view — (re)generate world.html and open it in the default browser. + * + * Combines two steps the contributor shouldn't have to think about: + * 1. Render eval/data/world-v1/world.html from the shard JSONs. + * 2. Open it in the default browser via `open` (macOS) / `xdg-open` (Linux). + * + * If the HTML already exists and shards haven't changed since, we'd ideally + * skip step 1 — but comparing timestamps is fragile and regeneration is fast + * (~50ms on 240 entities). Just regenerate every time. + * + * Usage: + * bun run eval:world:view (generates + opens) + * bun run eval:world:view --no-open (generates only; useful in CI) + */ + +import { execSync } from 'child_process'; +import { platform } from 'os'; +import { renderWorldHtmlToFile } from '../generators/world-html.ts'; + +function openInBrowser(path: string): void { + const cmd = platform() === 'darwin' ? 'open' : platform() === 'win32' ? 'start' : 'xdg-open'; + try { + execSync(`${cmd} "${path}"`, { stdio: 'ignore' }); + } catch (e) { + // Don't fail hard — the file is rendered, that's the main thing. + console.error(`Could not open browser automatically. Open manually: ${path}`); + console.error(`(${(e as Error).message})`); + } +} + +async function main() { + const args = process.argv.slice(2); + if (args.includes('--help') || args.includes('-h')) { + console.log(`eval:world:view — render + open world.html + +USAGE + bun run eval:world:view render eval/data/world-v1/world.html and open it + bun run eval:world:view --no-open render only (CI-friendly) + bun run eval:world:view --dir=PATH render from a different shard directory + +OUTPUT + eval/data/world-v1/world.html (gitignored; regenerate with this command). +`); + return; + } + + const dir = args.find(a => a.startsWith('--dir='))?.slice('--dir='.length) ?? + 'eval/data/world-v1'; + const noOpen = args.includes('--no-open'); + + const target = renderWorldHtmlToFile(dir); + console.log(`Rendered ${target}`); + + if (!noOpen) { + openInBrowser(target); + } +} + +main().catch(e => { + console.error(e); + process.exit(1); +}); diff --git a/eval/generators/world-html.test.ts b/eval/generators/world-html.test.ts new file mode 100644 index 00000000..fd9eba24 --- /dev/null +++ b/eval/generators/world-html.test.ts @@ -0,0 +1,134 @@ +import { describe, test, expect } from 'bun:test'; +import { escapeHtml, renderWorldHtml } from './world-html.ts'; + +describe('escapeHtml — XSS safety', () => { + test('escapes the 5 critical chars', () => { + expect(escapeHtml('')) + .toBe('<script>alert(1)</script>'); + }); + + test('escapes ampersand FIRST (or double-escape bug happens)', () => { + expect(escapeHtml('<')) + .toBe('&lt;'); // the input < becomes &lt; + }); + + test('escapes quotes (attribute context)', () => { + expect(escapeHtml('onclick="alert(1)"')) + .toBe('onclick="alert(1)"'); + expect(escapeHtml("onclick='alert(1)'")) + .toBe('onclick='alert(1)''); + }); + + test('passthrough for safe ASCII', () => { + expect(escapeHtml('Hello world.')).toBe('Hello world.'); + }); + + test('handles null and undefined', () => { + expect(escapeHtml(null)).toBe(''); + expect(escapeHtml(undefined)).toBe(''); + }); + + test('handles numbers', () => { + expect(escapeHtml(42)).toBe('42'); + }); + + test('preserves Unicode (non-ASCII that\'s not special HTML)', () => { + expect(escapeHtml('Héctor García 🎉')).toBe('Héctor García 🎉'); + }); + + test('real Opus prose injection attempt neutralized', () => { + // Representative: if Opus generates this in an entity backstory, + // the explorer HTML must NOT execute it. The XSS protection works + // because ``; + const safe = escapeHtml(attack); + // The opening `<` must be escaped (this is what neutralizes the tag). + expect(safe).not.toContain(' { + // Even as text content, escape quotes so it can't break attribute context. + const attack = `"javascript:alert(1)//`; + expect(escapeHtml(attack)).toBe('"javascript:alert(1)//'); + }); +}); + +describe('renderWorldHtml', () => { + const samplePage = { + slug: 'people/alice-chen', + type: 'person' as const, + title: 'Alice Chen', + compiled_truth: 'Alice Chen is a senior engineer.', + timeline: '- **2023-01-15** | Promoted to staff engineer', + _facts: { type: 'person', role: 'engineer' }, + }; + + test('renders an HTML document', () => { + const html = renderWorldHtml([samplePage]); + expect(html).toContain(''); + expect(html).toContain(' { + const html = renderWorldHtml([samplePage]); + // Rail link uses #slug anchor + expect(html).toContain('href="#people/alice-chen"'); + // Entity card has matching id + expect(html).toContain('id="people/alice-chen"'); + }); + + test('escapes all user content (XSS neutralization)', () => { + const maliciousPage = { + ...samplePage, + title: '', + compiled_truth: ``, + }; + const html = renderWorldHtml([maliciousPage]); + // Raw + + +`; +} + +// ─── CLI entrypoint ───────────────────────────────────────────── + +export function renderWorldHtmlToFile(dir: string, outPath?: string): string { + const { pages, ledger } = loadCorpus(dir); + const html = renderWorldHtml(pages, ledger); + const target = outPath ?? join(dir, 'world.html'); + writeFileSync(target, html, 'utf-8'); + return target; +} + +// Run directly (e.g. `bun eval/generators/world-html.ts`) +if (import.meta.main) { + const dir = process.argv.find(a => a.startsWith('--dir='))?.slice('--dir='.length) ?? + 'eval/data/world-v1'; + const out = process.argv.find(a => a.startsWith('--out='))?.slice('--out='.length); + const target = renderWorldHtmlToFile(dir, out); + console.log(`Wrote ${target}`); +} diff --git a/eval/runner/adapters/hybrid-nograph.ts b/eval/runner/adapters/hybrid-nograph.ts new file mode 100644 index 00000000..be5cde0e --- /dev/null +++ b/eval/runner/adapters/hybrid-nograph.ts @@ -0,0 +1,139 @@ +/** + * BrainBench EXT-3: Hybrid-without-graph adapter. + * + * gbrain's full hybrid search (vector + keyword + RRF fusion + dedup) but + * with the knowledge-graph layer explicitly disabled. No auto_link, no + * typed edges, no traverse_graph, no backlink boost. Just: + * - putPage each page + * - chunking + embedding (via existing put_page pipeline) + * - hybridSearch(engine, query) to answer queries + * + * This is the closest-to-gbrain external comparator. If gbrain-after beats + * EXT-3 significantly, the delta MUST come from the graph layer (auto_link + * typed edges + traversePaths + backlink boost), not from better vector + * retrieval or hybrid fusion. + * + * It's also the MOST HONEST baseline — "gbrain without the new knowledge + * graph layer" answers the question "does the graph do useful work?" + * directly. Critics can't dismiss this as "you disabled a feature you knew + * they'd want." Everyone already knows vector+keyword hybrid is strong. + */ + +import type { Adapter, AdapterConfig, BrainState, Page, Query, RankedDoc } from '../types.ts'; +import { PGLiteEngine } from '../../../src/core/pglite-engine.ts'; +import { hybridSearch } from '../../../src/core/search/hybrid.ts'; +import { importFromContent } from '../../../src/core/import-file.ts'; + +// Known-safe config: auto_link OFF at the engine layer via direct setConfig +// call. Does NOT run `extract --source db`, so typed links stay empty even +// if auto_link flipped on during put_page (belt + suspenders). + +interface HybridNoGraphState { + engine: PGLiteEngine; +} + +interface HybridNoGraphConfig extends AdapterConfig { + /** Top-K results requested from hybridSearch. Defaults to 20 so the + * scorer's k=5 slice has headroom. */ + limit?: number; +} + +export class HybridNoGraphAdapter implements Adapter { + readonly name = 'hybrid-nograph'; + + async init(rawPages: Page[], _config: HybridNoGraphConfig): Promise { + const engine = new PGLiteEngine(); + await engine.connect({}); + await engine.initSchema(); + // Belt: turn off auto_link at the engine config level. Suspenders below: + // we also skip extract --source db, so even if auto_link did fire, no + // typed edges would exist in the graph layer. This adapter doesn't call + // traversePaths at all, so graph state is doubly-ignored. + await engine.setConfig('auto_link', 'false'); + + // importFromContent does the chunking + embedding that hybridSearch needs. + // Plain putPage() just writes the page row without any search infra; that's + // fine for graph-based adapters but leaves hybridSearch with nothing to + // rank. Silence its stdout noise during benchmark runs. + const origLog = console.log; + const origErr = console.error; + console.log = () => {}; + console.error = () => {}; + try { + for (const p of rawPages) { + const content = this.buildContentMarkdown(p); + await importFromContent(engine, p.slug, content); + } + } finally { + console.log = origLog; + console.error = origErr; + } + + // INTENTIONALLY do NOT call runExtract — that's what populates typed + // links + timeline for the graph layer. Without it, traversePaths + // would return empty. hybridSearch works entirely off chunks + + // embeddings, which importFromContent just populated. + return { engine } satisfies HybridNoGraphState; + } + + async teardown(state: BrainState): Promise { + const s = state as HybridNoGraphState; + await s.engine.disconnect(); + } + + /** Build a markdown string importFromContent can parse. + * Format: YAML frontmatter then body; matches what gbrain import expects. */ + private buildContentMarkdown(p: Page): string { + const fm: string[] = []; + fm.push(`---`); + fm.push(`type: ${p.type}`); + fm.push(`title: ${JSON.stringify(p.title)}`); + fm.push(`---`); + fm.push(''); + fm.push(`# ${p.title}`); + fm.push(''); + fm.push(p.compiled_truth); + if (p.timeline && p.timeline.trim().length > 0) { + fm.push(''); + fm.push('## Timeline'); + fm.push(''); + fm.push(p.timeline); + } + return fm.join('\n'); + } + + async query(q: Query, state: BrainState): Promise { + const s = state as HybridNoGraphState; + const limit = 20; + + // hybridSearch returns chunks with scores. We aggregate to page-level + // by taking each page's BEST chunk score and ranking pages by that. + const chunkResults = await hybridSearch(s.engine, q.text, { limit: limit * 3 }); + + const pageBest = new Map(); + for (const r of chunkResults) { + const existing = pageBest.get(r.slug); + if (existing === undefined || r.score > existing) { + pageBest.set(r.slug, r.score); + } + } + const pageScored = Array.from(pageBest.entries()) + .map(([slug, score]) => ({ slug, score })) + .sort((a, b) => b.score - a.score || a.slug.localeCompare(b.slug)) + .slice(0, limit); + + return pageScored.map((p, i) => ({ + page_id: p.slug, + score: p.score, + rank: i + 1, + })); + } + + async snapshot(_state: BrainState): Promise { + return ''; + } +} + +export function createHybridNoGraph(): HybridNoGraphAdapter { + return new HybridNoGraphAdapter(); +} diff --git a/eval/runner/adapters/ripgrep-bm25.test.ts b/eval/runner/adapters/ripgrep-bm25.test.ts new file mode 100644 index 00000000..acd1039a --- /dev/null +++ b/eval/runner/adapters/ripgrep-bm25.test.ts @@ -0,0 +1,170 @@ +import { describe, test, expect } from 'bun:test'; +import { RipgrepBm25Adapter } from './ripgrep-bm25.ts'; +import type { Page, Query } from '../types.ts'; + +function mkPage(slug: string, title: string, compiled_truth: string, timeline = ''): Page { + return { + slug, + type: 'person', + title, + compiled_truth, + timeline, + }; +} + +const CORPUS: Page[] = [ + mkPage('people/alice-chen', 'Alice Chen', + 'Alice Chen is a senior engineer at Stripe. She founded a payments startup in 2022.'), + mkPage('people/bob-kim', 'Bob Kim', + 'Bob Kim is a product manager at Acme Corp. He previously worked at Google.'), + mkPage('people/carol-park', 'Carol Park', + 'Carol Park is a VC partner at Accel. She invests in early-stage fintech companies.'), + mkPage('companies/stripe', 'Stripe', + 'Stripe is a payments infrastructure company founded by the Collison brothers. Alice Chen is a senior engineer on the platform team.'), + mkPage('companies/accel', 'Accel', + 'Accel is a venture capital firm. Carol Park is a partner focused on fintech.'), +]; + +function mkQuery(id: string, text: string, relevant: string[]): Query { + return { + id, + tier: 'easy', + text, + expected_output_type: 'cited-source-pages', + gold: { relevant }, + }; +} + +describe('RipgrepBm25Adapter', () => { + test('init returns opaque state without throwing', async () => { + const adapter = new RipgrepBm25Adapter(); + const state = await adapter.init(CORPUS, { name: 'ripgrep-bm25' }); + expect(state).toBeDefined(); + }); + + test('query for person name ranks their page first', async () => { + const adapter = new RipgrepBm25Adapter(); + const state = await adapter.init(CORPUS, { name: 'ripgrep-bm25' }); + const results = await adapter.query(mkQuery('q1', 'Alice Chen', ['people/alice-chen']), state); + expect(results.length).toBeGreaterThan(0); + expect(results[0].page_id).toBe('people/alice-chen'); + expect(results[0].rank).toBe(1); + }); + + test('query returns ranked list with increasing ranks', async () => { + const adapter = new RipgrepBm25Adapter(); + const state = await adapter.init(CORPUS, { name: 'ripgrep-bm25' }); + const results = await adapter.query(mkQuery('q1', 'payments company', ['companies/stripe']), state); + expect(results.length).toBeGreaterThan(0); + for (let i = 0; i < results.length; i++) { + expect(results[i].rank).toBe(i + 1); + } + }); + + test('scores are monotonically non-increasing by rank', async () => { + const adapter = new RipgrepBm25Adapter(); + const state = await adapter.init(CORPUS, { name: 'ripgrep-bm25' }); + const results = await adapter.query(mkQuery('q1', 'engineer at Stripe', []), state); + for (let i = 1; i < results.length; i++) { + expect(results[i].score).toBeLessThanOrEqual(results[i - 1].score); + } + }); + + test('query with no matching tokens returns empty', async () => { + const adapter = new RipgrepBm25Adapter(); + const state = await adapter.init(CORPUS, { name: 'ripgrep-bm25' }); + const results = await adapter.query(mkQuery('q1', 'xyzznonexistent quatloos', []), state); + expect(results.length).toBe(0); + }); + + test('stopword-only query returns empty', async () => { + const adapter = new RipgrepBm25Adapter(); + const state = await adapter.init(CORPUS, { name: 'ripgrep-bm25' }); + const results = await adapter.query(mkQuery('q1', 'the of and', []), state); + expect(results.length).toBe(0); + }); + + test('tie-break is deterministic by page_id when scores are equal', async () => { + // Two identical pages (except slug) should tie on score; tie-break + // must be stable-deterministic to keep benchmark runs reproducible. + const adapter = new RipgrepBm25Adapter(); + const pages: Page[] = [ + mkPage('people/b-twin', 'Twin Page', 'same content'), + mkPage('people/a-twin', 'Twin Page', 'same content'), + ]; + const state = await adapter.init(pages, { name: 'ripgrep-bm25' }); + const results = await adapter.query(mkQuery('q1', 'same content', []), state); + expect(results.length).toBe(2); + // a-twin should come first by lexicographic tie-break. + expect(results[0].page_id).toBe('people/a-twin'); + expect(results[1].page_id).toBe('people/b-twin'); + }); + + test('BM25 rewards term frequency but not linearly (k1 saturation)', async () => { + const adapter = new RipgrepBm25Adapter(); + const pages: Page[] = [ + // Three mentions of "widget" in body. + mkPage('p/three', 'Threefold', 'widget widget widget plus filler'), + // Ten mentions — should NOT score 10/3x higher (k1 saturation). + mkPage('p/ten', 'Tenfold', + 'widget widget widget widget widget widget widget widget widget widget plus filler'), + ]; + const state = await adapter.init(pages, { name: 'ripgrep-bm25' }); + const results = await adapter.query(mkQuery('q1', 'widget', []), state); + expect(results.length).toBe(2); + // Tenfold should rank higher, but not by a 10/3 ratio. + const tenScore = results.find(r => r.page_id === 'p/ten')!.score; + const threeScore = results.find(r => r.page_id === 'p/three')!.score; + expect(tenScore).toBeGreaterThan(threeScore); + // Saturation check: 10x frequency should not produce 3x the score. + expect(tenScore / threeScore).toBeLessThan(2.0); + }); + + test('doc-length normalization penalizes very long docs', async () => { + // Same number of "widget" mentions, but very different doc lengths — + // the shorter doc should rank higher (widget is a larger fraction of content). + const adapter = new RipgrepBm25Adapter(); + const filler = 'alpha beta gamma delta epsilon zeta eta theta iota kappa ' + .repeat(50); + const pages: Page[] = [ + mkPage('p/short', 'Short', 'widget widget'), + mkPage('p/long', 'Long', `widget widget ${filler}`), + ]; + const state = await adapter.init(pages, { name: 'ripgrep-bm25' }); + const results = await adapter.query(mkQuery('q1', 'widget', []), state); + expect(results[0].page_id).toBe('p/short'); + }); + + test('IDF: rare terms score higher than common terms', async () => { + const adapter = new RipgrepBm25Adapter(); + const pages: Page[] = [ + mkPage('p/a', 'A', 'common rare'), + mkPage('p/b', 'B', 'common filler'), + mkPage('p/c', 'C', 'common filler filler'), + mkPage('p/d', 'D', 'common filler filler filler'), + ]; + const state = await adapter.init(pages, { name: 'ripgrep-bm25' }); + // "rare" appears in 1/4 docs; "common" in 4/4. A match on "rare" + // should rank higher than a match on "common" alone. + const rareResults = await adapter.query(mkQuery('q1', 'rare', []), state); + const commonResults = await adapter.query(mkQuery('q2', 'common', []), state); + // Every doc has "common", but only p/a has "rare". + expect(rareResults[0].page_id).toBe('p/a'); + // The "rare" top score should be higher than the "common" top score + // because rare has higher IDF. + expect(rareResults[0].score).toBeGreaterThan(commonResults[0].score); + }); + + test('slug tokens are indexed (people/alice-chen -> alice matches)', async () => { + // Queries mentioning the slug indirectly (via name) should find the page + // even if the slug itself doesn't appear in content exactly. + const adapter = new RipgrepBm25Adapter(); + const pages: Page[] = [ + mkPage('people/alice-chen', 'Alice', 'See people/alice-chen for bio.'), + ]; + const state = await adapter.init(pages, { name: 'ripgrep-bm25' }); + const results = await adapter.query(mkQuery('q1', 'alice chen', []), state); + expect(results.length).toBe(1); + expect(results[0].page_id).toBe('people/alice-chen'); + }); +}); diff --git a/eval/runner/adapters/ripgrep-bm25.ts b/eval/runner/adapters/ripgrep-bm25.ts new file mode 100644 index 00000000..8dd440e4 --- /dev/null +++ b/eval/runner/adapters/ripgrep-bm25.ts @@ -0,0 +1,196 @@ +/** + * BrainBench EXT-1: Ripgrep + BM25 adapter. + * + * The "honest grep-plus-BM25 baseline" — what any agent could build in an + * afternoon with standard unix tools and a classic IR formula. This is the + * external comparator that turns BrainBench from internal gbrain ablation + * into a real category benchmark. + * + * Design: + * 1. init(): Tokenizes each page's content, builds an inverted index + * + per-doc length table + global term/doc frequencies. + * 2. query(): Tokenizes the query, scores every candidate doc via BM25, + * returns top candidates ranked by score. + * + * No embeddings, no graph, no LLM. Just deterministic token-match ranking. + * This is intentionally what a "what could any agent do with grep + a + * reasonable ranker" baseline looks like. + * + * Reference: Robertson & Zaragoza, "The Probabilistic Relevance Framework: + * BM25 and Beyond" (2009). Standard formula: + * + * BM25(D, Q) = Σ_{q ∈ Q} IDF(q) × (tf(q, D) × (k1 + 1)) / + * (tf(q, D) + k1 × (1 - b + b × |D| / avgdl)) + * + * Defaults: k1 = 1.5, b = 0.75. These are the values Lucene ships. + */ + +import type { Adapter, AdapterConfig, BrainState, Page, Query, RankedDoc } from '../types.ts'; + +// ─── Tokenization ────────────────────────────────────────────────── + +/** + * Tokenize text for BM25: lowercase, split on non-word chars, filter stopwords + * and tokens shorter than 2 characters. Markdown link syntax `[Name](slug)` + * is preserved enough that entity names tokenize into their component words. + * + * NOTE: Slug references (e.g. `people/alice-chen`) get split into + * `people`, `alice`, `chen` tokens — intentional, so that a query for + * "alice chen" matches pages referencing her by slug as well as name. + */ +const STOPWORDS = new Set([ + 'a','an','the','and','or','but','is','are','was','were','be','been','being', + 'have','has','had','do','does','did','will','would','should','could','may', + 'might','can','of','to','in','on','at','for','with','by','from','as','it', + 'its','this','that','these','those','i','you','he','she','we','they', + 'them','us','him','her','his','hers','their','theirs','my','mine','your', + 'yours','our','ours', +]); + +function tokenize(text: string): string[] { + return text + .toLowerCase() + .split(/[^a-z0-9]+/) + .filter(t => t.length >= 2 && !STOPWORDS.has(t)); +} + +// ─── BM25 state ───────────────────────────────────────────────────── + +interface Bm25State { + /** term -> Map. Inverted index. */ + postings: Map>; + /** docId -> total token count (doc length). */ + docLengths: Map; + /** Average doc length across the corpus (BM25 normalization). */ + avgDocLength: number; + /** docId -> original Page (for returning ranked results). */ + docs: Map; + /** Total docs (|corpus|). */ + N: number; + k1: number; + b: number; +} + +function buildState(pages: Page[], k1: number, b: number): Bm25State { + const postings = new Map>(); + const docLengths = new Map(); + const docs = new Map(); + let totalLength = 0; + + for (const p of pages) { + docs.set(p.slug, p); + // Index title + compiled_truth + timeline. Title gets double weight by + // being tokenized twice — cheap boost for slug-match on entity pages. + const content = `${p.title} ${p.title} ${p.compiled_truth} ${p.timeline}`; + const tokens = tokenize(content); + docLengths.set(p.slug, tokens.length); + totalLength += tokens.length; + + const termFreq = new Map(); + for (const t of tokens) termFreq.set(t, (termFreq.get(t) ?? 0) + 1); + for (const [term, tf] of termFreq) { + let docMap = postings.get(term); + if (!docMap) { + docMap = new Map(); + postings.set(term, docMap); + } + docMap.set(p.slug, tf); + } + } + + return { + postings, + docLengths, + avgDocLength: pages.length > 0 ? totalLength / pages.length : 0, + docs, + N: pages.length, + k1, + b, + }; +} + +// ─── Scoring ──────────────────────────────────────────────────────── + +function bm25Score(state: Bm25State, docId: string, queryTokens: string[]): number { + const docLen = state.docLengths.get(docId) ?? 0; + if (docLen === 0) return 0; + const { k1, b, avgDocLength, N } = state; + let score = 0; + for (const qt of queryTokens) { + const posting = state.postings.get(qt); + if (!posting) continue; + const tf = posting.get(docId) ?? 0; + if (tf === 0) continue; + const df = posting.size; + // IDF formula: log((N - df + 0.5) / (df + 0.5) + 1) — Lucene variant, + // always positive. Standard Robertson-Sparck-Jones can go negative + // for very common terms, which misranks; Lucene's +1 smooths this out. + const idf = Math.log((N - df + 0.5) / (df + 0.5) + 1); + const numerator = tf * (k1 + 1); + const denominator = tf + k1 * (1 - b + b * (docLen / avgDocLength)); + score += idf * (numerator / denominator); + } + return score; +} + +// ─── Candidate collection ────────────────────────────────────────── + +/** Union of docs containing ANY query token — inverted index lookup. */ +function candidateDocs(state: Bm25State, queryTokens: string[]): Set { + const candidates = new Set(); + for (const qt of queryTokens) { + const posting = state.postings.get(qt); + if (!posting) continue; + for (const docId of posting.keys()) candidates.add(docId); + } + return candidates; +} + +// ─── Adapter implementation ──────────────────────────────────────── + +interface RipgrepBm25Config extends AdapterConfig { + k1?: number; + b?: number; +} + +export class RipgrepBm25Adapter implements Adapter { + readonly name = 'ripgrep-bm25'; + + async init(rawPages: Page[], config: RipgrepBm25Config): Promise { + const k1 = config.k1 ?? 1.5; + const b = config.b ?? 0.75; + return buildState(rawPages, k1, b); + } + + async query(q: Query, state: BrainState): Promise { + const s = state as Bm25State; + const queryTokens = tokenize(q.text); + if (queryTokens.length === 0) return []; + + const candidates = candidateDocs(s, queryTokens); + const scored: { id: string; score: number }[] = []; + for (const docId of candidates) { + const score = bm25Score(s, docId, queryTokens); + if (score > 0) scored.push({ id: docId, score }); + } + // Descending by score, stable tie-break by docId for determinism. + scored.sort((a, b) => b.score - a.score || a.id.localeCompare(b.id)); + + return scored.map((s, i) => ({ + page_id: s.id, + score: s.score, + rank: i + 1, + })); + } + + async snapshot(_state: BrainState): Promise { + // BM25 state is pure-memory; no snapshot semantics needed for v1.1. + // Future: serialize the inverted index to disk for warm-start reruns. + return ''; + } +} + +/** Convenience factory — construct with default config. */ +export function createRipgrepBm25(): RipgrepBm25Adapter { + return new RipgrepBm25Adapter(); +} diff --git a/eval/runner/adapters/vector-only.test.ts b/eval/runner/adapters/vector-only.test.ts new file mode 100644 index 00000000..3ab7edf2 --- /dev/null +++ b/eval/runner/adapters/vector-only.test.ts @@ -0,0 +1,51 @@ +import { describe, test, expect } from 'bun:test'; +import { _cosine } from './vector-only.ts'; + +// Note: VectorOnlyAdapter.init/query require a live embedding API key. +// Those end-to-end tests live in a smoke-test class and gate on OPENAI_API_KEY. +// Here we unit-test the pure-function pieces. + +describe('vector-only adapter (pure helpers)', () => { + test('cosine of identical vectors = 1.0', () => { + const a = new Float32Array([1, 2, 3, 4]); + const b = new Float32Array([1, 2, 3, 4]); + expect(_cosine(a, b)).toBeCloseTo(1.0, 6); + }); + + test('cosine of opposite vectors = -1.0', () => { + const a = new Float32Array([1, 2, 3]); + const b = new Float32Array([-1, -2, -3]); + expect(_cosine(a, b)).toBeCloseTo(-1.0, 6); + }); + + test('cosine of orthogonal vectors = 0', () => { + const a = new Float32Array([1, 0, 0]); + const b = new Float32Array([0, 1, 0]); + expect(_cosine(a, b)).toBeCloseTo(0.0, 6); + }); + + test('cosine handles zero vector by returning 0', () => { + const a = new Float32Array([0, 0, 0]); + const b = new Float32Array([1, 2, 3]); + expect(_cosine(a, b)).toBe(0); + }); + + test('cosine is scale-invariant', () => { + const a = new Float32Array([1, 2, 3]); + const b = new Float32Array([2, 4, 6]); + // Same direction, different magnitudes; cosine should still be 1. + expect(_cosine(a, b)).toBeCloseTo(1.0, 6); + }); + + test('cosine returns 0 on mismatched-length vectors at the tail', () => { + // Uses min(len) — shorter vector's dimensions are compared, longer's + // extras are implicitly dropped. Produces a sensible number even + // when upstream glue has a dim mismatch bug; helps fail-soft rather + // than crash the benchmark. + const a = new Float32Array([1, 1, 1]); + const b = new Float32Array([1, 1]); + const sim = _cosine(a, b); + expect(sim).toBeGreaterThan(0); + expect(sim).toBeLessThanOrEqual(1); + }); +}); diff --git a/eval/runner/adapters/vector-only.ts b/eval/runner/adapters/vector-only.ts new file mode 100644 index 00000000..2e7dd412 --- /dev/null +++ b/eval/runner/adapters/vector-only.ts @@ -0,0 +1,150 @@ +/** + * BrainBench EXT-2: Vector-only RAG adapter. + * + * Commodity vector RAG: embed every page once, embed the query, rank by + * cosine similarity. No graph, no keyword fallback, no BM25 — the opposite + * end of the baseline spectrum from EXT-1. + * + * Uses the SAME embedding model gbrain uses internally (text-embedding-3-large + * via src/core/embedding.ts). Apples-to-apples on the embedding layer: any + * lead gbrain has over vector-only must come from the graph + hybrid fusion, + * not from a better embedder. This is the honest external comparator. + * + * Cost: ~$0.02 per run on the 240-page corpus (embed 240 pages once, embed + * each query once, ~120K total tokens at $0.13/M). + * + * Design: + * 1. init(): embed each page's title + compiled_truth + timeline as ONE + * vector per page. Store in memory. + * 2. query(): embed query text, compute cosine similarity against every + * page vector, rank descending. + * + * Notes: + * - No chunking. One vector per page. Real vector RAG in production + * chunks long docs; we intentionally don't here so the comparison + * against gbrain's chunked hybrid is fair at the retrieval granularity. + * If a future BrainBench iteration wants to test chunked vector RAG, + * that's a separate adapter (EXT-2b maybe). + * - No keyword fallback. Pure vector similarity. An agent that wanted + * vector+keyword would use EXT-3 hybrid-without-graph. + */ + +import type { Adapter, AdapterConfig, BrainState, Page, Query, RankedDoc } from '../types.ts'; +import { embed, embedBatch } from '../../../src/core/embedding.ts'; + +// ─── Vector math ──────────────────────────────────────────────────── + +/** + * Cosine similarity between two dense vectors. Assumes equal length; + * callers upstream ensure embedder returned consistent-dim vectors. + */ +function cosine(a: Float32Array, b: Float32Array): number { + let dot = 0; + let normA = 0; + let normB = 0; + const n = Math.min(a.length, b.length); + for (let i = 0; i < n; i++) { + dot += a[i] * b[i]; + normA += a[i] * a[i]; + normB += b[i] * b[i]; + } + if (normA === 0 || normB === 0) return 0; + return dot / (Math.sqrt(normA) * Math.sqrt(normB)); +} + +// ─── Adapter state ────────────────────────────────────────────────── + +interface VectorOnlyState { + /** docId -> its embedding vector. */ + vectors: Map; + /** docId -> original Page. */ + docs: Map; + /** Embedding model used (for scorecard reproducibility card). */ + embeddingModel: string; +} + +// ─── Adapter implementation ──────────────────────────────────────── + +interface VectorOnlyConfig extends AdapterConfig { + /** Chunk size in chars for page content sent to embedder. + * Default: unchunked (single vector per page). Capped at 8K chars + * to stay within embedding model input limits. */ + maxChars?: number; + /** Max parallel embedding requests during init (the embedBatch helper + * chunks internally; this throttles if upstream rate-limits). */ + batchSize?: number; +} + +export class VectorOnlyAdapter implements Adapter { + readonly name = 'vector-only'; + + async init(rawPages: Page[], config: VectorOnlyConfig): Promise { + const maxChars = config.maxChars ?? 8000; + const batchSize = config.batchSize ?? 50; + + const docs = new Map(); + const contents: string[] = []; + const slugOrder: string[] = []; + for (const p of rawPages) { + docs.set(p.slug, p); + const combined = `${p.title}\n\n${p.compiled_truth}\n\n${p.timeline}` + .slice(0, maxChars); + contents.push(combined); + slugOrder.push(p.slug); + } + + // Embed in batches to respect rate limits. embedBatch handles the + // OpenAI API call pattern (retry + backoff) per src/core/embedding.ts. + const vectors = new Map(); + for (let i = 0; i < contents.length; i += batchSize) { + const batch = contents.slice(i, i + batchSize); + const slugs = slugOrder.slice(i, i + batchSize); + const embeddings = await embedBatch(batch); + for (let j = 0; j < embeddings.length; j++) { + vectors.set(slugs[j], embeddings[j]); + } + } + + // EMBEDDING_MODEL is a const export; lazy-imported here to avoid circular. + const { EMBEDDING_MODEL } = await import('../../../src/core/embedding.ts'); + return { + vectors, + docs, + embeddingModel: EMBEDDING_MODEL, + } satisfies VectorOnlyState; + } + + async query(q: Query, state: BrainState): Promise { + const s = state as VectorOnlyState; + const queryVec = await embed(q.text); + + const scored: { id: string; score: number }[] = []; + for (const [docId, docVec] of s.vectors) { + const sim = cosine(queryVec, docVec); + if (sim > 0) scored.push({ id: docId, score: sim }); + } + scored.sort((a, b) => b.score - a.score || a.id.localeCompare(b.id)); + + return scored.map((s, i) => ({ + page_id: s.id, + score: s.score, + rank: i + 1, + })); + } + + async snapshot(_state: BrainState): Promise { + // Vector state is in-memory only for v1.1. Persisted vector DBs are + // a separate future comparison (EXT-2b). + return ''; + } +} + +export function createVectorOnly(): VectorOnlyAdapter { + return new VectorOnlyAdapter(); +} + +/** + * Test helper: cosine similarity exposed for unit tests. Not for public API. + * @internal + */ +export { cosine as _cosine }; diff --git a/eval/runner/all.ts b/eval/runner/all.ts index aeb05462..4af3e4c6 100644 --- a/eval/runner/all.ts +++ b/eval/runner/all.ts @@ -24,8 +24,15 @@ interface CategoryRun { // One row per benchmark. Headline (Cat 1+2 combined) is the consolidated // before/after run on the full 240-page rich-prose corpus. Procedural // categories (3, 4, 7, 10, 12) test orthogonal capabilities. +// +// v0.10.5 additions: +// - Cat 2 Type Accuracy (rich prose): per-link-type accuracy measured +// directly on the 240-page corpus. The rich-prose bar for inferLinkType, +// distinct from Cat 1's retrieval metrics. Validates extraction regex +// work (works_at, advises) and exposes per-type confusion. const CATEGORIES = [ { num: 1, name: 'Before/After PR #188 (240-page rich corpus, relational queries)', script: 'eval/runner/before-after.ts' }, + { num: 2, name: 'Type Accuracy (per-link-type on rich prose)', script: 'eval/runner/type-accuracy.ts' }, { num: 3, name: 'Identity Resolution', script: 'eval/runner/identity.ts' }, { num: 4, name: 'Temporal Queries', script: 'eval/runner/temporal.ts' }, { num: 7, name: 'Performance / Latency', script: 'eval/runner/perf.ts' }, diff --git a/eval/runner/multi-adapter.ts b/eval/runner/multi-adapter.ts new file mode 100644 index 00000000..4aa4782c --- /dev/null +++ b/eval/runner/multi-adapter.ts @@ -0,0 +1,477 @@ +/** + * BrainBench multi-adapter runner (Phase 2). + * + * Runs multiple adapter implementations against the same corpus and the + * same relational query set, emitting a side-by-side scorecard. This is + * the neutrality unlock — external baselines scored on the same bar as + * gbrain, so the scorecard answers "how does gbrain compare to what any + * agent could do?" rather than just "what changed between gbrain versions?" + * + * v1.1 Phase 2 adapters (shipping in order): + * - GBRAIN_AFTER (gbrain post-v0.10.3: graph+hybrid) + * - RIPGREP_BM25 (EXT-1: classic IR baseline, this commit) + * - vector-only RAG (EXT-2: future) + * - hybrid-without-graph (EXT-3: future) + * + * Usage: + * bun eval/runner/multi-adapter.ts [--adapter ripgrep-bm25|gbrain-after|all] + * bun eval/runner/multi-adapter.ts --json + */ + +import { readdirSync, readFileSync } from 'fs'; +import { join } from 'path'; +import { PGLiteEngine } from '../../src/core/pglite-engine.ts'; +import { runExtract } from '../../src/commands/extract.ts'; +import { RipgrepBm25Adapter } from './adapters/ripgrep-bm25.ts'; +import { VectorOnlyAdapter } from './adapters/vector-only.ts'; +import { HybridNoGraphAdapter } from './adapters/hybrid-nograph.ts'; +import type { Adapter, Page, Query, RankedDoc } from './types.ts'; +import { precisionAtK, recallAtK } from './types.ts'; + +const TOP_K = 5; + +// ─── Corpus loader ───────────────────────────────────────────────── + +interface RichPage extends Page { + _facts: { + type: string; + role?: string; + primary_affiliation?: string; + secondary_affiliations?: string[]; + founders?: string[]; + employees?: string[]; + investors?: string[]; + advisors?: string[]; + attendees?: string[]; + }; +} + +function loadCorpus(dir: string): RichPage[] { + const files = readdirSync(dir).filter(f => f.endsWith('.json') && !f.startsWith('_')); + const out: RichPage[] = []; + for (const f of files) { + const p = JSON.parse(readFileSync(join(dir, f), 'utf-8')); + if (Array.isArray(p.timeline)) p.timeline = p.timeline.join('\n'); + if (Array.isArray(p.compiled_truth)) p.compiled_truth = p.compiled_truth.join('\n\n'); + p.title = String(p.title ?? ''); + p.compiled_truth = String(p.compiled_truth ?? ''); + p.timeline = String(p.timeline ?? ''); + out.push(p as RichPage); + } + return out; +} + +// ─── Relational query builder (gold from _facts) ───────────────── + +function buildQueries(pages: RichPage[]): Query[] { + const existing = new Set(pages.map(p => p.slug)); + const filter = (slugs: string[]) => slugs.filter(s => existing.has(s)); + const queries: Query[] = []; + let counter = 0; + const nextId = () => `q-${String(++counter).padStart(4, '0')}`; + + // "Who attended X?" (meeting → people). Medium tier. + for (const p of pages) { + if (p._facts.type !== 'meeting') continue; + const expected = filter(p._facts.attendees ?? []); + if (expected.length === 0) continue; + queries.push({ + id: nextId(), + tier: 'medium', + text: `Who attended ${p.title}?`, + expected_output_type: 'cited-source-pages', + gold: { relevant: expected }, + }); + } + + // "Who works at X?" (company → people). Medium. + for (const p of pages) { + if (p._facts.type !== 'company') continue; + const expected = filter([...(p._facts.employees ?? []), ...(p._facts.founders ?? [])]); + if (expected.length === 0) continue; + queries.push({ + id: nextId(), + tier: 'medium', + text: `Who works at ${p.title}?`, + expected_output_type: 'cited-source-pages', + gold: { relevant: [...new Set(expected)] }, + }); + } + + // "Who invested in X?" Medium. + for (const p of pages) { + if (p._facts.type !== 'company') continue; + const expected = filter(p._facts.investors ?? []); + if (expected.length === 0) continue; + queries.push({ + id: nextId(), + tier: 'medium', + text: `Who invested in ${p.title}?`, + expected_output_type: 'cited-source-pages', + gold: { relevant: expected }, + }); + } + + // "Who advises X?" Medium. + for (const p of pages) { + if (p._facts.type !== 'company') continue; + const expected = filter(p._facts.advisors ?? []); + if (expected.length === 0) continue; + queries.push({ + id: nextId(), + tier: 'medium', + text: `Who advises ${p.title}?`, + expected_output_type: 'cited-source-pages', + gold: { relevant: expected }, + }); + } + + return queries; +} + +// ─── gbrain-after adapter (inline, wraps existing engine) ───────── + +/** + * Minimal gbrain adapter for the side-by-side run. Wraps PGLiteEngine + + * extract + the same graph-first-then-grep strategy used in before-after.ts. + * + * When the dedicated GbrainAdapter class ships (separate commit), this + * inline wrapper is the bridge — same semantics, different surface. + */ +class GbrainAfterAdapter implements Adapter { + readonly name = 'gbrain-after'; + + async init(rawPages: Page[]): Promise { + const engine = new PGLiteEngine(); + await engine.connect({}); + await engine.initSchema(); + for (const p of rawPages) { + await engine.putPage(p.slug, { + type: p.type, + title: p.title, + compiled_truth: p.compiled_truth, + timeline: p.timeline, + }); + } + // Silence extract's console.error noise during benchmark runs. + const origErr = console.error; + console.error = () => {}; + try { + await runExtract(engine, ['links', '--source', 'db']); + await runExtract(engine, ['timeline', '--source', 'db']); + } finally { + console.error = origErr; + } + // Build a text map for grep fallback identical to before-after.ts. + const contentBySlug = new Map(); + for (const p of rawPages) { + contentBySlug.set(p.slug, `${p.title}\n${p.compiled_truth}\n${p.timeline}`); + } + return { engine, contentBySlug }; + } + + async query(q: Query, state: unknown): Promise { + const { engine, contentBySlug } = state as { + engine: PGLiteEngine; + contentBySlug: Map; + }; + + // Parse the relational query text to extract seed + direction + linkTypes. + // Format matches what buildQueries() emits; for EXT adapters this parsing + // is skipped and they just do text-match on query.text. + const { seed, direction, linkTypes } = parseRelationalQuery(q, contentBySlug); + + // Graph-first ranking. + const graphHits: string[] = []; + if (seed && linkTypes.length > 0) { + for (const lt of linkTypes) { + const paths = await engine.traversePaths(seed, { + depth: 1, + direction, + linkType: lt, + }); + for (const p of paths) { + const target = direction === 'out' ? p.to_slug : p.from_slug; + if (target !== seed && !graphHits.includes(target)) graphHits.push(target); + } + } + } + // Grep fallback for entities the extractor missed. + const grepHits: string[] = []; + if (seed) { + if (direction === 'out') { + // No explicit grep fallback for outgoing — graph has it. + } else { + for (const [slug, content] of contentBySlug) { + if (slug === seed) continue; + if (graphHits.includes(slug)) continue; + if (content.includes(seed)) grepHits.push(slug); + } + grepHits.sort(); + } + } + const ranked = [...graphHits, ...grepHits]; + return ranked.map((id, i) => ({ + page_id: id, + score: ranked.length - i, // synthetic descending score + rank: i + 1, + })); + } + + async teardown(state: unknown): Promise { + const { engine } = state as { engine: PGLiteEngine }; + await engine.disconnect(); + } +} + +/** + * Parse a relational query template into (seed, direction, linkTypes). + * Matches the templates emitted by buildQueries(). Returns empty linkTypes + * if the query doesn't match a known template (adapter falls back to grep). + */ +function parseRelationalQuery( + q: Query, + contentBySlug: Map, +): { seed: string; direction: 'in' | 'out'; linkTypes: string[] } { + // Title->slug lookup table for resolving the entity named in the query. + const titleToSlug = new Map(); + for (const [slug, content] of contentBySlug) { + const title = content.split('\n')[0] ?? ''; + if (title) titleToSlug.set(title.toLowerCase(), slug); + } + const text = q.text; + + // "Who attended ?" → meeting seed, direction=out, attended + let m = /^Who attended (.+)\?$/.exec(text); + if (m) { + const seed = titleToSlug.get(m[1].toLowerCase()) ?? ''; + return { seed, direction: 'out', linkTypes: ['attended'] }; + } + // "Who works at <title>?" → company seed, in, works_at+founded + m = /^Who works at (.+)\?$/.exec(text); + if (m) { + const seed = titleToSlug.get(m[1].toLowerCase()) ?? ''; + return { seed, direction: 'in', linkTypes: ['works_at', 'founded'] }; + } + // "Who invested in <title>?" → company seed, in, invested_in + m = /^Who invested in (.+)\?$/.exec(text); + if (m) { + const seed = titleToSlug.get(m[1].toLowerCase()) ?? ''; + return { seed, direction: 'in', linkTypes: ['invested_in'] }; + } + // "Who advises <title>?" → company seed, in, advises + m = /^Who advises (.+)\?$/.exec(text); + if (m) { + const seed = titleToSlug.get(m[1].toLowerCase()) ?? ''; + return { seed, direction: 'in', linkTypes: ['advises'] }; + } + return { seed: '', direction: 'in', linkTypes: [] }; +} + +// ─── Tolerance bands (N-run variance measurement) ────────────────── + +/** + * N=5 per eng pass 3 decision. For current adapters (all deterministic + * over sorted page input) bands will be ~0. Per-run variance surfaces + * when any of these enter the benchmark: + * - LLM-judge scoring (future) + * - Non-deterministic embedding providers + * - Page-ordering-dependent dedup tie-breaks (induced here by shuffle) + * + * Shuffling ingestion order per run reveals order-sensitive bugs. An + * adapter with hidden order-dependence (e.g. a tie-break that favors + * first-seen slug) shows up as non-zero stddev. + */ +const RUNS_PER_ADAPTER = Number(process.env.BRAINBENCH_N ?? '5'); + +interface RunResult { + mean_precision_at_k: number; + mean_recall_at_k: number; + correct_in_top_k: number; + total_expected: number; +} + +interface AdapterScorecard { + adapter: string; + queries: number; + runs: number; + /** Mean across N runs. */ + mean_precision_at_k: number; + mean_recall_at_k: number; + /** Sample stddev across N runs (n-1 denominator). Zero means deterministic. */ + stddev_precision_at_k: number; + stddev_recall_at_k: number; + /** From the first run (for the headline "correct/gold" column). */ + correct_in_top_k: number; + total_expected: number; +} + +/** + * Seeded Fisher-Yates shuffle. Deterministic given the same seed so + * N-run results are reproducible by anyone re-running with the same seed. + * Uses a linear congruential generator (LCG) — good enough for benchmark + * permutations, not cryptographic. + */ +function shuffleSeeded<T>(arr: T[], seed: number): T[] { + const out = [...arr]; + let s = seed >>> 0; + const next = () => { + s = (s * 1664525 + 1013904223) >>> 0; + return s / 0x100000000; + }; + for (let i = out.length - 1; i > 0; i--) { + const j = Math.floor(next() * (i + 1)); + [out[i], out[j]] = [out[j], out[i]]; + } + return out; +} + +async function scoreOneRun( + adapter: Adapter, + pages: Page[], + queries: Query[], +): Promise<RunResult> { + const state = await adapter.init(pages, { name: adapter.name }); + let totalP = 0; + let totalR = 0; + let totalCorrect = 0; + let totalExpected = 0; + for (const q of queries) { + const results = await adapter.query(q, state); + const relevant = new Set(q.gold.relevant ?? []); + totalP += precisionAtK(results, relevant, TOP_K); + totalR += recallAtK(results, relevant, TOP_K); + const topK = results.slice(0, TOP_K); + for (const r of topK) if (relevant.has(r.page_id)) totalCorrect++; + totalExpected += relevant.size; + } + if (adapter.teardown) await adapter.teardown(state); + return { + mean_precision_at_k: queries.length > 0 ? totalP / queries.length : 0, + mean_recall_at_k: queries.length > 0 ? totalR / queries.length : 0, + correct_in_top_k: totalCorrect, + total_expected: totalExpected, + }; +} + +function stddev(values: number[]): number { + const n = values.length; + if (n < 2) return 0; + const mean = values.reduce((a, b) => a + b, 0) / n; + const variance = values.reduce((a, b) => a + (b - mean) ** 2, 0) / (n - 1); + return Math.sqrt(variance); +} + +async function scoreAdapter( + adapter: Adapter, + pages: Page[], + queries: Query[], +): Promise<AdapterScorecard> { + const runResults: RunResult[] = []; + for (let i = 0; i < RUNS_PER_ADAPTER; i++) { + // Shuffle pages per run with a per-run seed. Seed = i + 1 (not 0, + // since LCG iterates once at start of next()). Run 0 uses the seed + // that produces a minimally-scrambled permutation; doesn't matter + // for correctness since we aggregate across runs. + const shuffled = shuffleSeeded(pages, i + 1); + const r = await scoreOneRun(adapter, shuffled, queries); + runResults.push(r); + } + const pVals = runResults.map(r => r.mean_precision_at_k); + const rVals = runResults.map(r => r.mean_recall_at_k); + return { + adapter: adapter.name, + queries: queries.length, + runs: RUNS_PER_ADAPTER, + mean_precision_at_k: pVals.reduce((a, b) => a + b, 0) / pVals.length, + mean_recall_at_k: rVals.reduce((a, b) => a + b, 0) / rVals.length, + stddev_precision_at_k: stddev(pVals), + stddev_recall_at_k: stddev(rVals), + correct_in_top_k: runResults[0].correct_in_top_k, + total_expected: runResults[0].total_expected, + }; +} + +function pct(n: number, digits = 1): string { + return `${(n * 100).toFixed(digits)}%`; +} + +function pctBand(mean: number, sd: number, digits = 1): string { + if (sd === 0) return pct(mean, digits); + return `${pct(mean, digits)} \u00b1${(sd * 100).toFixed(digits)}`; +} + +// ─── Main ────────────────────────────────────────────────────────── + +async function main() { + const json = process.argv.includes('--json'); + const only = process.argv.find(a => a.startsWith('--adapter='))?.slice('--adapter='.length); + const log = json ? () => {} : console.log; + + log('# BrainBench — multi-adapter side-by-side\n'); + log(`Generated: ${new Date().toISOString().slice(0, 19)}`); + + const pages = loadCorpus('eval/data/world-v1') as Page[]; + log(`Corpus: ${pages.length} rich-prose pages from eval/data/world-v1/`); + + const queries = buildQueries(pages as RichPage[]); + log(`Relational queries: ${queries.length}\n`); + + const allAdapters: Adapter[] = [ + new GbrainAfterAdapter(), + new HybridNoGraphAdapter(), + new RipgrepBm25Adapter(), + new VectorOnlyAdapter(), + ]; + const adapters = only ? allAdapters.filter(a => a.name === only) : allAdapters; + if (adapters.length === 0) { + console.error(`No adapter matches --adapter=${only}. Available: ${allAdapters.map(a => a.name).join(', ')}`); + process.exit(1); + } + + log(`## Running adapters (N=${RUNS_PER_ADAPTER} runs per adapter, page-order shuffled per run)\n`); + const scorecards: AdapterScorecard[] = []; + for (const a of adapters) { + log(`- ${a.name} ...`); + const t0 = Date.now(); + const sc = await scoreAdapter(a, pages, queries); + const elapsed = ((Date.now() - t0) / 1000).toFixed(1); + log(` done (${elapsed}s). P@${TOP_K} ${pctBand(sc.mean_precision_at_k, sc.stddev_precision_at_k)}, R@${TOP_K} ${pctBand(sc.mean_recall_at_k, sc.stddev_recall_at_k)}, ${sc.correct_in_top_k}/${sc.total_expected} correct (run 1)`); + scorecards.push(sc); + } + + log('\n## Side-by-side scorecard (mean \u00b1 stddev across N runs)\n'); + log(`| Adapter | Runs | Queries | P@${TOP_K} (mean \u00b1 sd) | R@${TOP_K} (mean \u00b1 sd) |`); + log('|---------------------|------|---------|---------------------|---------------------|'); + for (const sc of scorecards) { + log(`| ${sc.adapter.padEnd(19)} | ${String(sc.runs).padStart(4)} | ${String(sc.queries).padStart(7)} | ${pctBand(sc.mean_precision_at_k, sc.stddev_precision_at_k).padStart(19)} | ${pctBand(sc.mean_recall_at_k, sc.stddev_recall_at_k).padStart(19)} |`); + } + log(''); + log('*Stddev = 0 means the adapter is deterministic over page ordering. Non-zero stddev surfaces order-dependent bugs (e.g. tie-break that favors first-seen slug). LLM-judge-based metrics will produce non-zero stddev once added.*\n'); + + if (scorecards.length >= 2) { + const [first, ...rest] = scorecards; + log('## Deltas vs ' + first.adapter + '\n'); + for (const other of rest) { + const dP = (other.mean_precision_at_k - first.mean_precision_at_k) * 100; + const dR = (other.mean_recall_at_k - first.mean_recall_at_k) * 100; + const dC = other.correct_in_top_k - first.correct_in_top_k; + log(`- ${other.adapter}: P@${TOP_K} ${dP >= 0 ? '+' : ''}${dP.toFixed(1)}pts, R@${TOP_K} ${dR >= 0 ? '+' : ''}${dR.toFixed(1)}pts, correct-in-top-${TOP_K} ${dC >= 0 ? '+' : ''}${dC}`); + } + log(''); + } + + log('## Methodology\n'); + log(`- Corpus: 240 rich-prose fictional pages (eval/data/world-v1/).`); + log(`- Gold: ${queries.length} relational queries derived from _facts metadata.`); + log(`- Metrics: mean P@${TOP_K} and R@${TOP_K} across all queries.`); + log(`- Top-K: ${TOP_K} (what agents actually read in ranked results).`); + log(`- Each adapter reingests raw pages. No gold data visible to adapters.`); + + if (json) console.log(JSON.stringify({ scorecards, queries: queries.length, corpus: pages.length }, null, 2)); +} + +main().catch(e => { + console.error(e); + process.exit(1); +}); diff --git a/eval/runner/queries/index.ts b/eval/runner/queries/index.ts new file mode 100644 index 00000000..af558c21 --- /dev/null +++ b/eval/runner/queries/index.ts @@ -0,0 +1,38 @@ +/** + * Aggregates all tier-5/5.5 query sets and exposes validator helpers. + * + * Usage: + * import { getAllTierQueries, validateAll } from './queries'; + * const queries = getAllTierQueries(); + * const result = validateAll(queries); + */ + +import type { Query } from '../types.ts'; +import { getTier5FuzzyQueries } from './tier5-fuzzy.ts'; +import { getTier5_5SyntheticQueries } from './tier5_5-synthetic.ts'; +import { validateQuerySet, formatIssues } from './validator.ts'; + +/** Tier 5 Fuzzy/Vibe (hand-authored by gstack maintainers). */ +export { getTier5FuzzyQueries } from './tier5-fuzzy.ts'; + +/** Tier 5.5 externally-authored (SYNTHETIC placeholder; see CONTRIBUTING.md). */ +export { getTier5_5SyntheticQueries } from './tier5_5-synthetic.ts'; + +export { validateQuery, validateQuerySet, formatIssues, TEMPORAL_VERBS } from './validator.ts'; +export type { ValidationIssue, ValidationResult } from './validator.ts'; + +/** All Tier 5 + 5.5 queries concatenated. */ +export function getAllTierQueries(): Query[] { + return [...getTier5FuzzyQueries(), ...getTier5_5SyntheticQueries()]; +} + +/** Validate the complete Tier-5 + 5.5 set (used by CI + eval:query:validate). */ +export function validateAll(): { ok: boolean; count: number; report: string } { + const queries = getAllTierQueries(); + const result = validateQuerySet(queries); + return { + ok: result.ok, + count: queries.length, + report: formatIssues(result), + }; +} diff --git a/eval/runner/queries/tier5-fuzzy.ts b/eval/runner/queries/tier5-fuzzy.ts new file mode 100644 index 00000000..ebe3dc9f --- /dev/null +++ b/eval/runner/queries/tier5-fuzzy.ts @@ -0,0 +1,308 @@ +/** + * Tier 5: Fuzzy / Vibe queries. + * + * Vague recall and "I know I mentioned this somewhere" — the kind of query + * real people ask their brain when they can't quite remember the exact + * entity name. Graph-heavy systems shouldn't have an inherent edge here + * because the query doesn't mention the target entity precisely. + * + * Per the 4-review arc: these address Codex's circularity critique + * ("gbrain's adversarial list is its product roadmap"). If gbrain loses + * ground on vague queries while winning on relational ones, that's an + * honest tradeoff story. If gbrain wins on BOTH, that's a stronger + * benchmark claim. + * + * Target: ~30 queries (statistical floor per eng pass 3). + * + * Gold derivation: each query specifies an expected answer slug set based + * on the canonical world (eval/data/world-v1/_ledger.json). We fix a small + * set of landmarks here rather than deriving from _facts — fuzzy queries + * don't map 1:1 to _facts fields. + */ + +import type { Query } from '../types.ts'; + +/** + * Hand-authored Tier 5 vibe queries. Each targets real entities from + * eval/data/world-v1/ that actually exist in the corpus. Slugs verified + * against world-v1 shards at authoring time. + */ +export const TIER5_FUZZY_QUERIES: Query[] = [ + // ── "I know I mentioned this somewhere" style ───────────────────── + { + id: 'q5-0001', + tier: 'fuzzy', + text: 'Someone I know was a senior engineer at a biotech company doing drug discovery — who?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + as_of_date: 'per-source', + tags: ['vague-recall', 'role-based'], + known_failure_modes: ['might return all biotech company pages; we want the person'], + }, + { + id: 'q5-0002', + tier: 'fuzzy', + text: 'The crypto-infra founder who did a stint at Goldman before building his own thing', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['vague-recall', 'biographical'], + }, + { + id: 'q5-0003', + tier: 'fuzzy', + text: 'The security advisor woman based in Boston, multi-year engagement with a cybersecurity startup', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['vague-recall', 'role-based', 'location'], + }, + + // ── Summarization-over-messy-notes style ────────────────────────── + { + id: 'q5-0004', + tier: 'fuzzy', + text: 'Summarize what we know about founders who raised Series A in 2024.', + expected_output_type: 'answer-string', + gold: { relevant: [] }, + acceptable_variants: ['Series A 2024 founders summary'], + tags: ['summarization', 'multi-entity'], + known_failure_modes: ['accept any top-K that includes actual Series-A-2024 founders'], + }, + { + id: 'q5-0005', + tier: 'fuzzy', + text: 'Who are the fintech advisors in our network?', + expected_output_type: 'abstention', + gold: { expected_abstention: true }, + tags: ['summarization', 'role-intersection', 'partial-in-corpus'], + known_failure_modes: ['"fintech" advisors are scarce in twin-amara corpus; a good system abstains or flags partial match'], + }, + { + id: 'q5-0006', + tier: 'fuzzy', + text: 'Tell me about the people who push back hard on microservices', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['trait-based', 'opinion'], + known_failure_modes: ['requires catching "controversial internal memo on microservices" in prose'], + }, + + // ── Partial-information recall ──────────────────────────────────── + { + id: 'q5-0007', + tier: 'fuzzy', + text: 'Who has a "40 under 40" mention?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['biographical-fragment'], + }, + { + id: 'q5-0008', + tier: 'fuzzy', + text: 'The company whose CEO insists on founder-friendly terms and minimal board seats', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/forge-19'] }, + tags: ['trait-based', 'culture'], + }, + { + id: 'q5-0009', + tier: 'fuzzy', + text: 'Someone we know who cut sequencing pipeline runtime by about 40%', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['achievement-based'], + }, + { + id: 'q5-0010', + tier: 'fuzzy', + text: 'The person who wrote an internal memo about deleting half the microservices', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['behavioral-recall'], + }, + + // ── "what was that thing about..." style ────────────────────────── + { + id: 'q5-0011', + tier: 'fuzzy', + text: 'What was the thing about MEV-resistant transaction ordering?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + as_of_date: 'per-source', + tags: ['topic-recall'], + }, + { + id: 'q5-0012', + tier: 'fuzzy', + text: 'The partner who focuses on early-stage fintech', + expected_output_type: 'abstention', + gold: { expected_abstention: true }, + tags: ['role-based', 'domain', 'partial-in-corpus'], + known_failure_modes: ['multiple partial matches; good systems either abstain or flag "multiple candidates"'], + }, + { + id: 'q5-0013', + tier: 'fuzzy', + text: 'Which Layer 1 project did that crypto guy leave over tokenomics disagreements?', + expected_output_type: 'abstention', + gold: { expected_abstention: true }, + known_failure_modes: ['prose says a Layer 1 project but never names it; good systems abstain'], + tags: ['abstention', 'under-specified'], + }, + { + id: 'q5-0014', + tier: 'fuzzy', + text: 'Who built that cross-chain messaging protocol?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/forge-19'] }, + tags: ['product-feature-recall'], + }, + { + id: 'q5-0015', + tier: 'fuzzy', + text: 'The engineer who is notoriously demanding on code review', + expected_output_type: 'abstention', + gold: { expected_abstention: true }, + as_of_date: 'corpus-end', + known_failure_modes: ['no single canonical answer in corpus; good systems flag ambiguity'], + tags: ['trait-based', 'partial-in-corpus'], + }, + + // ── Emotional / cultural recall ────────────────────────────────── + { + id: 'q5-0016', + tier: 'fuzzy', + text: 'The company whose culture is described as "either loved or hated"', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/forge-19'] }, + as_of_date: 'per-source', + tags: ['culture-recall'], + }, + { + id: 'q5-0017', + tier: 'fuzzy', + text: 'The advisor who pushed hard for zero-trust architecture overhaul', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['behavior-recall', 'advocacy'], + }, + { + id: 'q5-0018', + tier: 'fuzzy', + text: 'Someone who speaks selectively at conferences and prefers small venues', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['preference-recall'], + }, + { + id: 'q5-0019', + tier: 'fuzzy', + text: 'Who is rumored to be writing a book on security culture?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + as_of_date: 'corpus-end', + tags: ['gossip', 'side-project'], + }, + + // ── "something about X" generic-topic style ────────────────────── + { + id: 'q5-0020', + tier: 'fuzzy', + text: 'Any portfolio companies focused on drug discovery?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/delta-3'] }, + tags: ['topical', 'industry'], + }, + { + id: 'q5-0021', + tier: 'fuzzy', + text: 'Who among our founders worked at Goldman Sachs?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['background-intersection'], + }, + { + id: 'q5-0022', + tier: 'fuzzy', + text: 'The person with an MIT CS background who dropped out of a PhD', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['biography-fragment'], + }, + { + id: 'q5-0023', + tier: 'fuzzy', + text: 'Who among our people is a long-distance runner?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + as_of_date: 'corpus-end', + tags: ['personal-trait'], + }, + + // ── Negative / abstention fuzzy (known failure bait) ───────────── + { + id: 'q5-0024', + tier: 'fuzzy', + text: 'Which YC W18 founder built an analytics dashboard?', + expected_output_type: 'abstention', + gold: { expected_abstention: true }, + tags: ['abstention', 'not-in-corpus'], + known_failure_modes: ['W18 batch doesn\'t exist in this corpus; good systems abstain'], + }, + { + id: 'q5-0025', + tier: 'fuzzy', + text: 'Who founded the developer-tools company that got acquihired by Roche?', + expected_output_type: 'abstention', + gold: { expected_abstention: true }, + tags: ['abstention', 'mentioned-but-not-named'], + known_failure_modes: ['prose mentions a bioinformatics startup acquired by Roche but never names it; good systems abstain'], + }, + + // ── Cross-referencing without exact entity names ───────────────── + { + id: 'q5-0026', + tier: 'fuzzy', + text: 'What companies have enterprise security architecture help from an outside advisor?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/prism-43'] }, + tags: ['relational-fuzzy'], + }, + { + id: 'q5-0027', + tier: 'fuzzy', + text: 'Who is known as a "systems builder" in the security space?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + as_of_date: 'corpus-end', + tags: ['epithet-recall'], + }, + { + id: 'q5-0028', + tier: 'fuzzy', + text: 'The Boston-based person who completed a SOC 2 audit for someone', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['achievement-plus-location'], + }, + { + id: 'q5-0029', + tier: 'fuzzy', + text: 'Which advisor has SecureCon Northeast speaking experience?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['conference-history'], + }, + { + id: 'q5-0030', + tier: 'fuzzy', + text: 'Someone who published multiple technical papers on cryptographic primitives', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['academic-output'], + }, +]; + +export function getTier5FuzzyQueries(): Query[] { + // Defensive copy so callers can't mutate the canonical set. + return TIER5_FUZZY_QUERIES.map(q => ({ ...q, gold: { ...q.gold }, tags: q.tags ? [...q.tags] : undefined })); +} diff --git a/eval/runner/queries/tier5_5-synthetic.ts b/eval/runner/queries/tier5_5-synthetic.ts new file mode 100644 index 00000000..b2aeff40 --- /dev/null +++ b/eval/runner/queries/tier5_5-synthetic.ts @@ -0,0 +1,346 @@ +/** + * Tier 5.5: Externally-Authored queries (SYNTHETIC placeholder). + * + * Per plan file: real Tier 5.5 requires 2-3 outside researchers writing + * ~50 queries each against the committed corpus. That's a human-in-the-loop + * deliverable (see eval/CONTRIBUTING.md for how to submit). + * + * This file ships a SYNTHETIC placeholder set: AI-authored queries that + * deliberately vary phrasing patterns to simulate what an outsider would + * write. They are CLEARLY labeled via `author: "synthetic-outsider-v1"` + * so real researcher submissions can supersede them without ambiguity. + * + * Why ship synthetic ones at all: + * - Tier 5.5 has to exist in the scorecard for the multi-axis report + * to have a full column. A missing tier reads as "not measured." + * - The synthetic set establishes phrasing variety (full sentences, + * short fragments, follow-up style, comparison style, "what's the + * difference between X and Y" style) that the 4 template families + * in the medium tier don't cover. + * - When real researchers submit, scorecards can compare "AI-authored" + * vs "human-authored" columns to flag where LLM judgment differs + * from human judgment on the same corpus. + * + * Author field: "synthetic-outsider-v1" for every query here. + * + * Gold verification: slugs referenced below exist in eval/data/world-v1/. + * Small slug set for authenticity; entity pages cited: adam-lee-19, + * adam-lopez-113, alice-davis-172, forge-19, delta-3, prism-43, + * orbit-labs-92 (+ others from the corpus). + */ + +import type { Query } from '../types.ts'; + +const AUTHOR = 'synthetic-outsider-v1'; + +export const TIER5_5_SYNTHETIC_QUERIES: Query[] = [ + // ─── Short-fragment style (how real researchers write notes) ───── + { id: 'q55-0001', tier: 'externally-authored', author: AUTHOR, + text: 'crypto founder Goldman Sachs background', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['fragment-style'] }, + { id: 'q55-0002', tier: 'externally-authored', author: AUTHOR, + text: 'Prism cybersecurity advisor engagement history', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172', 'companies/prism-43'] }, + tags: ['fragment-style', 'relational'] }, + { id: 'q55-0003', tier: 'externally-authored', author: AUTHOR, + text: 'Delta biotech engineers', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['fragment-style'] }, + { id: 'q55-0004', tier: 'externally-authored', author: AUTHOR, + text: 'Forge crypto infrastructure founder details', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19', 'companies/forge-19'] }, + tags: ['fragment-style', 'founder-company'] }, + + // ─── Full-sentence style with natural hedging ───────────────────── + { id: 'q55-0005', tier: 'externally-authored', author: AUTHOR, + text: 'Can you pull up what we have on the founder who left a Layer 1 project over tokenomics?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['polite-natural'] }, + { id: 'q55-0006', tier: 'externally-authored', author: AUTHOR, + text: 'I need the background on the Delta senior engineer who cut pipeline runtime', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['polite-natural', 'specific-achievement'] }, + { id: 'q55-0007', tier: 'externally-authored', author: AUTHOR, + text: 'Please find the advisor with SOC 2 audit prep experience at Orbit Labs', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['polite-natural', 'relational'] }, + + // ─── Comparison / differentiation style ─────────────────────────── + { id: 'q55-0008', tier: 'externally-authored', author: AUTHOR, + text: 'What is the difference between Adam Lee and Adam Lopez in our network?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19', 'people/adam-lopez-113'] }, + as_of_date: 'corpus-end', + tags: ['disambiguation', 'comparison'] }, + { id: 'q55-0009', tier: 'externally-authored', author: AUTHOR, + text: 'Compare Forge and Delta as companies in our portfolio', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/forge-19', 'companies/delta-3'] }, + tags: ['comparison'] }, + { id: 'q55-0010', tier: 'externally-authored', author: AUTHOR, + text: 'Which of our advisors is based on the East Coast?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + as_of_date: 'corpus-end', + tags: ['location-filter'] }, + + // ─── Who-does-what style (role-first) ────────────────────────────── + { id: 'q55-0011', tier: 'externally-authored', author: AUTHOR, + text: 'Who focuses on synthetic biology at Delta?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['role-first'] }, + { id: 'q55-0012', tier: 'externally-authored', author: AUTHOR, + text: 'Who wrote the whitepaper on MEV-resistant transaction ordering?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['achievement-first'] }, + { id: 'q55-0013', tier: 'externally-authored', author: AUTHOR, + text: 'Who would you ask about enterprise security architecture?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['skill-lookup'] }, + { id: 'q55-0014', tier: 'externally-authored', author: AUTHOR, + text: 'Who is the expert on bioinformatics pipelines in our network?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + as_of_date: 'corpus-end', + tags: ['expertise-lookup'] }, + + // ─── Follow-up style (assumes prior context) ────────────────────── + { id: 'q55-0015', tier: 'externally-authored', author: AUTHOR, + text: 'And who else advises Orbit Labs?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['follow-up', 'assumed-context'] }, + { id: 'q55-0016', tier: 'externally-authored', author: AUTHOR, + text: 'Also at Delta?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['follow-up', 'minimal'] }, + + // ─── Characteristic / trait recall ───────────────────────────────── + { id: 'q55-0017', tier: 'externally-authored', author: AUTHOR, + text: 'Our demanding engineering leader with the long-term vision approach', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['trait-description'] }, + { id: 'q55-0018', tier: 'externally-authored', author: AUTHOR, + text: 'The fast-shipping opinionated engineer (likes Postgres, hates meetings)', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['multi-trait'] }, + { id: 'q55-0019', tier: 'externally-authored', author: AUTHOR, + text: 'A systems-builder style advisor who scales security architecture', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['epithet-plus-role'] }, + + // ─── Misspellings / typos (real researchers make these) ─────────── + { id: 'q55-0020', tier: 'externally-authored', author: AUTHOR, + text: 'adam lee the crypto guy', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['lowercase', 'minimal-name'] }, + { id: 'q55-0021', tier: 'externally-authored', author: AUTHOR, + text: 'alice davis cybersecuirty advisor', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['typo', 'lowercase'] }, + { id: 'q55-0022', tier: 'externally-authored', author: AUTHOR, + text: 'adam lopez bioinformatcis', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['typo'] }, + + // ─── "Find me someone like..." semantic-similarity style ───────── + { id: 'q55-0023', tier: 'externally-authored', author: AUTHOR, + text: 'Find me someone like Alice Davis for our new security engagement', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['recommendation', 'similarity'] }, + { id: 'q55-0024', tier: 'externally-authored', author: AUTHOR, + text: 'Who has a profile similar to Adam Lopez — fast shipper, infrastructure background?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['similarity', 'profile-match'] }, + + // ─── Negative / "is there anyone..." phrasing ──────────────────── + { id: 'q55-0025', tier: 'externally-authored', author: AUTHOR, + text: 'Is there anyone in our network who has given a Mainnet conference keynote?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + as_of_date: 'corpus-end', + tags: ['existence-check'] }, + { id: 'q55-0026', tier: 'externally-authored', author: AUTHOR, + text: 'Does anyone know someone who has published on cryptographic primitives?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['existence-check', 'topic-lookup'] }, + + // ─── "What does X do?" direct-entity-name style ────────────────── + { id: 'q55-0027', tier: 'externally-authored', author: AUTHOR, + text: 'What does Forge do?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/forge-19'] }, + tags: ['direct-lookup'] }, + { id: 'q55-0028', tier: 'externally-authored', author: AUTHOR, + text: 'What is Prism working on these days?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/prism-43'] }, + as_of_date: 'corpus-end', + tags: ['direct-lookup', 'temporal-latest'] }, + { id: 'q55-0029', tier: 'externally-authored', author: AUTHOR, + text: "Tell me about Delta's drug discovery platform", + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/delta-3'] }, + tags: ['direct-lookup', 'domain-specific'] }, + + // ─── Cross-cutting relationship queries ───────────────────────── + { id: 'q55-0030', tier: 'externally-authored', author: AUTHOR, + text: 'People we know who are associated with both biotech and software infrastructure', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['cross-domain'] }, + { id: 'q55-0031', tier: 'externally-authored', author: AUTHOR, + text: 'Companies where our advisors have multi-year ongoing relationships', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/prism-43'] }, + tags: ['relationship-depth'] }, + + // ─── Role-plus-industry intersection ────────────────────────────── + { id: 'q55-0032', tier: 'externally-authored', author: AUTHOR, + text: 'Any security-focused advisors for enterprise clients?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['role-industry-intersect'] }, + { id: 'q55-0033', tier: 'externally-authored', author: AUTHOR, + text: 'Senior infrastructure engineers in synthetic biology', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['role-industry-intersect'] }, + + // ─── "Pull up..." imperative style ─────────────────────────────── + { id: 'q55-0034', tier: 'externally-authored', author: AUTHOR, + text: 'Pull up Alice Davis', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['imperative', 'direct-name'] }, + { id: 'q55-0035', tier: 'externally-authored', author: AUTHOR, + text: 'Show me the Forge page', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/forge-19'] }, + tags: ['imperative', 'direct-page'] }, + + // ─── Background-check style ────────────────────────────────────── + { id: 'q55-0036', tier: 'externally-authored', author: AUTHOR, + text: 'Prior experience of Delta senior engineers before joining', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['background-check'] }, + { id: 'q55-0037', tier: 'externally-authored', author: AUTHOR, + text: 'Educational background of Forge founder', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['biographical'] }, + + // ─── "When did X..." abstention-adjacent ───────────────────────── + { id: 'q55-0038', tier: 'externally-authored', author: AUTHOR, + text: 'When did Adam Lopez earn his masters degree?', + expected_output_type: 'abstention', + gold: { expected_abstention: true }, + as_of_date: 'corpus-end', + tags: ['abstention', 'not-in-corpus'], + known_failure_modes: ['no mention of Lopez getting a masters; abstain'] }, + { id: 'q55-0039', tier: 'externally-authored', author: AUTHOR, + text: 'Was Alice Davis ever at Palo Alto Networks?', + expected_output_type: 'abstention', + gold: { expected_abstention: true }, + as_of_date: 'corpus-end', + tags: ['abstention', 'speculation-bait'] }, + + // ─── Aggregation queries ───────────────────────────────────────── + { id: 'q55-0040', tier: 'externally-authored', author: AUTHOR, + text: 'List all advisors in our corpus', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + tags: ['aggregation', 'partial-gold'], + known_failure_modes: ['partial gold; accept any adviser-role pages in top-K'] }, + { id: 'q55-0041', tier: 'externally-authored', author: AUTHOR, + text: 'All senior engineers', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + tags: ['aggregation', 'role-filter'] }, + + // ─── Topic-with-no-entity-name (true semantic) ─────────────────── + { id: 'q55-0042', tier: 'externally-authored', author: AUTHOR, + text: 'Zero-trust architecture work', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172', 'companies/prism-43'] }, + tags: ['topic-only'] }, + { id: 'q55-0043', tier: 'externally-authored', author: AUTHOR, + text: 'Mainnet cross-chain messaging', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/forge-19'] }, + tags: ['topic-only'] }, + { id: 'q55-0044', tier: 'externally-authored', author: AUTHOR, + text: 'Protein modeling integration work', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113', 'companies/delta-3'] }, + tags: ['topic-only'] }, + + // ─── Natural-language "tell me about..." long form ─────────────── + { id: 'q55-0045', tier: 'externally-authored', author: AUTHOR, + text: 'Can you tell me about the Forge team structure and how they think about hiring?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['companies/forge-19', 'people/adam-lee-19'] }, + tags: ['long-form'] }, + { id: 'q55-0046', tier: 'externally-authored', author: AUTHOR, + text: "I want to understand Alice Davis's advisory approach and her involvement at Prism", + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172', 'companies/prism-43'] }, + tags: ['long-form', 'multi-entity'] }, + + // ─── Temporal / as-of queries (forces validator to require as_of_date) ─ + { id: 'q55-0047', tier: 'externally-authored', author: AUTHOR, + text: 'Was Alice Davis renewing her advisory contract with Prism recently?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-davis-172'] }, + as_of_date: 'corpus-end', + tags: ['temporal'] }, + { id: 'q55-0048', tier: 'externally-authored', author: AUTHOR, + text: 'When did Forge close their Series A?', + expected_output_type: 'time-qualified-answer', + gold: { expected_answer: '2024-06-19' }, + as_of_date: 'corpus-end', + tags: ['temporal', 'exact-date'] }, + { id: 'q55-0049', tier: 'externally-authored', author: AUTHOR, + text: 'What was Adam Lopez doing before he joined Delta?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lopez-113'] }, + as_of_date: 'per-source', + tags: ['temporal', 'biographical'] }, + + // ─── "Give me a short summary" minimal output ──────────────────── + { id: 'q55-0050', tier: 'externally-authored', author: AUTHOR, + text: 'Short summary of Adam Lee', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/adam-lee-19'] }, + tags: ['summary-request'] }, +]; + +export function getTier5_5SyntheticQueries(): Query[] { + return TIER5_5_SYNTHETIC_QUERIES.map(q => ({ + ...q, + gold: { ...q.gold }, + tags: q.tags ? [...q.tags] : undefined, + })); +} diff --git a/eval/runner/queries/validator.test.ts b/eval/runner/queries/validator.test.ts new file mode 100644 index 00000000..25bbeb4a --- /dev/null +++ b/eval/runner/queries/validator.test.ts @@ -0,0 +1,209 @@ +import { describe, test, expect } from 'bun:test'; +import { validateQuery, validateQuerySet, TEMPORAL_VERBS } from './validator.ts'; +import type { Query } from '../types.ts'; + +function mkValidQuery(overrides: Partial<Query> = {}): Query { + return { + id: 'q-0001', + tier: 'easy', + text: 'Who founded Acme?', + expected_output_type: 'cited-source-pages', + gold: { relevant: ['people/alice-chen'] }, + ...overrides, + }; +} + +describe('validateQuery — required fields', () => { + test('valid query passes', () => { + const r = validateQuery(mkValidQuery()); + expect(r.ok).toBe(true); + expect(r.issues.length).toBe(0); + }); + + test('missing id fails', () => { + const r = validateQuery(mkValidQuery({ id: '' })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'id')).toBe(true); + }); + + test('missing text fails', () => { + const r = validateQuery(mkValidQuery({ text: '' })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'text')).toBe(true); + }); + + test('invalid tier fails', () => { + const r = validateQuery(mkValidQuery({ tier: 'invalid' as unknown as 'easy' })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'tier')).toBe(true); + }); + + test('invalid expected_output_type fails', () => { + const r = validateQuery(mkValidQuery({ + expected_output_type: 'nonsense' as unknown as 'answer-string', + })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'expected_output_type')).toBe(true); + }); +}); + +describe('validateQuery — temporal as_of_date rule', () => { + test('non-temporal query without as_of_date passes', () => { + const r = validateQuery(mkValidQuery({ text: 'Who founded Acme?' })); + expect(r.ok).toBe(true); + }); + + test('"is" verb without as_of_date fails', () => { + const r = validateQuery(mkValidQuery({ text: 'Where is Sarah working?' })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'as_of_date')).toBe(true); + }); + + test('"was" verb without as_of_date fails', () => { + const r = validateQuery(mkValidQuery({ text: 'Who was at the meeting?' })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'as_of_date')).toBe(true); + }); + + test('"as of" verb without as_of_date fails', () => { + const r = validateQuery(mkValidQuery({ text: 'As of 2024, who invested?' })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'as_of_date')).toBe(true); + }); + + test('temporal query with "corpus-end" passes', () => { + const r = validateQuery(mkValidQuery({ + text: 'Who is the CEO now?', + as_of_date: 'corpus-end', + })); + expect(r.ok).toBe(true); + }); + + test('temporal query with "per-source" passes', () => { + const r = validateQuery(mkValidQuery({ + text: 'Who was at the meeting?', + as_of_date: 'per-source', + })); + expect(r.ok).toBe(true); + }); + + test('temporal query with ISO-8601 date passes', () => { + const r = validateQuery(mkValidQuery({ + text: 'Who was at Acme in 2023?', + as_of_date: '2023-01-01', + })); + expect(r.ok).toBe(true); + }); + + test('temporal query with bogus as_of_date fails', () => { + const r = validateQuery(mkValidQuery({ + text: 'Who is the CEO now?', + as_of_date: 'yesterday', + })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'as_of_date')).toBe(true); + }); +}); + +describe('validateQuery — gold shape by expected_output_type', () => { + test('cited-source-pages requires gold.relevant array', () => { + const r = validateQuery(mkValidQuery({ + expected_output_type: 'cited-source-pages', + gold: {}, + })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'gold.relevant')).toBe(true); + }); + + test('cited-source-pages with malformed slug fails', () => { + const r = validateQuery(mkValidQuery({ + gold: { relevant: ['not-a-slug'] }, + })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'gold.relevant')).toBe(true); + }); + + test('abstention requires expected_abstention=true', () => { + const r = validateQuery(mkValidQuery({ + expected_output_type: 'abstention', + gold: {}, + })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'gold.expected_abstention')).toBe(true); + }); + + test('abstention with expected_abstention=true passes', () => { + const r = validateQuery(mkValidQuery({ + expected_output_type: 'abstention', + gold: { expected_abstention: true }, + })); + expect(r.ok).toBe(true); + }); +}); + +describe('validateQuery — tier 5.5 author requirement', () => { + test('externally-authored without author fails', () => { + const r = validateQuery(mkValidQuery({ + tier: 'externally-authored', + })); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.field === 'author')).toBe(true); + }); + + test('externally-authored with author passes', () => { + const r = validateQuery(mkValidQuery({ + tier: 'externally-authored', + author: 'synthetic-outsider-v1', + })); + expect(r.ok).toBe(true); + }); +}); + +describe('validateQuerySet — batch level', () => { + test('duplicate ids fail', () => { + const r = validateQuerySet([ + mkValidQuery({ id: 'q-0001' }), + mkValidQuery({ id: 'q-0001' }), + ]); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.reason === 'duplicate id in batch')).toBe(true); + }); + + test('unique ids pass', () => { + const r = validateQuerySet([ + mkValidQuery({ id: 'q-0001' }), + mkValidQuery({ id: 'q-0002' }), + ]); + expect(r.ok).toBe(true); + }); + + test('issues from multiple queries aggregated', () => { + const r = validateQuerySet([ + mkValidQuery({ id: 'q-0001', text: '' }), // missing text + mkValidQuery({ id: 'q-0002', text: 'Who is CEO now?' }), // missing as_of_date + ]); + expect(r.ok).toBe(false); + expect(r.issues.some(i => i.queryId === 'q-0001' && i.field === 'text')).toBe(true); + expect(r.issues.some(i => i.queryId === 'q-0002' && i.field === 'as_of_date')).toBe(true); + }); +}); + +describe('TEMPORAL_VERBS regex', () => { + test('matches common temporal verbs', () => { + expect(TEMPORAL_VERBS.test('Where is Sarah?')).toBe(true); + expect(TEMPORAL_VERBS.test('Where was Sarah?')).toBe(true); + expect(TEMPORAL_VERBS.test('Who were the founders?')).toBe(true); + expect(TEMPORAL_VERBS.test('What was the current valuation?')).toBe(true); + expect(TEMPORAL_VERBS.test('Where is she now?')).toBe(true); + expect(TEMPORAL_VERBS.test('At the time, who led the round?')).toBe(true); + expect(TEMPORAL_VERBS.test('During Q1, which deals closed?')).toBe(true); + expect(TEMPORAL_VERBS.test('As of 2024, who works at Acme?')).toBe(true); + expect(TEMPORAL_VERBS.test('When did Alice join?')).toBe(true); + }); + + test('does not match non-temporal verbs', () => { + expect(TEMPORAL_VERBS.test('Who founded Acme?')).toBe(false); + expect(TEMPORAL_VERBS.test('Which investors backed Beta?')).toBe(false); + expect(TEMPORAL_VERBS.test('List the advisors at Gamma.')).toBe(false); + }); +}); diff --git a/eval/runner/queries/validator.ts b/eval/runner/queries/validator.ts new file mode 100644 index 00000000..653c7d9b --- /dev/null +++ b/eval/runner/queries/validator.ts @@ -0,0 +1,204 @@ +/** + * Runtime Query schema validator. + * + * Per the v1.1 eng pass 2 spec. Hand-rolled (no zod dep) to match existing + * gbrain codebase style (see src/core/yaml-lite.ts for precedent). + * + * Validates: + * - Required fields (id, tier, text, expected_output_type, gold) + * - Tier enum + * - expected_output_type enum + * - Temporal `as_of_date` rule: any query with a temporal verb MUST + * set as_of_date to ISO-8601 | "corpus-end" | "per-source" + * - id uniqueness within a batch + * - gold.relevant structure when the expected_output_type is + * 'cited-source-pages' (most common tier-1/2/3 pattern) + * + * Public functions: + * validateQuery(q) -> ValidationResult single-query + * validateQuerySet(qs) -> ValidationResult<batch> + * + * On failure, returns human-readable reasons with the offending query id + * so `eval:query:validate` can point contributors at the exact problem. + */ + +import type { Query, Tier, ExpectedOutputType } from '../types.ts'; + +// ─── Enums ───────────────────────────────────────────────────────── + +const VALID_TIERS: readonly Tier[] = [ + 'easy', 'medium', 'hard', 'adversarial', 'fuzzy', 'externally-authored', +] as const; + +const VALID_OUTPUT_TYPES: readonly ExpectedOutputType[] = [ + 'answer-string', + 'canonical-entity-id', + 'cited-source-pages', + 'time-qualified-answer', + 'abstention', + 'contradiction-explanation', + 'poison-flag', + 'confidence-score', +] as const; + +// ─── Temporal rule (per eng pass 2) ──────────────────────────────── + +/** + * Regex for detecting temporal verbs in query text. If any of these + * appear, the query is temporal and MUST carry an `as_of_date` field. + * Without that, scoring is ambiguous (which version of the fact is + * considered correct?). + */ +export const TEMPORAL_VERBS = + /\b(is|was|were|current|now|at the time|during|as of|when did)\b/i; + +const ISO_DATE_RE = /^\d{4}-\d{2}-\d{2}(T.*)?$/; + +// ─── Types ───────────────────────────────────────────────────────── + +export interface ValidationIssue { + queryId: string; + field: string; + reason: string; +} + +export interface ValidationResult { + ok: boolean; + issues: ValidationIssue[]; + /** Count of queries processed (for batch). 1 for single-query validation. */ + total: number; +} + +// ─── Individual query validation ─────────────────────────────────── + +export function validateQuery(q: Query): ValidationResult { + const issues: ValidationIssue[] = []; + const qid = q.id || '(missing id)'; + + if (!q.id || typeof q.id !== 'string' || q.id.trim().length === 0) { + issues.push({ queryId: qid, field: 'id', reason: 'id must be a non-empty string (e.g. "q-0001")' }); + } + if (!q.text || typeof q.text !== 'string' || q.text.trim().length === 0) { + issues.push({ queryId: qid, field: 'text', reason: 'text must be a non-empty string' }); + } + if (!VALID_TIERS.includes(q.tier)) { + issues.push({ queryId: qid, field: 'tier', reason: `tier must be one of ${VALID_TIERS.join(', ')}` }); + } + if (!VALID_OUTPUT_TYPES.includes(q.expected_output_type)) { + issues.push({ + queryId: qid, + field: 'expected_output_type', + reason: `expected_output_type must be one of ${VALID_OUTPUT_TYPES.join(', ')}`, + }); + } + if (!q.gold || typeof q.gold !== 'object') { + issues.push({ queryId: qid, field: 'gold', reason: 'gold must be an object' }); + } + + // Temporal as-of-date rule (eng pass 2). + if (q.text && TEMPORAL_VERBS.test(q.text)) { + if (q.as_of_date === undefined || q.as_of_date === null || q.as_of_date === '') { + issues.push({ + queryId: qid, + field: 'as_of_date', + reason: + 'temporal verb detected; as_of_date required. Set to "corpus-end", "per-source", or an ISO-8601 date.', + }); + } else if ( + q.as_of_date !== 'corpus-end' && + q.as_of_date !== 'per-source' && + !ISO_DATE_RE.test(q.as_of_date) + ) { + issues.push({ + queryId: qid, + field: 'as_of_date', + reason: 'as_of_date must be "corpus-end", "per-source", or ISO-8601 (YYYY-MM-DD)', + }); + } + } + + // If expected_output_type is cited-source-pages, gold.relevant should exist + // and be a non-empty array of slug-like strings. + if (q.expected_output_type === 'cited-source-pages') { + const rel = (q.gold as Record<string, unknown>)?.relevant; + if (!Array.isArray(rel) || rel.length === 0) { + issues.push({ + queryId: qid, + field: 'gold.relevant', + reason: 'cited-source-pages queries require gold.relevant[] with at least one slug', + }); + } else { + for (const s of rel) { + if (typeof s !== 'string' || !/^[a-z][a-z0-9-]*\/[a-z0-9][a-z0-9-]*$/.test(s)) { + issues.push({ + queryId: qid, + field: 'gold.relevant', + reason: `slug "${s}" does not match "dir/slug" format (e.g. "people/alice-chen")`, + }); + break; // one message per query is enough + } + } + } + } + + // Abstention queries MUST set expected_abstention to true. + if (q.expected_output_type === 'abstention') { + const expAb = (q.gold as Record<string, unknown>)?.expected_abstention; + if (expAb !== true) { + issues.push({ + queryId: qid, + field: 'gold.expected_abstention', + reason: 'abstention queries require gold.expected_abstention === true', + }); + } + } + + // Tier 5.5 externally-authored queries must carry an author field. + if (q.tier === 'externally-authored') { + if (!q.author || typeof q.author !== 'string' || q.author.trim().length === 0) { + issues.push({ + queryId: qid, + field: 'author', + reason: 'externally-authored queries require an author field (e.g. "@alice-researcher" or "synthetic-outsider-v1")', + }); + } + } + + return { ok: issues.length === 0, issues, total: 1 }; +} + +// ─── Batch validation ─────────────────────────────────────────────── + +export function validateQuerySet(queries: Query[]): ValidationResult { + const issues: ValidationIssue[] = []; + const seenIds = new Set<string>(); + + for (const q of queries) { + const r = validateQuery(q); + issues.push(...r.issues); + + // Duplicate ID check (batch-level). + if (q.id) { + if (seenIds.has(q.id)) { + issues.push({ queryId: q.id, field: 'id', reason: 'duplicate id in batch' }); + } + seenIds.add(q.id); + } + } + + return { ok: issues.length === 0, issues, total: queries.length }; +} + +// ─── Formatting helpers (for CLI output) ─────────────────────────── + +export function formatIssues(result: ValidationResult): string { + if (result.ok) { + return `\u2713 All ${result.total} queries valid.`; + } + const lines: string[] = []; + lines.push(`\u2717 ${result.issues.length} issue(s) across ${result.total} query/queries:`); + for (const issue of result.issues) { + lines.push(` [${issue.queryId}] ${issue.field}: ${issue.reason}`); + } + return lines.join('\n'); +} diff --git a/eval/runner/type-accuracy.ts b/eval/runner/type-accuracy.ts new file mode 100644 index 00000000..738a3fcf --- /dev/null +++ b/eval/runner/type-accuracy.ts @@ -0,0 +1,384 @@ +/** + * BrainBench — per-link-type accuracy on the 240-page rich-prose corpus. + * + * This is the measurement tool for v0.10.5+ extraction work. It: + * 1. Loads all pages from eval/data/world-v1/ + * 2. Derives GOLD expected links per page from `_facts` metadata + * (founders → founded, investors → invested_in, advisors → advises, + * employees → works_at, attendees → attended, primary_affiliation → + * works_at or founded based on role) + * 3. Runs extractPageLinks on each page → INFERRED links + * 4. Compares gold vs inferred per link type: + * - correctly_typed: gold (src, tgt) exists AND inferred type matches + * - mistyped: gold (src, tgt) exists AND inferred type differs + * - missed: gold (src, tgt) exists AND no inferred edge + * - spurious: inferred (src, tgt) with no gold edge at all + * + * Emits a per-link-type table with type accuracy per type + overall. + * Headline metric: TYPE ACCURACY = correctly_typed / (correctly_typed + mistyped) + * conditional on the edge being found at all (excludes missed). + * + * Also emits a COMBINED metric: F1 per link type treating type as part of + * the identity — (src, tgt, type) triple must match. Catches both the + * extraction-recall problem and the type-accuracy problem in one number. + * + * Usage: bun eval/runner/type-accuracy.ts [--json] + */ + +import { readdirSync, readFileSync } from 'fs'; +import { join } from 'path'; +import { extractPageLinks } from '../../src/core/link-extraction.ts'; +import type { PageType } from '../../src/core/types.ts'; + +interface RichPage { + slug: string; + type: 'person' | 'company' | 'meeting' | 'concept'; + title: string; + compiled_truth: string; + timeline: string; + _facts: { + type: string; + name?: string; + role?: string; + industry?: string; + primary_affiliation?: string; + secondary_affiliations?: string[]; + founders?: string[]; + employees?: string[]; + investors?: string[]; + advisors?: string[]; + attendees?: string[]; + related_companies?: string[]; + }; +} + +interface GoldEdge { + from: string; + to: string; + type: string; +} + +/** Load all rich-prose pages from the world-v1 shard directory. */ +function loadCorpus(dir: string): RichPage[] { + const files = readdirSync(dir).filter(f => f.endsWith('.json') && !f.startsWith('_')); + const out: RichPage[] = []; + for (const f of files) { + const p = JSON.parse(readFileSync(join(dir, f), 'utf-8')); + if (Array.isArray(p.timeline)) p.timeline = p.timeline.join('\n'); + if (Array.isArray(p.compiled_truth)) p.compiled_truth = p.compiled_truth.join('\n\n'); + p.title = String(p.title ?? ''); + p.compiled_truth = String(p.compiled_truth ?? ''); + p.timeline = String(p.timeline ?? ''); + out.push(p as RichPage); + } + return out; +} + +/** + * Derive the gold edge set from `_facts` metadata. Only edges where both + * endpoints are real pages in the corpus count (FK-constraint style). + * + * Rules: + * company.founders -> founded (person -> company) + * company.employees -> works_at (person -> company) + * company.investors -> invested_in (person -> company) + * company.advisors -> advises (person -> company) + * meeting.attendees -> attended (person -> meeting) + * person.primary_affiliation + role=founder -> founded + * person.primary_affiliation + role∈{engineer,...}-> works_at + * person.primary_affiliation + role=advisor -> advises + * person.primary_affiliation + role=investor/partner -> invested_in + * person.secondary_affiliations + role=advisor -> advises + */ +function buildGoldEdges(pages: RichPage[]): GoldEdge[] { + const existing = new Set(pages.map(p => p.slug)); + const edges: GoldEdge[] = []; + const push = (from: string, to: string, type: string) => { + if (!existing.has(from) || !existing.has(to)) return; + if (from === to) return; + edges.push({ from, to, type }); + }; + + // Company-page -> incoming edges from people referenced in _facts arrays. + for (const p of pages) { + if (p._facts.type === 'company') { + for (const f of p._facts.founders ?? []) push(f, p.slug, 'founded'); + for (const e of p._facts.employees ?? []) { + // Avoid double-labeling: if e is also a founder, prefer founded (more specific). + if ((p._facts.founders ?? []).includes(e)) continue; + push(e, p.slug, 'works_at'); + } + for (const i of p._facts.investors ?? []) push(i, p.slug, 'invested_in'); + for (const a of p._facts.advisors ?? []) push(a, p.slug, 'advises'); + } + if (p._facts.type === 'meeting') { + // Direction: extractPageLinks on a meeting page produces + // (meeting_slug, person_slug, 'attended') because the person slugs + // appear as entity refs inside the meeting page's content. Match that + // direction in the gold so (from, to) pairs align with the inferred set. + for (const a of p._facts.attendees ?? []) push(p.slug, a, 'attended'); + } + } + + // Person-page -> outgoing primary_affiliation + secondaries. + for (const p of pages) { + if (p._facts.type !== 'person') continue; + const role = (p._facts.role ?? '').toLowerCase(); + const primary = p._facts.primary_affiliation; + if (primary && existing.has(primary)) { + if (['founder', 'co-founder'].includes(role)) push(p.slug, primary, 'founded'); + else if (role === 'advisor') push(p.slug, primary, 'advises'); + else if (['partner', 'investor', 'vc'].includes(role)) push(p.slug, primary, 'invested_in'); + else push(p.slug, primary, 'works_at'); + } + for (const sec of p._facts.secondary_affiliations ?? []) { + if (!existing.has(sec)) continue; + // Secondary affiliations are typically advisory / board work. + if (role === 'advisor') push(p.slug, sec, 'advises'); + else if (['partner', 'investor', 'vc'].includes(role)) push(p.slug, sec, 'invested_in'); + else push(p.slug, sec, 'mentions'); + } + } + + // Dedup (same from/to/type edge could be added via multiple rules). + const seen = new Set<string>(); + const dedup: GoldEdge[] = []; + for (const e of edges) { + const k = `${e.from}\u0000${e.to}\u0000${e.type}`; + if (seen.has(k)) continue; + seen.add(k); + dedup.push(e); + } + return dedup; +} + +/** Run extractPageLinks on every page; return flat list of inferred edges. */ +function inferAllEdges(pages: RichPage[]): GoldEdge[] { + const edges: GoldEdge[] = []; + for (const p of pages) { + const content = `${p.title}\n\n${p.compiled_truth}\n\n${p.timeline}`; + const candidates = extractPageLinks(content, {}, p.type as PageType); + for (const c of candidates) { + edges.push({ from: p.slug, to: c.targetSlug, type: c.linkType }); + } + } + return edges; +} + +interface PerTypeResult { + linkType: string; + gold: number; + correctly_typed: number; // gold edge present AND inferred type matches + mistyped: number; // gold edge present AND inferred type differs + missed: number; // gold edge present AND no inferred edge + spurious: number; // inferred edge (this type) with no matching gold (any type) for (from, to) + type_accuracy: number; // correctly_typed / (correctly_typed + mistyped) [conditional on finding the edge] + recall: number; // correctly_typed / gold + precision: number; // correctly_typed / (correctly_typed + spurious_this_type_and_mistyped_from_other_types_into_this_type) + f1_strict: number; // F1 where (from, to, type) triple must match exactly +} + +interface ConfusionMatrix { + // matrix[goldType][inferredType] = count + [goldType: string]: Record<string, number>; +} + +function score(gold: GoldEdge[], inferred: GoldEdge[]): { + perType: PerTypeResult[]; + confusion: ConfusionMatrix; + overallTypeAccuracy: number; + overallStrictF1: number; +} { + // Index gold by (from, to) pair — regardless of type. + const goldByPair = new Map<string, string>(); // key: from\u0000to → type + for (const g of gold) { + goldByPair.set(`${g.from}\u0000${g.to}`, g.type); + } + + // Index inferred by (from, to, type). + const inferredByPair = new Map<string, string>(); + for (const i of inferred) { + const key = `${i.from}\u0000${i.to}`; + // Keep the first inferred type for each pair; extractPageLinks already dedupes by (targetSlug, linkType). + if (!inferredByPair.has(key)) { + inferredByPair.set(key, i.type); + } + } + + const linkTypes = new Set<string>(); + for (const g of gold) linkTypes.add(g.type); + for (const i of inferred) linkTypes.add(i.type); + + // Build confusion matrix: gold type → inferred type counts. + const confusion: ConfusionMatrix = {}; + for (const t of linkTypes) confusion[t] = {}; + + for (const [pair, goldType] of goldByPair) { + const inferredType = inferredByPair.get(pair) ?? '(missing)'; + confusion[goldType][inferredType] = (confusion[goldType][inferredType] ?? 0) + 1; + } + // Spurious edges (inferred without gold) tracked under '(no-gold)' rows. + confusion['(no-gold)'] = {}; + for (const [pair, inferredType] of inferredByPair) { + if (!goldByPair.has(pair)) { + confusion['(no-gold)'][inferredType] = (confusion['(no-gold)'][inferredType] ?? 0) + 1; + } + } + + const perType: PerTypeResult[] = []; + let overallCorrectlyTyped = 0; + let overallFound = 0; + let overallGold = 0; + let overallInferredThisTypeOrMistyped = 0; + // For overall strict F1: + let overallStrictTP = 0; + let overallStrictFP = 0; + let overallStrictFN = 0; + + for (const t of linkTypes) { + if (t === '(no-gold)' || t === '(missing)') continue; + const goldCount = Object.values(confusion[t] ?? {}).reduce((a, b) => a + b, 0); + const correctly_typed = confusion[t]?.[t] ?? 0; + const missed = confusion[t]?.['(missing)'] ?? 0; + const mistyped = goldCount - correctly_typed - missed; + + // Spurious for this type: inferred as this type, but gold had a different type OR no gold edge. + let spurious = 0; + // (a) inferred as t where gold was a DIFFERENT type: sum column t across other goldTypes + for (const gt of Object.keys(confusion)) { + if (gt === t || gt === '(missing)' || gt === '(no-gold)') continue; + spurious += confusion[gt]?.[t] ?? 0; + } + // (b) inferred as t with no gold edge at all + spurious += confusion['(no-gold)']?.[t] ?? 0; + + const found = correctly_typed + mistyped; // edge found (any type) where gold existed + const type_accuracy = found > 0 ? correctly_typed / found : 0; + const recall = goldCount > 0 ? correctly_typed / goldCount : 0; + const precisionDenom = correctly_typed + spurious; + const precision = precisionDenom > 0 ? correctly_typed / precisionDenom : 0; + const f1_strict = + precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0; + + perType.push({ + linkType: t, + gold: goldCount, + correctly_typed, + mistyped, + missed, + spurious, + type_accuracy, + recall, + precision, + f1_strict, + }); + + overallCorrectlyTyped += correctly_typed; + overallFound += found; + overallGold += goldCount; + overallInferredThisTypeOrMistyped += correctly_typed + spurious; + overallStrictTP += correctly_typed; + overallStrictFN += mistyped + missed; // gold edges not correctly-typed + overallStrictFP += spurious; + } + + const overallTypeAccuracy = overallFound > 0 ? overallCorrectlyTyped / overallFound : 0; + const overallStrictPrecision = + overallStrictTP + overallStrictFP > 0 + ? overallStrictTP / (overallStrictTP + overallStrictFP) + : 0; + const overallStrictRecall = + overallStrictTP + overallStrictFN > 0 + ? overallStrictTP / (overallStrictTP + overallStrictFN) + : 0; + const overallStrictF1 = + overallStrictPrecision + overallStrictRecall > 0 + ? (2 * overallStrictPrecision * overallStrictRecall) / + (overallStrictPrecision + overallStrictRecall) + : 0; + + // Sort perType by gold count descending (most common first). + perType.sort((a, b) => b.gold - a.gold); + + return { perType, confusion, overallTypeAccuracy, overallStrictF1 }; +} + +function pct(n: number, digits = 1): string { + return `${(n * 100).toFixed(digits)}%`; +} + +async function main() { + const json = process.argv.includes('--json'); + const dir = process.argv.find(a => a.startsWith('--dir='))?.slice('--dir='.length) ?? + 'eval/data/world-v1'; + const log = json ? () => {} : console.log; + + log('# BrainBench — type accuracy on rich-prose corpus\n'); + log(`Generated: ${new Date().toISOString().slice(0, 19)}`); + log(`Corpus: ${dir}/`); + + const pages = loadCorpus(dir); + log(`Loaded ${pages.length} pages.\n`); + + const gold = buildGoldEdges(pages); + const inferred = inferAllEdges(pages); + + log(`Gold edges (from _facts): ${gold.length}`); + log(`Inferred edges (extractPageLinks): ${inferred.length}\n`); + + const { perType, confusion, overallTypeAccuracy, overallStrictF1 } = score(gold, inferred); + + log('## Per-link-type results\n'); + log('| Link type | Gold | Correct | Mistyped | Missed | Spurious | Type acc | Recall | Prec | F1 (strict) |'); + log('|--------------|------|---------|----------|--------|----------|----------|--------|--------|-------------|'); + for (const r of perType) { + log( + `| ${r.linkType.padEnd(12)} | ${String(r.gold).padStart(4)} | ${String(r.correctly_typed).padStart(7)} | ${String(r.mistyped).padStart(8)} | ${String(r.missed).padStart(6)} | ${String(r.spurious).padStart(8)} | ${pct(r.type_accuracy).padStart(8)} | ${pct(r.recall).padStart(6)} | ${pct(r.precision).padStart(6)} | ${pct(r.f1_strict).padStart(11)} |`, + ); + } + log(''); + log('**Columns:**'); + log('- *Type acc*: given the edge was found at all, was it typed correctly? `correct / (correct + mistyped)`.'); + log('- *Recall*: of gold edges, how many did we correctly find AND type? `correct / gold`.'); + log('- *Precision*: of edges we inferred as this type, how many were actually this type? `correct / (correct + spurious)`.'); + log('- *F1 (strict)*: strict `(from, to, type)` triple match. Catches both extraction-recall and type-accuracy misses in one number.\n'); + + log('## Overall\n'); + log(`- Overall type accuracy (conditional on finding the edge): **${pct(overallTypeAccuracy)}**`); + log(`- Overall strict F1 (triple match): **${pct(overallStrictF1)}**\n`); + + log('## Confusion matrix (rows = gold type, cols = inferred type)\n'); + const inferredCols = Array.from( + new Set( + Object.values(confusion).flatMap(row => Object.keys(row)), + ), + ).sort(); + const rowKeys = Object.keys(confusion).filter(k => k !== '(no-gold)').sort(); + rowKeys.push('(no-gold)'); + + const header = ['gold \\ inferred', ...inferredCols]; + log('| ' + header.map(h => h.padEnd(14)).join(' | ') + ' |'); + log('|' + header.map(() => '----------------').join('|') + '|'); + for (const g of rowKeys) { + if (!confusion[g]) continue; + const row = [g, ...inferredCols.map(ic => String(confusion[g][ic] ?? 0))]; + log('| ' + row.map(v => v.padEnd(14)).join(' | ') + ' |'); + } + log(''); + + if (json) { + console.log(JSON.stringify({ + overallTypeAccuracy, + overallStrictF1, + perType, + confusion, + goldTotal: gold.length, + inferredTotal: inferred.length, + }, null, 2)); + } +} + +main().catch(e => { + console.error(e); + process.exit(1); +}); diff --git a/eval/runner/types.ts b/eval/runner/types.ts new file mode 100644 index 00000000..36a3694d --- /dev/null +++ b/eval/runner/types.ts @@ -0,0 +1,215 @@ +/** + * BrainBench adapter interface (v1.1 Phase 2). + * + * Adapters are configs-under-test. Each one ingests the same raw pages, + * answers the same queries, and emits ranked results. The runner treats + * BrainState as opaque — it never inspects adapter internals. + * + * Ingestion boundary: the runner passes `rawPages: Page[]` in memory. + * Adapters NEVER receive the `gold/` directory path. Gold is consumed only + * by scorers, which the runner calls separately on adapter output. This is + * structural enforcement of the "system-under-test gets only raw pages" + * contract (eng pass 2 requirement). + * + * Precedence of adapter implementations (ships across Phase 2): + * GbrainAdapter (configs A–F — existing gbrain wrapped in the interface) + * RipgrepBm25Adapter (EXT-1 — strong grep-based baseline) + * VectorOnlyAdapter (EXT-2 — commodity vector RAG, same embedder as gbrain) + * HybridNoGraphAdapter(EXT-3 — gbrain hybrid with graph features disabled) + */ + +// ─── Page ──────────────────────────────────────────────────────────── + +/** + * A raw page as the adapter sees it. Slug is the stable ID; + * compiled_truth + timeline are the prose the adapter indexes. + * Frontmatter carries loosely-typed metadata (type, title, etc.). + */ +export interface Page { + slug: string; + type: 'person' | 'company' | 'meeting' | 'concept' | 'deal' | 'project' | 'source' | 'media'; + title: string; + compiled_truth: string; + timeline: string; + /** Optional additional metadata. Adapters should NOT rely on _facts — + * that field is the gold canonical source, reserved for scorers. */ + frontmatter?: Record<string, unknown>; +} + +// ─── Query ─────────────────────────────────────────────────────────── + +export type Tier = + | 'easy' // T1: single-page lookup + | 'medium' // T2: relational, graph-required + | 'hard' // T3: multi-hop + temporal + | 'adversarial' // T4: identity collisions, contradictions + | 'fuzzy' // T5: vague recall, "I know I mentioned it somewhere" + | 'externally-authored'; // T5.5: outside-researcher queries + +export type ExpectedOutputType = + | 'answer-string' + | 'canonical-entity-id' + | 'cited-source-pages' + | 'time-qualified-answer' + | 'abstention' + | 'contradiction-explanation' + | 'poison-flag' + | 'confidence-score'; + +/** + * Gold shape varies by tier. Kept as an open-ended record; scorers + * validate the tier-specific shape. Canonical gold for relational queries + * lives under `relevant` (list of page slugs expected in top-K). + */ +export interface Gold { + relevant?: string[]; + grades?: Record<string, number>; + expected_answer?: string; + expected_entity_id?: string; + expected_citations?: string[]; + expected_abstention?: boolean; + expected_as_of?: string; + [key: string]: unknown; +} + +export interface Query { + id: string; // q-0001 … q-0350 + tier: Tier; + text: string; // natural-language query + expected_output_type: ExpectedOutputType; + gold: Gold; + /** Required for temporal queries — see eng pass 2 validator spec. */ + as_of_date?: string | 'corpus-end' | 'per-source'; + acceptable_variants?: string[]; // for LLM-judged outputs + known_failure_modes?: string[]; + author?: string; // set for Tier 5.5 externally-authored + tags?: string[]; // 'identity-collision', 'contradiction', etc. +} + +// ─── RankedDoc ────────────────────────────────────────────────────── + +/** + * Adapter output per query: a ranked list of pages the adapter believes + * are relevant. Score semantics are adapter-specific; only RANK matters + * for top-K metrics. Scorers never compare scores across adapters. + */ +export interface RankedDoc { + page_id: string; // page slug + score: number; // adapter-internal relevance score (not comparable across adapters) + rank: number; // 1-based rank within this query's result list + /** Optional snippet of supporting text. Useful for citation scoring. */ + snippet?: string; +} + +// ─── PoisonDisposition ────────────────────────────────────────────── + +/** + * Per eng pass 2 spec. Each poison item in the corpus is tagged with an + * `expected_behavior` in gold; the adapter reports which behavior it + * chose via getPoisonDisposition(). Scorer matches adapter disposition + * against expected to compute poison_resistance. + */ +export type PoisonDisposition = + | 'exclude' // never ingested; page not in index + | 'quarantine' // ingested but tagged; not returned in normal queries + | 'warn' // retrievable with a warning flag on results + | 'ignore' // indexed but not used for factual answers + | 'mark-untrusted';// provenance metadata flags source as untrusted + +// ─── AdapterConfig ────────────────────────────────────────────────── + +/** + * Per-adapter configuration knobs. Adapter implementations extend this + * with their own fields (e.g. GbrainAdapter takes `config: 'A' | ... | 'F'`). + */ +export interface AdapterConfig { + /** Human-readable adapter name (shown in scorecards). */ + name: string; + /** Top-K truncation the scorer uses (adapter is free to return more). */ + k?: number; + /** Adapter-specific options. */ + [key: string]: unknown; +} + +// ─── BrainState ───────────────────────────────────────────────────── + +/** + * OPAQUE to the runner. Each adapter internally defines its own shape: + * RipgrepBm25Adapter BrainState = an in-memory inverted index + doc-length table + * GbrainAdapter BrainState = a PGLite engine handle + .db file path + * VectorOnlyAdapter BrainState = embedding index + cached vectors + * + * The runner only uses it as an adapter-internal state handle. Never + * inspected, serialized, or cross-type-checked. This is the structural + * boundary that prevents a runner bug from leaking implementation details + * across adapters. + */ +export type BrainState = unknown; + +// ─── Adapter interface ────────────────────────────────────────────── + +export interface Adapter { + /** The registered adapter name, e.g. "gbrain-a" | "ripgrep-bm25". */ + readonly name: string; + + /** + * Ingest the raw pages and build internal state. Called ONCE per + * benchmark run. Adapters that need warming (embeddings, indexes) do + * that work here. + */ + init(rawPages: Page[], config: AdapterConfig): Promise<BrainState>; + + /** + * Answer a single query. Adapters return their top results in rank + * order. The scorer applies the query's `k` cutoff; adapters are free + * to return fewer than k (with a shorter list). + */ + query(q: Query, state: BrainState): Promise<RankedDoc[]>; + + /** + * Persist the brain state to a filesystem path (for reproducibility + + * cross-task state sharing). Returns the path. Optional — adapters + * that don't support snapshotting can return an empty string. + */ + snapshot?(state: BrainState): Promise<string>; + + /** + * Per-page poison disposition: what did the adapter do with each + * poison item? Scorer compares to gold's `expected_behavior`. + * Adapters that don't have a poison path return an empty map. + */ + getPoisonDisposition?(state: BrainState): Record<string, PoisonDisposition>; + + /** + * Release any resources held by `state` (DB connections, file locks, + * worker threads). Called once per run after scoring completes. + * Adapters that hold no resources can omit this. Without it, PGLite-backed + * adapters leak engine workers and Bun exits 99 at the end of the run. + */ + teardown?(state: BrainState): Promise<void>; +} + +// ─── Scorer helpers ───────────────────────────────────────────────── + +/** Standard top-K slice; helper since every scorer needs it. */ +export function topK(docs: RankedDoc[], k: number): RankedDoc[] { + return docs.slice(0, k); +} + +/** Precision@k: fraction of top-k that are in relevant set. */ +export function precisionAtK(docs: RankedDoc[], relevant: Set<string>, k: number): number { + const topDocs = topK(docs, k); + if (topDocs.length === 0) return 0; + let hits = 0; + for (const d of topDocs) if (relevant.has(d.page_id)) hits++; + return hits / topDocs.length; +} + +/** Recall@k: fraction of relevant found in top-k. */ +export function recallAtK(docs: RankedDoc[], relevant: Set<string>, k: number): number { + if (relevant.size === 0) return 0; + const topDocs = topK(docs, k); + let hits = 0; + for (const d of topDocs) if (relevant.has(d.page_id)) hits++; + return hits / relevant.size; +} diff --git a/package.json b/package.json index 740147f3..f0b117ab 100644 --- a/package.json +++ b/package.json @@ -22,7 +22,15 @@ "build:schema": "bash scripts/build-schema.sh", "test": "scripts/check-jsonb-pattern.sh && bun test", "test:e2e": "bun test test/e2e/", + "test:eval": "bun test eval/runner/queries/validator.test.ts eval/runner/adapters/ eval/generators/world-html.test.ts", "check:jsonb": "scripts/check-jsonb-pattern.sh", + "eval:run": "bun eval/runner/multi-adapter.ts", + "eval:run:dev": "BRAINBENCH_N=1 bun eval/runner/multi-adapter.ts", + "eval:world:view": "bun eval/cli/world-view.ts", + "eval:world:render": "bun eval/cli/world-view.ts --no-open", + "eval:query:validate": "bun eval/cli/query-validate.ts", + "eval:query:new": "bun eval/cli/query-new.ts", + "eval:type-accuracy": "bun eval/runner/type-accuracy.ts", "postinstall": "gbrain --version >/dev/null 2>&1 && gbrain apply-migrations --yes --non-interactive 2>/dev/null || true", "prepublish:clawhub": "bun run build:all", "publish:clawhub": "clawhub package publish . --family bundle-plugin" diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts index 55570b59..7ea447a5 100644 --- a/src/core/link-extraction.ts +++ b/src/core/link-extraction.ts @@ -207,7 +207,20 @@ function excerpt(s: string, idx: number, width: number): string { // explicit "advisor"/"advise" rooting. // Employment context: position + at/of, or explicit work verbs. -const WORKS_AT_RE = /\b(?:CEO of|CTO of|COO of|CFO of|CMO of|CRO of|VP at|VP of|VPs? Engineering|VPs? Product|works at|worked at|working at|employed by|employed at|joined as|joined the team|engineer at|engineer for|director at|director of|head of|leads engineering|leads product|currently at|previously at|previously worked at|spent .* (?:years|months) at|stint at|tenure at)\b/i; +// +// v0.10.5 additions (drive works_at 58% → >85% on rich prose): +// - Role-prefixed engineer patterns: "senior engineer at", "staff engineer at", +// "principal engineer at", "lead engineer at". Current "engineer at" only +// hits if the word "engineer" is immediately adjacent; prose often uses +// rank-qualified forms. +// - Generic role patterns: "backend engineer at", "frontend engineer at", +// "ML engineer at", "data engineer at", "full-stack engineer at". +// - Broader role verbs: "manages engineering at", "running product at", +// "leads the [team] at", "heads up engineering at". +// - Possessive time: "his time at", "her time at", "their time at", "my time at". +// - Role noun forms: "role at", "tenure as", "stint as", "position at". +// - Promoted/staff-engineer forms: "promoted to (staff|senior|principal) engineer at". +const WORKS_AT_RE = /\b(?:CEO of|CTO of|COO of|CFO of|CMO of|CRO of|VP at|VP of|VPs? Engineering|VPs? Product|works at|worked at|working at|employed by|employed at|joined as|joined the team|engineer at|engineer for|director at|director of|head of|heads up .{0,20} at|leads engineering|leads product|leads the .{0,20} (?:team|org) at|manages engineering at|manages product at|running (?:engineering|product|design) at|currently at|previously at|previously worked at|spent .* (?:years|months) at|stint at|stint as|tenure at|tenure as|role at|position at|(?:senior|staff|principal|lead|backend|frontend|full-?stack|ML|data|security) engineer at|promoted to (?:senior|staff|principal|lead) .{0,20} at|(?:his|her|their|my) time at)\b/i; // Investment context. Order patterns from most-specific to least to keep // regex efficient. Includes funding-round verbs ("led the seed", "led X's @@ -224,13 +237,39 @@ const FOUNDED_RE = /\b(?:founded|co-?founded|started the company|incorporated|fo // Advise context: must be rooted in "advisor"/"advise" (investors also sit on // boards). Keep "board advisor" / "advisory board" but drop generic "board // member" / "sits on the board" which over-matches. -const ADVISES_RE = /\b(?:advises|advised|advisor (?:to|at|for|of)|advisory (?:board|role|position)|board advisor|on .{0,20} advisory board|joined .{0,20} advisory board)\b/i; +// +// v0.10.5 additions (drive advises 41% → >85% on rich prose): +// - Advisory capacity phrasings: "in an advisory capacity", "advisory engagement", +// "advisory partnership", "advisory contract", "advisory relationship". +// - "as an advisor" form: joined/serves/brought on "as an advisor" / "as a +// security advisor" / "as a technical advisor" / "as an industry advisor". +// - "consults for / consulting role": advisor-adjacent verbs that appear in +// narratives where the direct "advises" verb isn't used. +// - Advisor-qualified: "strategic advisor to|at", "technical advisor to|at", +// "security advisor to|at", "product advisor to|at", "industry advisor". +const ADVISES_RE = /\b(?:advises|advised|advisor (?:to|at|for|of)|advisory (?:board|role|position|capacity|engagement|partnership|contract|relationship|work)|board advisor|on .{0,20} advisory board|joined .{0,20} advisory board|in an? advisory (?:capacity|role|position)|as an? (?:advisor|security advisor|technical advisor|strategic advisor|industry advisor|product advisor|board advisor|senior advisor)|(?:strategic|technical|security|product|industry|senior|board) advisor (?:to|at|for|of)|consults for|consulting role (?:at|with))\b/i; // Page-role detection: if the source page describes a partner/investor at // page level, that's a strong prior for outbound company refs being // invested_in even when per-edge context lacks explicit investment verbs. const PARTNER_ROLE_RE = /\b(?:partner at|partner of|venture partner|VC partner|invested early|investor at|investor in|portfolio|venture capital|early-stage investor|seed investor|fund [A-Z]|invests across|backs companies)\b/i; -const ADVISOR_ROLE_RE = /\b(?:full-time advisor|professional advisor|advises (?:multiple|several|various))\b/i; + +// Advisor role prior: fires when the page-level description indicates the +// person IS an advisor (not just mentions advising). Broadened in v0.10.5 +// from "full-time/professional/advises multiple" to catch any page that +// self-identifies the subject as an advisor. +const ADVISOR_ROLE_RE = /\b(?:full-time advisor|professional advisor|advises (?:multiple|several|various)|is an? (?:advisor|security advisor|technical advisor|strategic advisor|industry advisor|product advisor|senior advisor)|took on advisory roles|(?:her|his|their) advisory (?:work|role|engagement|portfolio)|serves as (?:an )?advisor)\b/i; + +// Employee role prior (new in v0.10.5): fires when the page-level description +// indicates the person IS an employee (senior/staff/lead engineer, director, +// head, etc.) at some company. Biases outbound company refs on that page +// toward works_at when per-edge verbs are absent (e.g. possessive phrasings +// "her work on Delta's pipeline..." where the verb "works" doesn't appear +// near the slug). +// +// Scope: only fires for person-page → company-page links. Companies' own +// pages mentioning their employees use the page-role layer differently. +const EMPLOYEE_ROLE_RE = /\b(?:is an? (?:senior|staff|principal|lead|backend|frontend|full-?stack|ML|data|security|DevOps|platform)? ?engineer at|is an? (?:senior|staff|principal|lead)? ?(?:developer|designer|product manager|engineering manager|director|VP) (?:at|of)|holds? the (?:CTO|CEO|CFO|COO|CMO|CRO|VP) (?:role|position|seat|title) at|is the (?:CTO|CEO|CFO|COO|CMO|CRO) of|employee at|on the team at|works on .{0,30} at)\b/i; /** * Infer link_type from page context. Deterministic regex heuristics, no LLM. @@ -262,9 +301,15 @@ export function inferLinkType(pageType: PageType, context: string, globalContext // about VC topics naturally contain "venture capital" in their text, but // their company refs are mentions, not investments. Partner pages mentioning // other people (co-investors, friends) should also stay as mentions. + // + // Precedence within priors: investor > advisor > employee. Investors often + // also sit on boards ("board seat at portfolio company") which a naive + // employee/advisor match would mis-classify; keep investor first so those + // phrasings resolve correctly. if (pageType === 'person' && globalContext && targetSlug?.startsWith('companies/')) { if (PARTNER_ROLE_RE.test(globalContext)) return 'invested_in'; if (ADVISOR_ROLE_RE.test(globalContext)) return 'advises'; + if (EMPLOYEE_ROLE_RE.test(globalContext)) return 'works_at'; } return 'mentions'; } diff --git a/test/link-extraction.test.ts b/test/link-extraction.test.ts index e5ed7ec5..7ce3462a 100644 --- a/test/link-extraction.test.ts +++ b/test/link-extraction.test.ts @@ -179,6 +179,123 @@ describe('inferLinkType', () => { test('media page -> mentions (not attended)', () => { expect(inferLinkType('media', 'Alice attended the workshop.')).toBe('mentions'); }); + + // ─── v0.10.5: works_at residuals (drive 58% → >85% on rich prose) ─── + + test('v0.10.5 works_at: rank-prefixed engineer at', () => { + expect(inferLinkType('person', 'Adam is a senior engineer at Delta.')).toBe('works_at'); + expect(inferLinkType('person', 'She is a staff engineer at Stripe.')).toBe('works_at'); + expect(inferLinkType('person', 'Promoted to principal engineer at Acme.')).toBe('works_at'); + }); + + test('v0.10.5 works_at: discipline-prefixed engineer at', () => { + expect(inferLinkType('person', 'Backend engineer at NovaPay.')).toBe('works_at'); + expect(inferLinkType('person', 'Full-stack engineer at Vox.')).toBe('works_at'); + expect(inferLinkType('person', 'ML engineer at DeepMind.')).toBe('works_at'); + expect(inferLinkType('person', 'Security engineer at Stripe.')).toBe('works_at'); + }); + + test('v0.10.5 works_at: possessive time at', () => { + expect(inferLinkType('person', 'During her time at Goldman, she built the team.')).toBe('works_at'); + expect(inferLinkType('person', 'His time at Delta taught him systems thinking.')).toBe('works_at'); + }); + + test('v0.10.5 works_at: leadership verbs beyond "leads engineering"', () => { + expect(inferLinkType('person', 'She heads up design at Beta.')).toBe('works_at'); + expect(inferLinkType('person', 'He manages engineering at Gamma.')).toBe('works_at'); + expect(inferLinkType('person', 'She leads the platform team at Delta.')).toBe('works_at'); + expect(inferLinkType('person', 'Running product at Stripe.')).toBe('works_at'); + }); + + test('v0.10.5 works_at: tenure/stint/role as', () => { + expect(inferLinkType('person', 'Her tenure as head of engineering was short.')).toBe('works_at'); + expect(inferLinkType('person', 'A brief stint as VP of Product.')).toBe('works_at'); + expect(inferLinkType('person', 'His role at Delta was to unblock the pipeline team.')).toBe('works_at'); + }); + + test('v0.10.5 works_at: page-role employee prior for ambiguous context', () => { + // Per-edge context doesn't mention a work verb, but globalContext establishes + // the person IS a senior engineer at a company. The employee role prior + // should bias outbound company refs toward works_at. + const globalContext = 'Adam Lopez is a senior engineer at Delta. His work is excellent.'; + const perEdgeContext = 'Adam is excellent.'; // no work verb in the window + expect(inferLinkType('person', perEdgeContext, globalContext, 'companies/delta-3')).toBe('works_at'); + }); + + test('v0.10.5 works_at: page-role CTO-of prior', () => { + const globalContext = 'Beth is the CTO of Prism, shipping their platform.'; + const perEdgeContext = 'Beth is shipping.'; // no work verb near slug + expect(inferLinkType('person', perEdgeContext, globalContext, 'companies/prism-43')).toBe('works_at'); + }); + + // ─── v0.10.5: advises residuals (drive 41% → >85% on rich prose) ─── + + test('v0.10.5 advises: "as an advisor" / "as a security advisor"', () => { + expect(inferLinkType('person', 'Joined Acme as an advisor in 2022.')).toBe('advises'); + expect(inferLinkType('person', 'Brought on as a security advisor.')).toBe('advises'); + expect(inferLinkType('person', 'Serves as a technical advisor to the team.')).toBe('advises'); + }); + + test('v0.10.5 advises: prefixed advisor (security advisor to X)', () => { + expect(inferLinkType('person', 'She is the security advisor to Orbit Labs.')).toBe('advises'); + expect(inferLinkType('person', 'He is a strategic advisor at Prism.')).toBe('advises'); + expect(inferLinkType('person', 'Product advisor to several early-stage startups.')).toBe('advises'); + }); + + test('v0.10.5 advises: "in an advisory capacity"', () => { + expect(inferLinkType('person', 'Engaged with Prism in an advisory capacity.')).toBe('advises'); + expect(inferLinkType('person', 'Continued in an advisory role through 2024.')).toBe('advises'); + }); + + test('v0.10.5 advises: advisory engagement / partnership / contract', () => { + expect(inferLinkType('person', 'Began a formal advisory engagement with Prism.')).toBe('advises'); + expect(inferLinkType('person', 'Signed an advisory contract last year.')).toBe('advises'); + expect(inferLinkType('person', 'Multi-year advisory partnership with Beta.')).toBe('advises'); + }); + + test('v0.10.5 advises: page-role "is an advisor" prior', () => { + // Per-edge window has no advisor verb (just possessive "her work"), but + // page-level establishes the subject IS an advisor. Prior should fire. + const globalContext = 'Alice Davis is an advisor at Prism. Her work has been invaluable.'; + const perEdgeContext = 'Alice Davis has been invaluable.'; // no advise verb in window + expect(inferLinkType('person', perEdgeContext, globalContext, 'companies/prism-43')).toBe('advises'); + }); + + test('v0.10.5 advises: "serves as advisor" page prior', () => { + // Avoid "portfolio" in global context since that trips PARTNER_ROLE_RE. + // Real advisor pages rarely use "portfolio" (that's a partner word). + const globalContext = 'Beth serves as advisor to three early-stage startups.'; + const perEdgeContext = 'Beth sees Acme regularly.'; + expect(inferLinkType('person', perEdgeContext, globalContext, 'companies/acme')).toBe('advises'); + }); + + // ─── Regression guards: v0.10.5 expansions must not break tightened rules ─── + + test('v0.10.5 regression: generic "board member" still resolves to mentions', () => { + // This was the v0.10.4 tightening. The expanded ADVISES_RE must not + // re-introduce the false-positive on partner bios. + expect(inferLinkType('person', 'Jane is a board member at Beta Health.')).toBe('mentions'); + }); + + test('v0.10.5 regression: "sits on the board" still mentions (not advises)', () => { + expect(inferLinkType('person', 'She sits on the board of Acme.')).toBe('mentions'); + }); + + test('v0.10.5 regression: "backs companies" still resolves to invested_in via partner prior', () => { + // Partner prior takes precedence over employee prior. + const globalContext = 'Wendy is a venture partner who backs companies at the seed stage. Her portfolio is diverse.'; + const perEdgeContext = 'Wendy recently discussed Cipher.'; + expect(inferLinkType('person', perEdgeContext, globalContext, 'companies/cipher-13')).toBe('invested_in'); + }); + + test('v0.10.5 regression: partner + advisor co-mention stays invested_in for investee', () => { + // If someone is both a partner AND mentions advisory work, the outbound + // companies should lean toward invested_in (partner precedence). This + // protects against a common pattern where partners say "I also advise X". + const globalContext = 'Jane is a partner at Accel. She also advises multiple startups.'; + const perEdgeContext = 'Jane has worked with Acme.'; + expect(inferLinkType('person', perEdgeContext, globalContext, 'companies/acme')).toBe('invested_in'); + }); }); // ─── parseTimelineEntries ──────────────────────────────────────