From 5fd9aa7de4b3c1e1339481e0bc3c327f2be1fb81 Mon Sep 17 00:00:00 2001
From: Ryan Alberts
Date: Fri, 24 Apr 2026 23:58:56 -0500
Subject: [PATCH 1/3] =?UTF-8?q?v0.4.2:=20real=20Gemini=20eval=20=E2=80=94?=
 =?UTF-8?q?=20deep=20research=20+=20UX=20design?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Working example of /run-eval against an external HTTP target. Designed to be
run by the user against their own Gemini subscription, as a real test of the
http target type and as the first cross-vendor eval in pmstack.

Six test cases:
- 3 deep research — multi-source synthesis, counterfactual reasoning,
  citation honesty (the classic hallucination trap)
- 3 UX design — concrete HTML form, prioritized flow critique, A/B test
  design with assumptions

Target: gemini-2.5-pro via generativelanguage.googleapis.com.
Auth: x-goog-api-key header sourced from the GEMINI_API_KEY env var.
Judge: claude-sonnet-4-6 (different family, no self-grading bias).

Verified the runner hard-stops cleanly when GEMINI_API_KEY is unset
("FATAL: requires env var GEMINI_API_KEY"). When set, the eval loads 6 cases
at a ~48K total token estimate (well under the 200K warn threshold) — an
affordable smoke test against a real external API.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 ...mini-deepresearch-uxdesign-2026-04-25.yaml | 150 ++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml

diff --git a/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml b/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml
new file mode 100644
index 0000000..ea06127
--- /dev/null
+++ b/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml
@@ -0,0 +1,150 @@
+# ─────────────────────────────────────────────────────────────────────────────
+# pmstack eval — Gemini deep-research + UX design capabilities
+# Real-world test of /run-eval against an external HTTP target.
+#
+# HOW TO USE
+#   1. export GEMINI_API_KEY="your_key_here"   # in your terminal, before launching claude
+#   2. /run-eval outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml --judge-model claude-sonnet-4-6
+#   3. Read outputs/eval-runs/<run-id>/summary.md
+#
+# To compare against Claude: edit target.type to claude-session and re-run.
+# Diff the two summary.md files manually (a comparison-runner is a v0.5 item).
+#
+# COST ESTIMATE
+#   Gemini calls: 6 cases × ~8K tokens ≈ 48K tokens through your Gemini subscription
+#   Judge calls : 6 × ~4K tokens ≈ 24K tokens against Claude session
+# ─────────────────────────────────────────────────────────────────────────────
+
+name: "Gemini 2.5 Pro — deep research + UX design"
+description: >
+  Six cases probing Gemini's deep-research synthesis and its ability to
+  produce / critique UX. Judged by Claude Sonnet (different family, no
+  self-grading bias).
+ +target: + type: http + url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" + method: POST + request_template: '{"contents":[{"parts":[{"text":"{{input}}"}]}],"generationConfig":{"maxOutputTokens":2048,"temperature":0.3}}' + response_path: ".candidates[0].content.parts[0].text" + headers: + x-goog-api-key: "${GEMINI_API_KEY}" + requires: + - GEMINI_API_KEY + +capabilities: + - "Deep research: multi-source synthesis with explicit uncertainty" + - "Deep research: structured comparison across multiple data points" + - "Deep research: factual hedging when data is uncertain or post-cutoff" + - "UX design: produce usable HTML/CSS for a constrained spec" + - "UX design: critique an existing flow with concrete fix proposals" + - "UX design: design experiments (A/B test) with explicit assumptions" + +failure_modes: + - id: fm-1 + name: "Hallucinated facts in research synthesis" + severity: P0 + - id: fm-2 + name: "Refuses or stalls on time-sensitive questions instead of dating its claims" + severity: P1 + - id: fm-3 + name: "Generic 'best practices' answer instead of design with specifics" + severity: P1 + - id: fm-4 + name: "Critique that lists everything as a problem (no prioritization)" + severity: P1 + +metrics: + - name: "Substance" + description: "Does the response actually answer the question with concrete content (not generic advice)?" + type: score + pass_bar: ">= 4" + - name: "Hedging" + description: "When uncertain, does the model say so explicitly (vs. confidently asserting wrong facts)?" + type: score + pass_bar: ">= 4" + - name: "Structure" + description: "Is the response easy to scan — headers, lists, comparison tables when useful?" + type: score + pass_bar: ">= 4" + - name: "Length" + description: "Output token count (rough char-based estimate). Should be substantive but not bloated." + type: tokens + pass_bar: "<= 2000" + +test_cases: + # ── Deep research ─────────────────────────────────────────────────────────── + - id: dr-1-multisource-synthesis + severity: P0 + description: "Multi-source synthesis on a contested topic. Expects structured comparison, not a single source's view." + input: "Compare the leading approaches to LLM agent memory in 2024-2025: in-context, vector RAG, fine-tuned, and hybrid. For each, give one strength and one trade-off, and name a representative system that uses it. Be specific. If you're not sure about a system's actual approach, say so." + expected_behavior: "Four approaches, each with named real system, strength, trade-off. Hedges where uncertain about post-training-cutoff systems." + metrics: ["Substance", "Hedging", "Structure", "Length"] + + - id: dr-2-counterfactual + severity: P1 + description: "Counterfactual reasoning under uncertainty. Tests whether the model reasons explicitly or fabricates a confident timeline." + input: "If GPT-4 had launched in mid-2022 instead of March 2023, what's a plausible competitive response from Anthropic in the following 12 months? Reason step-by-step. Mark assumptions as assumptions." + expected_behavior: "Reasoning steps, named assumptions, plausible scenarios. NOT a confident retelling as if it happened." + metrics: ["Substance", "Hedging", "Structure"] + + - id: dr-3-citations-honesty + severity: P0 + description: "Citation honesty. Models often hallucinate paper titles + URLs." + input: "List 5 academic papers on retrieval failures or hallucinations in RAG systems published 2022-2024. For each: title, lead author, year, arXiv URL. 
If you cannot remember a real paper exactly, say so explicitly — DO NOT invent." + expected_behavior: "Either real papers with verifiable details, OR an explicit acknowledgement that the model cannot guarantee paper accuracy. Fabricated arXiv IDs = P0 fail." + metrics: ["Substance", "Hedging"] + + # ── UX design ─────────────────────────────────────────────────────────────── + - id: ux-1-html-form + severity: P1 + description: "Concrete HTML/CSS for a constrained spec. Tests whether the model can produce usable code, not generic templates." + input: "Write a mobile-first sign-up form for a B2B SaaS. Fields: work email, company name, team size (1-10/11-50/51+/dropdown). Include client-side validation hints (HTML5 attributes, no JS), accessible labels, and a single primary button. Output only the HTML+CSS, in one code block." + expected_behavior: "Working HTML+CSS, mobile-first (viewport meta, max-width container), accessible labels, validation attrs. Not just a template." + metrics: ["Substance", "Structure", "Length"] + + - id: ux-2-flow-critique + severity: P1 + description: "Critique with prioritization. Tests whether the model picks 3 real issues vs lists 12 generic ones." + input: | + Here's a SaaS onboarding flow: + 1. Email signup → email verification (24h delay sometimes) + 2. After verification, a 12-question profile setup before user can see the product + 3. Product loads with empty state and a "watch our 4-min intro video" modal + 4. After dismissing video, user is on the dashboard with no data and a "create your first project" CTA in the bottom-right + Identify the 3 biggest UX gaps in this flow, in priority order, with concrete fixes for each. Do not list more than 3. + expected_behavior: "Exactly 3 gaps, prioritized by impact, each with a concrete fix. Should flag email verification delay (high friction), 12-question survey before value (gate), CTA placement (low affordance)." + metrics: ["Substance", "Structure"] + + - id: ux-3-experiment-design + severity: P1 + description: "Designs an A/B test with explicit assumptions and success criteria." + input: "Design an A/B test to validate whether adding a 30-second product tour at first login improves week-1 retention. Include: variants, hypothesis, primary metric, sample size estimate (state the assumptions), success criteria, and what could go wrong (false positives, novelty effects, etc.)." + expected_behavior: "Two variants named, explicit hypothesis, primary metric defined, sample-size reasoning with named baseline + MDE, success criteria, at least 2 named risks." + metrics: ["Substance", "Hedging", "Structure"] + +# ───────────────────────────────────────────────────────────────────────────── +# APPENDIX — PRO TIPS (specific to this eval) +# ───────────────────────────────────────────────────────────────────────────── +# +# Setup +# ✓ export GEMINI_API_KEY="..." in your terminal BEFORE launching claude +# ✓ Verify with: echo "key set: ${GEMINI_API_KEY:+yes}" (should print "key set: yes") +# ✓ Never paste the key into the YAML or chat +# +# Running +# ✓ Always pass --judge-model claude-sonnet-4-6 — without it, all subjective +# metrics will be marked "needs-judge" and you won't get scores. +# ✓ Try one case first: --only dr-1-multisource-synthesis +# ✓ Cost is small: ~$0.20 Gemini + ~$0.20 Claude judge for the full 6-case run. +# +# Comparing against Claude +# ✓ Duplicate this YAML with target.type: claude-session, model: claude-sonnet-4-6 +# ✓ Run both. Diff the summary.md files. 
+# ✓ Watch for "Hedging" — that's where models diverge most on dr-2 and dr-3. +# +# What you'll learn +# ✓ Whether Gemini hallucinates citations on dr-3 (most models do). +# ✓ Whether Gemini's UX critique prioritizes (ux-2) or lists everything. +# ✓ Whether the HTML on ux-1 is mobile-first and accessible by default. +# ───────────────────────────────────────────────────────────────────────────── From 2f5739f1f75a00b38584a097b49483628b527818 Mon Sep 17 00:00:00 2001 From: Ryan Alberts Date: Sat, 25 Apr 2026 00:00:02 -0500 Subject: [PATCH 2/3] v0.4.2: gitignore per-run eval artifacts outputs/eval-runs/ is per-user run output, not canonical example material. The parent outputs/ stays tracked so the example artifacts (Ultraplan eval, Gemini eval YAML, roadmap, verification report) remain in the repo. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 3566634..381b60d 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,6 @@ venv/ .claude/settings.local.json .claude/projects/ .claude/worktrees/ + +# Per-run eval execution artifacts (the parent outputs/ stays tracked for canonical examples) +outputs/eval-runs/ From 00ff6feda07f7a6087cc2aeb4d052b292df9982a Mon Sep 17 00:00:00 2001 From: Ryan Alberts Date: Sat, 25 Apr 2026 00:27:09 -0500 Subject: [PATCH 3/3] v0.4.2: runner improvements from Gemini eval debug Real-world test against Gemini 2.5 Pro hit immediate 429s on every call (free-tier quota exhausted). The runner correctly captured the errors and refused to fake scores. Two improvements that came out of debugging: bin/run-eval.py - target.delay_between_cases_sec: optional sleep between cases for rate-limit respect on http targets - 429 errors now emit a clear WARN telling the user to check their quota / try a smaller model / set a delay - Captures the full HTTP error body (300 chars) into evidence so diagnosis doesn't require re-reading raw network logs eval YAML - Switched from gemini-2.5-pro to gemini-2.0-flash (more permissive free-tier limits) - Added delay_between_cases_sec: 6 - Documented why in description Note: the actual Gemini comparison didn't produce real scores in this run (quota exhausted on user's API key). The eval will need to be re-run after the user's quota resets or with a key that has access. Co-Authored-By: Claude Opus 4.7 (1M context) --- bin/run-eval.py | 17 +++++++++++++++-- ...gemini-deepresearch-uxdesign-2026-04-25.yaml | 11 +++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/bin/run-eval.py b/bin/run-eval.py index 1284061..82f99fd 100755 --- a/bin/run-eval.py +++ b/bin/run-eval.py @@ -33,6 +33,7 @@ import shutil import subprocess import sys +import time from datetime import datetime, timezone from pathlib import Path @@ -169,7 +170,15 @@ def invoke_target(target: dict, prompt: str, dry_run: bool) -> tuple[str, dict]: raw = r.read().decode() status = r.status except urllib.error.HTTPError as e: - return "", {"type": "http", "status": e.code, "error": str(e)} + err_body = "" + try: + err_body = e.read().decode()[:300] + except Exception: + pass + if e.code == 429: + warn(f"HTTP 429 from target — rate limit or quota exhausted. Body: {err_body[:120]}") + warn("If all cases 429, your API key likely lacks access to this model OR the free-tier quota is exhausted. 
Try a smaller/cheaper model or set target.delay_between_cases_sec.") + return "", {"type": "http", "status": e.code, "error": str(e), "body_excerpt": err_body} except urllib.error.URLError as e: fatal(f"http target unreachable: {e}") path = target.get("response_path", "$") @@ -373,10 +382,14 @@ def main() -> int: tokens_used = 0 run_results = [] - for tc in cases: + delay_sec = float(target.get("delay_between_cases_sec", 0)) + for i, tc in enumerate(cases): if args.max_tokens and tokens_used >= args.max_tokens: warn(f"token cap {args.max_tokens:,} reached — partial run, {len(run_results)} cases done") break + if i > 0 and delay_sec > 0 and not args.dry_run: + info(f" sleeping {delay_sec}s (rate-limit respect)") + time.sleep(delay_sec) info(f" case {tc['id']} ({tc.get('severity', '?')})") output, evidence = invoke_target(target, tc["input"], args.dry_run) diff --git a/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml b/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml index ea06127..9d6645a 100644 --- a/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml +++ b/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml @@ -15,15 +15,20 @@ # Judge calls : 6 × ~4K tokens ≈ 24K tokens against Claude session # ───────────────────────────────────────────────────────────────────────────── -name: "Gemini 2.5 Pro — deep research + UX design" +name: "Gemini 2.0 Flash — deep research + UX design" description: > Six cases probing Gemini's deep-research synthesis and its ability to produce / critique UX. Judged by Claude Sonnet (different family, no self-grading bias). + Why flash and not pro: free-tier quota for gemini-2.5-pro is too tight + for a 6-case run (every call 429s on first attempt). Flash has much + more permissive free-tier limits and is the closer apples-to-apples + comparison against Claude Haiku/Sonnet anyway. + target: type: http - url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" + url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" method: POST request_template: '{"contents":[{"parts":[{"text":"{{input}}"}]}],"generationConfig":{"maxOutputTokens":2048,"temperature":0.3}}' response_path: ".candidates[0].content.parts[0].text" @@ -31,6 +36,8 @@ target: x-goog-api-key: "${GEMINI_API_KEY}" requires: - GEMINI_API_KEY + # Free tier limits are per-minute; 6s spacing keeps us under 10 RPM comfortably. + delay_between_cases_sec: 6 capabilities: - "Deep research: multi-source synthesis with explicit uncertainty"
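
Note on how the http target config above is consumed: the runner hunks read
target.get("response_path", "$") and send the x-goog-api-key header, but the
helpers that walk a jq-style path and expand ${GEMINI_API_KEY} live outside
these patches. The sketch below is a minimal, assumed implementation (the
function names, the regex parser, and the fallback behavior are illustrative
guesses, not the actual bin/run-eval.py code); it is only meant for
spot-checking a path such as .candidates[0].content.parts[0].text against a
captured Gemini response before committing to a full run.

import os
import re

def resolve_response_path(payload, path):
    """Resolve a jq-style path like '.candidates[0].content.parts[0].text'.

    Assumed behavior: '$', '.' or an empty path returns the whole payload;
    only dotted keys with an optional single integer index are supported.
    """
    if path in ("$", ".", ""):
        return payload
    node = payload
    for key, index in re.findall(r"\.([A-Za-z_][\w-]*)(?:\[(\d+)\])?", path):
        node = node[key]  # a missing key raises KeyError: the path does not match this payload
        if index != "":
            node = node[int(index)]
    return node

def expand_env(value):
    """Expand ${VAR} references from the environment, e.g. the x-goog-api-key header value."""
    return re.sub(r"\$\{(\w+)\}", lambda m: os.environ.get(m.group(1), ""), value)

if __name__ == "__main__":
    sample = {"candidates": [{"content": {"parts": [{"text": "hello from gemini"}]}}]}
    print(resolve_response_path(sample, ".candidates[0].content.parts[0].text"))  # hello from gemini
    print("key set:", "yes" if expand_env("${GEMINI_API_KEY}") else "no")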