diff --git a/.gitignore b/.gitignore
index 3566634..381b60d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,6 @@ venv/
 .claude/settings.local.json
 .claude/projects/
 .claude/worktrees/
+
+# Per-run eval execution artifacts (the parent outputs/ stays tracked for canonical examples)
+outputs/eval-runs/
diff --git a/bin/run-eval.py b/bin/run-eval.py
index 1284061..82f99fd 100755
--- a/bin/run-eval.py
+++ b/bin/run-eval.py
@@ -33,6 +33,7 @@
 import shutil
 import subprocess
 import sys
+import time
 from datetime import datetime, timezone
 from pathlib import Path
@@ -169,7 +170,15 @@ def invoke_target(target: dict, prompt: str, dry_run: bool) -> tuple[str, dict]:
         raw = r.read().decode()
         status = r.status
     except urllib.error.HTTPError as e:
-        return "", {"type": "http", "status": e.code, "error": str(e)}
+        err_body = ""
+        try:
+            err_body = e.read().decode()[:300]
+        except Exception:
+            pass
+        if e.code == 429:
+            warn(f"HTTP 429 from target — rate limit or quota exhausted. Body: {err_body[:120]}")
+            warn("If all cases 429, your API key likely lacks access to this model OR the free-tier quota is exhausted. Try a smaller/cheaper model or set target.delay_between_cases_sec.")
+        return "", {"type": "http", "status": e.code, "error": str(e), "body_excerpt": err_body}
     except urllib.error.URLError as e:
         fatal(f"http target unreachable: {e}")
     path = target.get("response_path", "$")
@@ -373,10 +382,14 @@ def main() -> int:
     tokens_used = 0
     run_results = []
 
-    for tc in cases:
+    delay_sec = float(target.get("delay_between_cases_sec", 0))
+    for i, tc in enumerate(cases):
         if args.max_tokens and tokens_used >= args.max_tokens:
             warn(f"token cap {args.max_tokens:,} reached — partial run, {len(run_results)} cases done")
             break
+        if i > 0 and delay_sec > 0 and not args.dry_run:
+            info(f" sleeping {delay_sec}s (rate-limit respect)")
+            time.sleep(delay_sec)
         info(f" case {tc['id']} ({tc.get('severity', '?')})")
         output, evidence = invoke_target(target, tc["input"], args.dry_run)
 
diff --git a/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml b/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml
new file mode 100644
index 0000000..9d6645a
--- /dev/null
+++ b/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml
@@ -0,0 +1,157 @@
+# ─────────────────────────────────────────────────────────────────────────────
+# pmstack eval — Gemini deep-research + UX design capabilities
+# Real-world test of /run-eval against an external HTTP target.
+#
+# HOW TO USE
+#   1. export GEMINI_API_KEY="your_key_here"   # in your terminal, before launching claude
+#   2. /run-eval outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml --judge-model claude-sonnet-4-6
+#   3. Read outputs/eval-runs/<run-id>/summary.md
+#
+# To compare against Claude: edit target.type to claude-session and re-run.
+# Diff the two summary.md files manually (a comparison-runner is a v0.5 item).
+#
+# COST ESTIMATE
+#   Gemini calls: 6 cases × ~8K tokens ≈ 48K tokens through your Gemini subscription
+#   Judge calls : 6 × ~4K tokens ≈ 24K tokens against Claude session
+# ─────────────────────────────────────────────────────────────────────────────
+
+name: "Gemini 2.0 Flash — deep research + UX design"
+description: >
+  Six cases probing Gemini's deep-research synthesis and its ability to
+  produce / critique UX. Judged by Claude Sonnet (different family, no
+  self-grading bias).
+
+  Why flash and not pro: free-tier quota for gemini-2.5-pro is too tight
+  for a 6-case run (every call 429s on first attempt). Flash has much
+  more permissive free-tier limits and is the closer apples-to-apples
+  comparison against Claude Haiku/Sonnet anyway.
+
+target:
+  type: http
+  url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
+  method: POST
+  request_template: '{"contents":[{"parts":[{"text":"{{input}}"}]}],"generationConfig":{"maxOutputTokens":2048,"temperature":0.3}}'
+  response_path: ".candidates[0].content.parts[0].text"
+  headers:
+    x-goog-api-key: "${GEMINI_API_KEY}"
+  requires:
+    - GEMINI_API_KEY
+  # Free tier limits are per-minute; 6s spacing keeps us under 10 RPM comfortably.
+  delay_between_cases_sec: 6
+
+capabilities:
+  - "Deep research: multi-source synthesis with explicit uncertainty"
+  - "Deep research: structured comparison across multiple data points"
+  - "Deep research: factual hedging when data is uncertain or post-cutoff"
+  - "UX design: produce usable HTML/CSS for a constrained spec"
+  - "UX design: critique an existing flow with concrete fix proposals"
+  - "UX design: design experiments (A/B test) with explicit assumptions"
+
+failure_modes:
+  - id: fm-1
+    name: "Hallucinated facts in research synthesis"
+    severity: P0
+  - id: fm-2
+    name: "Refuses or stalls on time-sensitive questions instead of dating its claims"
+    severity: P1
+  - id: fm-3
+    name: "Generic 'best practices' answer instead of design with specifics"
+    severity: P1
+  - id: fm-4
+    name: "Critique that lists everything as a problem (no prioritization)"
+    severity: P1
+
+metrics:
+  - name: "Substance"
+    description: "Does the response actually answer the question with concrete content (not generic advice)?"
+    type: score
+    pass_bar: ">= 4"
+  - name: "Hedging"
+    description: "When uncertain, does the model say so explicitly (vs. confidently asserting wrong facts)?"
+    type: score
+    pass_bar: ">= 4"
+  - name: "Structure"
+    description: "Is the response easy to scan — headers, lists, comparison tables when useful?"
+    type: score
+    pass_bar: ">= 4"
+  - name: "Length"
+    description: "Output token count (rough char-based estimate). Should be substantive but not bloated."
+    type: tokens
+    pass_bar: "<= 2000"
+
+test_cases:
+  # ── Deep research ───────────────────────────────────────────────────────────
+  - id: dr-1-multisource-synthesis
+    severity: P0
+    description: "Multi-source synthesis on a contested topic. Expects structured comparison, not a single source's view."
+    input: "Compare the leading approaches to LLM agent memory in 2024-2025: in-context, vector RAG, fine-tuned, and hybrid. For each, give one strength and one trade-off, and name a representative system that uses it. Be specific. If you're not sure about a system's actual approach, say so."
+    expected_behavior: "Four approaches, each with named real system, strength, trade-off. Hedges where uncertain about post-training-cutoff systems."
+    metrics: ["Substance", "Hedging", "Structure", "Length"]
+
+  - id: dr-2-counterfactual
+    severity: P1
+    description: "Counterfactual reasoning under uncertainty. Tests whether the model reasons explicitly or fabricates a confident timeline."
+    input: "If GPT-4 had launched in mid-2022 instead of March 2023, what's a plausible competitive response from Anthropic in the following 12 months? Reason step-by-step. Mark assumptions as assumptions."
+    expected_behavior: "Reasoning steps, named assumptions, plausible scenarios. NOT a confident retelling as if it happened."
+    metrics: ["Substance", "Hedging", "Structure"]
+
+  - id: dr-3-citations-honesty
+    severity: P0
+    description: "Citation honesty. Models often hallucinate paper titles + URLs."
+    input: "List 5 academic papers on retrieval failures or hallucinations in RAG systems published 2022-2024. For each: title, lead author, year, arXiv URL. If you cannot remember a real paper exactly, say so explicitly — DO NOT invent."
+    expected_behavior: "Either real papers with verifiable details, OR an explicit acknowledgement that the model cannot guarantee paper accuracy. Fabricated arXiv IDs = P0 fail."
+    metrics: ["Substance", "Hedging"]
+
+  # ── UX design ───────────────────────────────────────────────────────────────
+  - id: ux-1-html-form
+    severity: P1
+    description: "Concrete HTML/CSS for a constrained spec. Tests whether the model can produce usable code, not generic templates."
+    input: "Write a mobile-first sign-up form for a B2B SaaS. Fields: work email, company name, team size (1-10/11-50/51+/dropdown). Include client-side validation hints (HTML5 attributes, no JS), accessible labels, and a single primary button. Output only the HTML+CSS, in one code block."
+    expected_behavior: "Working HTML+CSS, mobile-first (viewport meta, max-width container), accessible labels, validation attrs. Not just a template."
+    metrics: ["Substance", "Structure", "Length"]
+
+  - id: ux-2-flow-critique
+    severity: P1
+    description: "Critique with prioritization. Tests whether the model picks 3 real issues vs lists 12 generic ones."
+    input: |
+      Here's a SaaS onboarding flow:
+      1. Email signup → email verification (24h delay sometimes)
+      2. After verification, a 12-question profile setup before user can see the product
+      3. Product loads with empty state and a "watch our 4-min intro video" modal
+      4. After dismissing video, user is on the dashboard with no data and a "create your first project" CTA in the bottom-right
+      Identify the 3 biggest UX gaps in this flow, in priority order, with concrete fixes for each. Do not list more than 3.
+    expected_behavior: "Exactly 3 gaps, prioritized by impact, each with a concrete fix. Should flag email verification delay (high friction), 12-question survey before value (gate), CTA placement (low affordance)."
+    metrics: ["Substance", "Structure"]
+
+  - id: ux-3-experiment-design
+    severity: P1
+    description: "Designs an A/B test with explicit assumptions and success criteria."
+    input: "Design an A/B test to validate whether adding a 30-second product tour at first login improves week-1 retention. Include: variants, hypothesis, primary metric, sample size estimate (state the assumptions), success criteria, and what could go wrong (false positives, novelty effects, etc.)."
+    expected_behavior: "Two variants named, explicit hypothesis, primary metric defined, sample-size reasoning with named baseline + MDE, success criteria, at least 2 named risks."
+    metrics: ["Substance", "Hedging", "Structure"]
+
+# ─────────────────────────────────────────────────────────────────────────────
+# APPENDIX — PRO TIPS (specific to this eval)
+# ─────────────────────────────────────────────────────────────────────────────
+#
+# Setup
+#   ✓ export GEMINI_API_KEY="..." in your terminal BEFORE launching claude
+#   ✓ Verify with: echo "key set: ${GEMINI_API_KEY:+yes}"   (should print "key set: yes")
+#   ✓ Never paste the key into the YAML or chat
+#
+# Running
+#   ✓ Always pass --judge-model claude-sonnet-4-6 — without it, all subjective
+#     metrics will be marked "needs-judge" and you won't get scores.
+#   ✓ Try one case first: --only dr-1-multisource-synthesis
+#   ✓ Cost is small: ~$0.20 Gemini + ~$0.20 Claude judge for the full 6-case run.
+#
+# Comparing against Claude
+#   ✓ Duplicate this YAML with target.type: claude-session, model: claude-sonnet-4-6
+#   ✓ Run both. Diff the summary.md files.
+#   ✓ Watch for "Hedging" — that's where models diverge most on dr-2 and dr-3.
+#
+# What you'll learn
+#   ✓ Whether Gemini hallucinates citations on dr-3 (most models do).
+#   ✓ Whether Gemini's UX critique prioritizes (ux-2) or lists everything.
+#   ✓ Whether the HTML on ux-1 is mobile-first and accessible by default.
+# ─────────────────────────────────────────────────────────────────────────────