From 5fd9aa7de4b3c1e1339481e0bc3c327f2be1fb81 Mon Sep 17 00:00:00 2001
From: Ryan Alberts
Date: Fri, 24 Apr 2026 23:58:56 -0500
Subject: [PATCH 1/3] =?UTF-8?q?v0.4.2:=20real=20Gemini=20eval=20=E2=80=94?=
 =?UTF-8?q?=20deep=20research=20+=20UX=20design?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Working example of /run-eval against an external HTTP target. Designed to be
run by the user against their own Gemini subscription, as a real test of the
http target type and as the first cross-vendor eval in pmstack.

Six test cases:
- 3 deep research — multi-source synthesis, counterfactual reasoning,
  citation honesty (the classic hallucination trap)
- 3 UX design — concrete HTML form, prioritized flow critique, A/B test
  design with assumptions

Target: gemini-2.5-pro via generativelanguage.googleapis.com.
Auth: x-goog-api-key header sourced from the GEMINI_API_KEY env var.
Judge: claude-sonnet-4-6 (different family, no self-grading bias).

Verified the runner hard-stops cleanly when GEMINI_API_KEY is unset
("FATAL: requires env var GEMINI_API_KEY"). When set, the eval loads 6 cases
at a ~48K total token estimate (well under the 200K warn threshold) — an
affordable smoke test against a real external API.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 ...mini-deepresearch-uxdesign-2026-04-25.yaml | 150 ++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml

diff --git a/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml b/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml
new file mode 100644
index 0000000..ea06127
--- /dev/null
+++ b/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml
@@ -0,0 +1,150 @@
+# ─────────────────────────────────────────────────────────────────────────────
+# pmstack eval — Gemini deep-research + UX design capabilities
+# Real-world test of /run-eval against an external HTTP target.
+#
+# HOW TO USE
+#   1. export GEMINI_API_KEY="your_key_here"   # in your terminal, before launching claude
+#   2. /run-eval outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml --judge-model claude-sonnet-4-6
+#   3. Read outputs/eval-runs/<run-id>/summary.md
+#
+# To compare against Claude: edit target.type to claude-session and re-run.
+# Diff the two summary.md files manually (a comparison-runner is a v0.5 item).
+#
+# COST ESTIMATE
+#   Gemini calls: 6 cases × ~8K tokens ≈ 48K tokens through your Gemini subscription
+#   Judge calls : 6 × ~4K tokens ≈ 24K tokens against Claude session
+# ─────────────────────────────────────────────────────────────────────────────
+
+name: "Gemini 2.5 Pro — deep research + UX design"
+description: >
+  Six cases probing Gemini's deep-research synthesis and its ability to
+  produce / critique UX. Judged by Claude Sonnet (different family, no
+  self-grading bias).
+ +target: + type: http + url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" + method: POST + request_template: '{"contents":[{"parts":[{"text":"{{input}}"}]}],"generationConfig":{"maxOutputTokens":2048,"temperature":0.3}}' + response_path: ".candidates[0].content.parts[0].text" + headers: + x-goog-api-key: "${GEMINI_API_KEY}" + requires: + - GEMINI_API_KEY + +capabilities: + - "Deep research: multi-source synthesis with explicit uncertainty" + - "Deep research: structured comparison across multiple data points" + - "Deep research: factual hedging when data is uncertain or post-cutoff" + - "UX design: produce usable HTML/CSS for a constrained spec" + - "UX design: critique an existing flow with concrete fix proposals" + - "UX design: design experiments (A/B test) with explicit assumptions" + +failure_modes: + - id: fm-1 + name: "Hallucinated facts in research synthesis" + severity: P0 + - id: fm-2 + name: "Refuses or stalls on time-sensitive questions instead of dating its claims" + severity: P1 + - id: fm-3 + name: "Generic 'best practices' answer instead of design with specifics" + severity: P1 + - id: fm-4 + name: "Critique that lists everything as a problem (no prioritization)" + severity: P1 + +metrics: + - name: "Substance" + description: "Does the response actually answer the question with concrete content (not generic advice)?" + type: score + pass_bar: ">= 4" + - name: "Hedging" + description: "When uncertain, does the model say so explicitly (vs. confidently asserting wrong facts)?" + type: score + pass_bar: ">= 4" + - name: "Structure" + description: "Is the response easy to scan — headers, lists, comparison tables when useful?" + type: score + pass_bar: ">= 4" + - name: "Length" + description: "Output token count (rough char-based estimate). Should be substantive but not bloated." + type: tokens + pass_bar: "<= 2000" + +test_cases: + # ── Deep research ─────────────────────────────────────────────────────────── + - id: dr-1-multisource-synthesis + severity: P0 + description: "Multi-source synthesis on a contested topic. Expects structured comparison, not a single source's view." + input: "Compare the leading approaches to LLM agent memory in 2024-2025: in-context, vector RAG, fine-tuned, and hybrid. For each, give one strength and one trade-off, and name a representative system that uses it. Be specific. If you're not sure about a system's actual approach, say so." + expected_behavior: "Four approaches, each with named real system, strength, trade-off. Hedges where uncertain about post-training-cutoff systems." + metrics: ["Substance", "Hedging", "Structure", "Length"] + + - id: dr-2-counterfactual + severity: P1 + description: "Counterfactual reasoning under uncertainty. Tests whether the model reasons explicitly or fabricates a confident timeline." + input: "If GPT-4 had launched in mid-2022 instead of March 2023, what's a plausible competitive response from Anthropic in the following 12 months? Reason step-by-step. Mark assumptions as assumptions." + expected_behavior: "Reasoning steps, named assumptions, plausible scenarios. NOT a confident retelling as if it happened." + metrics: ["Substance", "Hedging", "Structure"] + + - id: dr-3-citations-honesty + severity: P0 + description: "Citation honesty. Models often hallucinate paper titles + URLs." + input: "List 5 academic papers on retrieval failures or hallucinations in RAG systems published 2022-2024. For each: title, lead author, year, arXiv URL. 
If you cannot remember a real paper exactly, say so explicitly — DO NOT invent." + expected_behavior: "Either real papers with verifiable details, OR an explicit acknowledgement that the model cannot guarantee paper accuracy. Fabricated arXiv IDs = P0 fail." + metrics: ["Substance", "Hedging"] + + # ── UX design ─────────────────────────────────────────────────────────────── + - id: ux-1-html-form + severity: P1 + description: "Concrete HTML/CSS for a constrained spec. Tests whether the model can produce usable code, not generic templates." + input: "Write a mobile-first sign-up form for a B2B SaaS. Fields: work email, company name, team size (1-10/11-50/51+/dropdown). Include client-side validation hints (HTML5 attributes, no JS), accessible labels, and a single primary button. Output only the HTML+CSS, in one code block." + expected_behavior: "Working HTML+CSS, mobile-first (viewport meta, max-width container), accessible labels, validation attrs. Not just a template." + metrics: ["Substance", "Structure", "Length"] + + - id: ux-2-flow-critique + severity: P1 + description: "Critique with prioritization. Tests whether the model picks 3 real issues vs lists 12 generic ones." + input: | + Here's a SaaS onboarding flow: + 1. Email signup → email verification (24h delay sometimes) + 2. After verification, a 12-question profile setup before user can see the product + 3. Product loads with empty state and a "watch our 4-min intro video" modal + 4. After dismissing video, user is on the dashboard with no data and a "create your first project" CTA in the bottom-right + Identify the 3 biggest UX gaps in this flow, in priority order, with concrete fixes for each. Do not list more than 3. + expected_behavior: "Exactly 3 gaps, prioritized by impact, each with a concrete fix. Should flag email verification delay (high friction), 12-question survey before value (gate), CTA placement (low affordance)." + metrics: ["Substance", "Structure"] + + - id: ux-3-experiment-design + severity: P1 + description: "Designs an A/B test with explicit assumptions and success criteria." + input: "Design an A/B test to validate whether adding a 30-second product tour at first login improves week-1 retention. Include: variants, hypothesis, primary metric, sample size estimate (state the assumptions), success criteria, and what could go wrong (false positives, novelty effects, etc.)." + expected_behavior: "Two variants named, explicit hypothesis, primary metric defined, sample-size reasoning with named baseline + MDE, success criteria, at least 2 named risks." + metrics: ["Substance", "Hedging", "Structure"] + +# ───────────────────────────────────────────────────────────────────────────── +# APPENDIX — PRO TIPS (specific to this eval) +# ───────────────────────────────────────────────────────────────────────────── +# +# Setup +# ✓ export GEMINI_API_KEY="..." in your terminal BEFORE launching claude +# ✓ Verify with: echo "key set: ${GEMINI_API_KEY:+yes}" (should print "key set: yes") +# ✓ Never paste the key into the YAML or chat +# +# Running +# ✓ Always pass --judge-model claude-sonnet-4-6 — without it, all subjective +# metrics will be marked "needs-judge" and you won't get scores. +# ✓ Try one case first: --only dr-1-multisource-synthesis +# ✓ Cost is small: ~$0.20 Gemini + ~$0.20 Claude judge for the full 6-case run. +# +# Comparing against Claude +# ✓ Duplicate this YAML with target.type: claude-session, model: claude-sonnet-4-6 +# ✓ Run both. Diff the summary.md files. 
+# ✓ Watch for "Hedging" — that's where models diverge most on dr-2 and dr-3. +# +# What you'll learn +# ✓ Whether Gemini hallucinates citations on dr-3 (most models do). +# ✓ Whether Gemini's UX critique prioritizes (ux-2) or lists everything. +# ✓ Whether the HTML on ux-1 is mobile-first and accessible by default. +# ───────────────────────────────────────────────────────────────────────────── From 2f5739f1f75a00b38584a097b49483628b527818 Mon Sep 17 00:00:00 2001 From: Ryan Alberts Date: Sat, 25 Apr 2026 00:00:02 -0500 Subject: [PATCH 2/3] v0.4.2: gitignore per-run eval artifacts outputs/eval-runs/ is per-user run output, not canonical example material. The parent outputs/ stays tracked so the example artifacts (Ultraplan eval, Gemini eval YAML, roadmap, verification report) remain in the repo. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 3566634..381b60d 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,6 @@ venv/ .claude/settings.local.json .claude/projects/ .claude/worktrees/ + +# Per-run eval execution artifacts (the parent outputs/ stays tracked for canonical examples) +outputs/eval-runs/ From 00ff6feda07f7a6087cc2aeb4d052b292df9982a Mon Sep 17 00:00:00 2001 From: Ryan Alberts Date: Sat, 25 Apr 2026 00:27:09 -0500 Subject: [PATCH 3/3] v0.4.2: runner improvements from Gemini eval debug Real-world test against Gemini 2.5 Pro hit immediate 429s on every call (free-tier quota exhausted). The runner correctly captured the errors and refused to fake scores. Two improvements that came out of debugging: bin/run-eval.py - target.delay_between_cases_sec: optional sleep between cases for rate-limit respect on http targets - 429 errors now emit a clear WARN telling the user to check their quota / try a smaller model / set a delay - Captures the full HTTP error body (300 chars) into evidence so diagnosis doesn't require re-reading raw network logs eval YAML - Switched from gemini-2.5-pro to gemini-2.0-flash (more permissive free-tier limits) - Added delay_between_cases_sec: 6 - Documented why in description Note: the actual Gemini comparison didn't produce real scores in this run (quota exhausted on user's API key). The eval will need to be re-run after the user's quota resets or with a key that has access. Co-Authored-By: Claude Opus 4.7 (1M context) --- bin/run-eval.py | 17 +++++++++++++++-- ...gemini-deepresearch-uxdesign-2026-04-25.yaml | 11 +++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/bin/run-eval.py b/bin/run-eval.py index 1284061..82f99fd 100755 --- a/bin/run-eval.py +++ b/bin/run-eval.py @@ -33,6 +33,7 @@ import shutil import subprocess import sys +import time from datetime import datetime, timezone from pathlib import Path @@ -169,7 +170,15 @@ def invoke_target(target: dict, prompt: str, dry_run: bool) -> tuple[str, dict]: raw = r.read().decode() status = r.status except urllib.error.HTTPError as e: - return "", {"type": "http", "status": e.code, "error": str(e)} + err_body = "" + try: + err_body = e.read().decode()[:300] + except Exception: + pass + if e.code == 429: + warn(f"HTTP 429 from target — rate limit or quota exhausted. Body: {err_body[:120]}") + warn("If all cases 429, your API key likely lacks access to this model OR the free-tier quota is exhausted. 
Try a smaller/cheaper model or set target.delay_between_cases_sec.") + return "", {"type": "http", "status": e.code, "error": str(e), "body_excerpt": err_body} except urllib.error.URLError as e: fatal(f"http target unreachable: {e}") path = target.get("response_path", "$") @@ -373,10 +382,14 @@ def main() -> int: tokens_used = 0 run_results = [] - for tc in cases: + delay_sec = float(target.get("delay_between_cases_sec", 0)) + for i, tc in enumerate(cases): if args.max_tokens and tokens_used >= args.max_tokens: warn(f"token cap {args.max_tokens:,} reached — partial run, {len(run_results)} cases done") break + if i > 0 and delay_sec > 0 and not args.dry_run: + info(f" sleeping {delay_sec}s (rate-limit respect)") + time.sleep(delay_sec) info(f" case {tc['id']} ({tc.get('severity', '?')})") output, evidence = invoke_target(target, tc["input"], args.dry_run) diff --git a/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml b/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml index ea06127..9d6645a 100644 --- a/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml +++ b/outputs/eval-gemini-deepresearch-uxdesign-2026-04-25.yaml @@ -15,15 +15,20 @@ # Judge calls : 6 × ~4K tokens ≈ 24K tokens against Claude session # ───────────────────────────────────────────────────────────────────────────── -name: "Gemini 2.5 Pro — deep research + UX design" +name: "Gemini 2.0 Flash — deep research + UX design" description: > Six cases probing Gemini's deep-research synthesis and its ability to produce / critique UX. Judged by Claude Sonnet (different family, no self-grading bias). + Why flash and not pro: free-tier quota for gemini-2.5-pro is too tight + for a 6-case run (every call 429s on first attempt). Flash has much + more permissive free-tier limits and is the closer apples-to-apples + comparison against Claude Haiku/Sonnet anyway. + target: type: http - url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" + url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" method: POST request_template: '{"contents":[{"parts":[{"text":"{{input}}"}]}],"generationConfig":{"maxOutputTokens":2048,"temperature":0.3}}' response_path: ".candidates[0].content.parts[0].text" @@ -31,6 +36,8 @@ target: x-goog-api-key: "${GEMINI_API_KEY}" requires: - GEMINI_API_KEY + # Free tier limits are per-minute; 6s spacing keeps us under 10 RPM comfortably. + delay_between_cases_sec: 6 capabilities: - "Deep research: multi-source synthesis with explicit uncertainty"
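
Note on how the http target config above is consumed: the runner hunks read
target.get("response_path", "$") and send the x-goog-api-key header, but the
helpers that walk a jq-style path and expand ${GEMINI_API_KEY} live outside
these patches. The sketch below is a minimal, assumed implementation (the
function names, the regex parser, and the fallback behavior are illustrative
guesses, not the actual bin/run-eval.py code); it is only meant for
spot-checking a path such as .candidates[0].content.parts[0].text against a
captured Gemini response before committing to a full run.

import os
import re

def resolve_response_path(payload, path):
    """Resolve a jq-style path like '.candidates[0].content.parts[0].text'.

    Assumed behavior: '$', '.' or an empty path returns the whole payload;
    only dotted keys with an optional single integer index are supported.
    """
    if path in ("$", ".", ""):
        return payload
    node = payload
    for key, index in re.findall(r"\.([A-Za-z_][\w-]*)(?:\[(\d+)\])?", path):
        node = node[key]  # a missing key raises KeyError: the path does not match this payload
        if index != "":
            node = node[int(index)]
    return node

def expand_env(value):
    """Expand ${VAR} references from the environment, e.g. the x-goog-api-key header value."""
    return re.sub(r"\$\{(\w+)\}", lambda m: os.environ.get(m.group(1), ""), value)

if __name__ == "__main__":
    sample = {"candidates": [{"content": {"parts": [{"text": "hello from gemini"}]}}]}
    print(resolve_response_path(sample, ".candidates[0].content.parts[0].text"))  # hello from gemini
    print("key set:", "yes" if expand_env("${GEMINI_API_KEY}") else "no")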