From 5244c2a9bfdad587348616902385038893444750 Mon Sep 17 00:00:00 2001 From: Berk Durmus Date: Sun, 25 Jan 2026 05:07:44 +0300 Subject: [PATCH 1/2] feat(evals): add AI Eval Playground for comparing models and prompts --- components/ai-eval/ApiKeyDialog.tsx | 231 +++++++++++ components/ai-eval/EvalComparisonGrid.tsx | 154 ++++++++ components/ai-eval/EvalConfigPanel.tsx | 265 +++++++++++++ components/ai-eval/EvalJudgePanel.tsx | 186 +++++++++ components/ai-eval/EvalModelSelector.tsx | 204 ++++++++++ components/ai-eval/EvalResultCell.tsx | 149 +++++++ components/ai-eval/EvalScoreDisplay.tsx | 151 +++++++ components/hooks/useApiKeys.ts | 99 +++++ components/utils/ai-eval-judge.ts | 362 +++++++++++++++++ components/utils/ai-eval-providers.ts | 309 +++++++++++++++ components/utils/ai-eval-schemas.test.ts | 354 +++++++++++++++++ components/utils/ai-eval-schemas.ts | 354 +++++++++++++++++ components/utils/tools-list.ts | 6 + package-lock.json | 12 +- package.json | 3 +- pages/utilities/ai-eval.tsx | 460 ++++++++++++++++++++++ 16 files changed, 3297 insertions(+), 2 deletions(-) create mode 100644 components/ai-eval/ApiKeyDialog.tsx create mode 100644 components/ai-eval/EvalComparisonGrid.tsx create mode 100644 components/ai-eval/EvalConfigPanel.tsx create mode 100644 components/ai-eval/EvalJudgePanel.tsx create mode 100644 components/ai-eval/EvalModelSelector.tsx create mode 100644 components/ai-eval/EvalResultCell.tsx create mode 100644 components/ai-eval/EvalScoreDisplay.tsx create mode 100644 components/hooks/useApiKeys.ts create mode 100644 components/utils/ai-eval-judge.ts create mode 100644 components/utils/ai-eval-providers.ts create mode 100644 components/utils/ai-eval-schemas.test.ts create mode 100644 components/utils/ai-eval-schemas.ts create mode 100644 pages/utilities/ai-eval.tsx diff --git a/components/ai-eval/ApiKeyDialog.tsx b/components/ai-eval/ApiKeyDialog.tsx new file mode 100644 index 0000000..d68c2ed --- /dev/null +++ b/components/ai-eval/ApiKeyDialog.tsx @@ -0,0 +1,231 @@ +import { useState, useCallback } from "react"; +import { + Dialog, + DialogContent, + DialogHeader, + DialogTitle, + DialogDescription, + DialogTrigger, +} from "@/components/ds/DialogComponent"; +import { Button } from "@/components/ds/ButtonComponent"; +import { Input } from "@/components/ds/InputComponent"; +import { Label } from "@/components/ds/LabelComponent"; +import { PROVIDERS, ProviderId } from "@/components/utils/ai-eval-schemas"; +import { UseApiKeysReturn } from "@/components/hooks/useApiKeys"; +import { Key, Check, X, Loader2, Eye, EyeOff } from "lucide-react"; + +interface ApiKeyDialogProps { + apiKeys: UseApiKeysReturn; + children?: React.ReactNode; +} + +interface ProviderKeyInputProps { + providerId: ProviderId; + providerName: string; + apiKeys: UseApiKeysReturn; +} + +function ProviderKeyInput({ + providerId, + providerName, + apiKeys, +}: ProviderKeyInputProps) { + const [value, setValue] = useState(apiKeys.getKey(providerId) || ""); + const [showKey, setShowKey] = useState(false); + const [testing, setTesting] = useState(false); + const [testResult, setTestResult] = useState(null); + + const hasKey = apiKeys.hasKey(providerId); + + const handleSave = useCallback(() => { + if (value.trim()) { + apiKeys.setKey(providerId, value.trim()); + setTestResult(null); + } + }, [apiKeys, providerId, value]); + + const handleRemove = useCallback(() => { + apiKeys.removeKey(providerId); + setValue(""); + setTestResult(null); + }, [apiKeys, providerId]); + + const handleTest = useCallback(async () => { + if (!value.trim()) return; + + // First save the key + apiKeys.setKey(providerId, value.trim()); + + setTesting(true); + setTestResult(null); + + try { + const result = await apiKeys.testKey(providerId); + setTestResult(result); + } catch { + setTestResult(false); + } finally { + setTesting(false); + } + }, [apiKeys, providerId, value]); + + return ( +
+
+ +
+ {hasKey && testResult === null && ( + + + Configured + + )} + {testResult === true && ( + + + Valid + + )} + {testResult === false && ( + + + Invalid + + )} +
+
+ +
+
+ { + setValue(e.target.value); + setTestResult(null); + }} + onBlur={handleSave} + className="pr-10 font-mono text-xs" + /> + +
+ + + + {hasKey && ( + + )} +
+
+ ); +} + +export function ApiKeyDialog({ apiKeys, children }: ApiKeyDialogProps) { + const configuredCount = PROVIDERS.filter((p) => + apiKeys.hasKey(p.id) + ).length; + + return ( + + + {children || ( + + )} + + + + + + + API Keys + + + Your API keys are stored in session storage and cleared when you + close the browser. Keys never leave your browser. + + + +
+ {PROVIDERS.map((provider) => ( + + ))} +
+ +
+

+ Need API keys?{" "} + + OpenAI + {" "} + ·{" "} + + Anthropic + {" "} + ·{" "} + + Google AI + +

+
+
+
+ ); +} diff --git a/components/ai-eval/EvalComparisonGrid.tsx b/components/ai-eval/EvalComparisonGrid.tsx new file mode 100644 index 0000000..ca4ef7b --- /dev/null +++ b/components/ai-eval/EvalComparisonGrid.tsx @@ -0,0 +1,154 @@ +import { + ModelConfig, + JudgeEvaluation, + ComparisonMode, +} from "@/components/utils/ai-eval-schemas"; +import { EvalResultCell } from "./EvalResultCell"; + +interface ResultData { + id: string; + modelId: string; + promptId: string; + output: string | null; + evaluation: JudgeEvaluation | null; + isLoading: boolean; + error: string | null; + latencyMs?: number; +} + +interface EvalComparisonGridProps { + mode: ComparisonMode; + models: ModelConfig[]; + results: ResultData[]; + winnerIds: string[]; +} + +export function EvalComparisonGrid({ + mode, + models, + results, + winnerIds, +}: EvalComparisonGridProps) { + // Determine grid columns based on number of items + const gridCols = + models.length === 2 + ? "grid-cols-2" + : models.length === 3 + ? "grid-cols-3" + : "grid-cols-2 lg:grid-cols-4"; + + if (mode === "model-vs-model") { + // In model-vs-model mode, we show one column per model + return ( +
+ {models.map((model, index) => { + const result = results.find((r) => r.modelId === model.id); + const label = `Model ${String.fromCharCode(65 + index)}`; + + return ( + + ); + })} +
+ ); + } + + // In prompt-vs-prompt mode, we show one column per prompt variant + // Group results by promptId + const promptIds = Array.from(new Set(results.map((r) => r.promptId))); + const model = models[0]; // Single model in prompt-vs-prompt mode + + return ( +
+ {promptIds.map((promptId, index) => { + const result = results.find((r) => r.promptId === promptId); + const label = `Prompt ${String.fromCharCode(65 + index)}`; + + return ( + + ); + })} +
+ ); +} + +// Summary component for showing overall comparison results +interface ComparisonSummaryProps { + results: ResultData[]; + winnerIds: string[]; + comparisonReasoning?: string; +} + +export function ComparisonSummary({ + results, + winnerIds, + comparisonReasoning, +}: ComparisonSummaryProps) { + const completedResults = results.filter( + (r) => r.output && r.evaluation && !r.isLoading + ); + + if (completedResults.length < 2) { + return null; + } + + // Sort by overall score + const sorted = [...completedResults].sort( + (a, b) => + (b.evaluation?.overallScore ?? 0) - (a.evaluation?.overallScore ?? 0) + ); + + return ( +
+

Comparison Summary

+ +
+ {sorted.map((result, index) => { + const isWinner = winnerIds.includes(result.id); + return ( +
+ #{index + 1} + {result.modelId} + + {result.evaluation?.overallScore.toFixed(1)} + +
+ ); + })} +
+ + {comparisonReasoning && ( +

+ {comparisonReasoning} +

+ )} +
+ ); +} diff --git a/components/ai-eval/EvalConfigPanel.tsx b/components/ai-eval/EvalConfigPanel.tsx new file mode 100644 index 0000000..d36ee88 --- /dev/null +++ b/components/ai-eval/EvalConfigPanel.tsx @@ -0,0 +1,265 @@ +import { useState, useEffect, useCallback, useMemo } from "react"; +import { Card } from "@/components/ds/CardComponent"; +import { Label } from "@/components/ds/LabelComponent"; +import { Textarea } from "@/components/ds/TextareaComponent"; +import { Input } from "@/components/ds/InputComponent"; +import { + extractVariables, + DEFAULT_SYSTEM_PROMPT, + DEFAULT_USER_PROMPT, + ComparisonMode, +} from "@/components/utils/ai-eval-schemas"; +import { Plus, Trash2 } from "lucide-react"; +import { Button } from "@/components/ds/ButtonComponent"; + +interface PromptVariant { + id: string; + systemPrompt: string; + userPrompt: string; +} + +interface EvalConfigPanelProps { + mode: ComparisonMode; + onPromptsChange: (prompts: PromptVariant[]) => void; + onVariablesChange: (variables: Record) => void; +} + +export function EvalConfigPanel({ + mode, + onPromptsChange, + onVariablesChange, +}: EvalConfigPanelProps) { + // For model-vs-model mode, we have a single prompt + // For prompt-vs-prompt mode, we have multiple prompt variants + const [prompts, setPrompts] = useState([ + { + id: "prompt-1", + systemPrompt: DEFAULT_SYSTEM_PROMPT, + userPrompt: DEFAULT_USER_PROMPT, + }, + ]); + + const [variables, setVariables] = useState>({ + question: "", + }); + + // Extract all variables from all prompts + const allVariables = useMemo(() => { + const vars = new Set(); + prompts.forEach((p) => { + extractVariables(p.systemPrompt).forEach((v) => vars.add(v)); + extractVariables(p.userPrompt).forEach((v) => vars.add(v)); + }); + return Array.from(vars); + }, [prompts]); + + // Update variables state when new variables are detected + useEffect(() => { + setVariables((prev) => { + const next: Record = {}; + allVariables.forEach((v) => { + next[v] = prev[v] ?? ""; + }); + return next; + }); + }, [allVariables]); + + // Notify parent of changes + useEffect(() => { + onPromptsChange(prompts); + }, [prompts, onPromptsChange]); + + useEffect(() => { + onVariablesChange(variables); + }, [variables, onVariablesChange]); + + const updatePrompt = useCallback( + (id: string, field: "systemPrompt" | "userPrompt", value: string) => { + setPrompts((prev) => + prev.map((p) => (p.id === id ? { ...p, [field]: value } : p)) + ); + }, + [] + ); + + const addPromptVariant = useCallback(() => { + const newId = `prompt-${Date.now()}`; + setPrompts((prev) => [ + ...prev, + { + id: newId, + systemPrompt: prev[0]?.systemPrompt || DEFAULT_SYSTEM_PROMPT, + userPrompt: prev[0]?.userPrompt || DEFAULT_USER_PROMPT, + }, + ]); + }, []); + + const removePromptVariant = useCallback((id: string) => { + setPrompts((prev) => { + if (prev.length <= 1) return prev; + return prev.filter((p) => p.id !== id); + }); + }, []); + + const updateVariable = useCallback((name: string, value: string) => { + setVariables((prev) => ({ ...prev, [name]: value })); + }, []); + + // In model-vs-model mode, show single prompt config + if (mode === "model-vs-model") { + const prompt = prompts[0]; + return ( + +
+
+ +