diff --git a/PR_DESCRIPTION.md b/PR_DESCRIPTION.md
new file mode 100644
index 0000000..7f0e881
--- /dev/null
+++ b/PR_DESCRIPTION.md
@@ -0,0 +1,64 @@
+## Summary
+
+- Add new AI Eval Playground utility for comparing AI model outputs and prompts
+- Implement BYOK (Bring Your Own Key) support for OpenAI, Anthropic, and Google AI
+- Build LLM-as-judge scoring system with configurable criteria weights
+- Create clean, table-based comparison UI following Linear.app design patterns
+
+## Features
+
+### Comparison Modes
+- **Model vs Model**: Compare 2-4 models with the same prompt
+- **Prompt vs Prompt**: Compare 2-4 prompt variations with the same model
+
+### Supported Providers
+| Provider | Models |
+|----------|--------|
+| OpenAI | GPT-4o, GPT-4o Mini, GPT-4 Turbo, GPT-3.5 Turbo |
+| Anthropic | Claude 3.5 Sonnet, Claude 3.5 Haiku, Claude 3 Opus |
+| Google AI | Gemini 2.0 Flash, Gemini 1.5 Pro, Gemini 1.5 Flash |
+
+### LLM-as-Judge Scoring
+- 5 evaluation criteria: Accuracy, Relevance, Clarity, Completeness, Conciseness
+- Adjustable weight sliders for custom scoring emphasis
+- Pairwise comparison with winner detection
+- Visual score badges and breakdown bars
+
+### Security
+- API keys stored in sessionStorage only (cleared on browser close)
+- All processing happens client-side
+- Keys never leave the browser
+
+## Files Added
+
+```
+components/
+├── ai-eval/
+│   ├── ApiKeyDialog.tsx
+│   ├── EvalJudgeConfig.tsx
+│   ├── EvalModelSelector.tsx
+│   └── EvalResultCard.tsx
+├── hooks/
+│   └── useApiKeys.ts
+└── utils/
+    ├── ai-eval-judge.ts
+    ├── ai-eval-providers.ts
+    ├── ai-eval-schemas.ts
+    └── ai-eval-schemas.test.ts
+
+pages/utilities/
+└── ai-eval.tsx
+```
+
+## Screenshots
+
+
+
+## Test Plan
+
+- [x] Unit tests for schema validation and utility functions (37 tests passing)
+- [x] Build passes with no TypeScript errors
+- [ ] Manual testing with real API keys for each provider
+- [ ] Verify API key dialog saves/clears correctly
+- [ ] Test both comparison modes with multiple models
+- [ ] Verify judge scoring produces valid results
diff --git a/components/ai-eval/ApiKeyDialog.tsx b/components/ai-eval/ApiKeyDialog.tsx
new file mode 100644
index 0000000..2bffe9a
--- /dev/null
+++ b/components/ai-eval/ApiKeyDialog.tsx
@@ -0,0 +1,196 @@
+import { useState, useCallback } from "react";
+import {
+  Dialog,
+  DialogContent,
+  DialogHeader,
+  DialogTitle,
+  DialogDescription,
+  DialogTrigger,
+} from "@/components/ds/DialogComponent";
+import { Button } from "@/components/ds/ButtonComponent";
+import { PROVIDERS, ProviderId } from "@/components/utils/ai-eval-schemas";
+import { UseApiKeysReturn } from "@/components/hooks/useApiKeys";
+import { Key, Check, X, Loader2, Eye, EyeOff, ExternalLink } from "lucide-react";
+
+interface ApiKeyDialogProps {
+  apiKeys: UseApiKeysReturn;
+  children?: React.ReactNode;
+}
+
+interface ProviderKeyRowProps {
+  providerId: ProviderId;
+  providerName: string;
+  apiKeys: UseApiKeysReturn;
+  keyUrl: string;
+}
+
+function ProviderKeyRow({ providerId, providerName, apiKeys, keyUrl }: ProviderKeyRowProps) {
+  const [value, setValue] = useState(apiKeys.getKey(providerId) || "");
+  const [showKey, setShowKey] = useState(false);
+  const [testing, setTesting] = useState(false);
+  const [testResult, setTestResult] = useState<boolean | null>(null);
+
+  const hasKey = apiKeys.hasKey(providerId);
+
+  const handleSave = useCallback(() => {
+    if (value.trim()) {
+      apiKeys.setKey(providerId, value.trim());
+      setTestResult(null);
+    }
+  }, [apiKeys, providerId, value]);
+
+  const handleRemove = useCallback(() => {
+    apiKeys.removeKey(providerId);
+    setValue("");
+    setTestResult(null);
+  }, [apiKeys, providerId]);
+
+  const handleTest = useCallback(async () => {
+    if (!value.trim()) return;
+    apiKeys.setKey(providerId, value.trim());
+    setTesting(true);
+    setTestResult(null);
+    try {
+      const result = await apiKeys.testKey(providerId);
+      setTestResult(result);
+    } catch {
+      setTestResult(false);
+    } finally {
+      setTesting(false);
+    }
+  }, [apiKeys, providerId, value]);
+
+  return (
+
+
+
+ {providerName} + {hasKey && testResult === null && ( + + + Configured + + )} + {testResult === true && ( + + + Valid + + )} + {testResult === false && ( + + + Invalid + + )} +
+ + Get key + + +
+ +
+
+ { + setValue(e.target.value); + setTestResult(null); + }} + onBlur={handleSave} + className="w-full h-10 pl-3 pr-10 rounded-lg border border-input bg-background text-sm font-mono focus:outline-none focus:ring-2 focus:ring-ring" + /> + +
+ + + + {hasKey && ( + + )} +
+
+ ); +} + +const PROVIDER_KEY_URLS: Record = { + openai: "https://platform.openai.com/api-keys", + anthropic: "https://console.anthropic.com/settings/keys", + google: "https://aistudio.google.com/app/apikey", +}; + +export function ApiKeyDialog({ apiKeys, children }: ApiKeyDialogProps) { + const configuredCount = PROVIDERS.filter((p) => apiKeys.hasKey(p.id)).length; + + return ( + + + {children || ( + + )} + + + + + + + API Keys + + + Keys are stored in session storage and cleared when you close the browser. + Your keys never leave your browser. + + + +
+ {PROVIDERS.map((provider) => ( + + ))} +
+
+
+ ); +} diff --git a/components/ai-eval/EvalJudgeConfig.tsx b/components/ai-eval/EvalJudgeConfig.tsx new file mode 100644 index 0000000..f1ef87d --- /dev/null +++ b/components/ai-eval/EvalJudgeConfig.tsx @@ -0,0 +1,181 @@ +import { useCallback } from "react"; +import { Slider } from "@/components/ds/SliderComponent"; +import { + ModelConfig, + CriteriaWeights, + DEFAULT_CRITERIA_WEIGHTS, + PROVIDERS, +} from "@/components/utils/ai-eval-schemas"; +import { UseApiKeysReturn } from "@/components/hooks/useApiKeys"; +import { ChevronDown } from "lucide-react"; + +interface EvalJudgeConfigProps { + judgeModel: ModelConfig | null; + onJudgeModelChange: (model: ModelConfig | null) => void; + weights: CriteriaWeights; + onWeightsChange: (weights: CriteriaWeights) => void; + autoEvaluate: boolean; + onAutoEvaluateChange: (value: boolean) => void; + apiKeys: UseApiKeysReturn; +} + +const CRITERIA = [ + { key: "accuracy" as const, label: "Accuracy", desc: "Factual correctness" }, + { key: "relevance" as const, label: "Relevance", desc: "Addresses the prompt" }, + { key: "clarity" as const, label: "Clarity", desc: "Clear and organized" }, + { key: "completeness" as const, label: "Completeness", desc: "Comprehensive" }, + { key: "conciseness" as const, label: "Conciseness", desc: "Appropriate length" }, +]; + +export function EvalJudgeConfig({ + judgeModel, + onJudgeModelChange, + weights, + onWeightsChange, + autoEvaluate, + onAutoEvaluateChange, + apiKeys, +}: EvalJudgeConfigProps) { + const handleModelChange = (e: React.ChangeEvent) => { + const modelId = e.target.value; + if (!modelId) { + onJudgeModelChange(null); + return; + } + for (const provider of PROVIDERS) { + const model = provider.models.find((m) => m.id === modelId); + if (model) { + onJudgeModelChange(model); + return; + } + } + }; + + const updateWeight = useCallback( + (key: keyof CriteriaWeights, value: number) => { + const newValue = value / 100; + const newWeights = { ...weights, [key]: newValue }; + const total = Object.values(newWeights).reduce((sum, w) => sum + w, 0); + if (total > 0) { + Object.keys(newWeights).forEach((k) => { + newWeights[k as keyof CriteriaWeights] /= total; + }); + } + onWeightsChange(newWeights); + }, + [weights, onWeightsChange] + ); + + const resetWeights = () => onWeightsChange(DEFAULT_CRITERIA_WEIGHTS); + + return ( +
+
+ + Judge Configuration + +
+ +
+
+ {/* Judge Model Selection */} +
+ +
+ + +
+

+ Recommended: use a judge model different from the models being compared +

+
+ + {/* Auto-evaluate Toggle */} +
+ +
+ + {/* Criteria Sliders */} +
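The sliders below feed `updateWeight` above, which renormalizes after every change so the five weights always sum to 1. A worked sketch of that renormalization, with sample values:

```ts
// Slide accuracy to 100% while the other four sliders sit at 25%:
const weights = {
  accuracy: 1.0,
  relevance: 0.25,
  clarity: 0.25,
  completeness: 0.25,
  conciseness: 0.25,
};
const total = Object.values(weights).reduce((sum, w) => sum + w, 0); // 2.0

// Divide every weight by the new total, exactly as updateWeight does.
(Object.keys(weights) as Array<keyof typeof weights>).forEach((k) => {
  weights[k] /= total;
});
// Result: accuracy 0.5, each remaining criterion 0.125, still summing to 1.
```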
+ {CRITERIA.map(({ key, label, desc }) => ( +
+
+ {label} + + {Math.round(weights[key] * 100)}% + +
+ updateWeight(key, values[0])} + min={0} + max={100} + step={5} + className="w-full" + /> +

{desc}

+
+ ))} +
+
+
+ ); +} diff --git a/components/ai-eval/EvalModelSelector.tsx b/components/ai-eval/EvalModelSelector.tsx new file mode 100644 index 0000000..684feb4 --- /dev/null +++ b/components/ai-eval/EvalModelSelector.tsx @@ -0,0 +1,88 @@ +import { useCallback, useMemo } from "react"; +import { PROVIDERS, ModelConfig } from "@/components/utils/ai-eval-schemas"; +import { UseApiKeysReturn } from "@/components/hooks/useApiKeys"; +import { Check } from "lucide-react"; + +interface EvalModelSelectorProps { + selectedModels: ModelConfig[]; + onModelsChange: (models: ModelConfig[]) => void; + apiKeys: UseApiKeysReturn; + maxSelections: number; +} + +export function EvalModelSelector({ + selectedModels, + onModelsChange, + apiKeys, + maxSelections, +}: EvalModelSelectorProps) { + const selectedIds = useMemo( + () => new Set(selectedModels.map((m) => m.id)), + [selectedModels] + ); + + const handleToggle = useCallback( + (model: ModelConfig) => { + if (selectedIds.has(model.id)) { + onModelsChange(selectedModels.filter((m) => m.id !== model.id)); + } else if (selectedModels.length < maxSelections) { + onModelsChange([...selectedModels, model]); + } else if (maxSelections === 1) { + onModelsChange([model]); + } + }, + [selectedModels, onModelsChange, selectedIds, maxSelections] + ); + + return ( +
+ {PROVIDERS.map((provider) => { + const hasKey = apiKeys.hasKey(provider.id); + + return ( +
+
+ {provider.name} + {!hasKey && ( + + No API key + + )} +
+ +
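The model buttons below follow the `handleToggle` rules defined earlier in this component. A pure-function restatement of that selection logic; `toggle` is a hypothetical helper for illustration, not part of this diff:

```ts
function toggle(selected: string[], id: string, max: number): string[] {
  if (selected.includes(id)) return selected.filter((m) => m !== id); // toggle off
  if (selected.length < max) return [...selected, id]; // add while under the cap
  if (max === 1) return [id]; // single-select (judge picker): replace outright
  return selected; // at capacity: ignore the click
}
```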
+ {provider.models.map((model) => { + const isSelected = selectedIds.has(model.id); + const isDisabled = + !hasKey || (!isSelected && selectedModels.length >= maxSelections && maxSelections > 1); + + return ( + + ); + })} +
+
+ ); + })} +
+ ); +} diff --git a/components/ai-eval/EvalResultCard.tsx b/components/ai-eval/EvalResultCard.tsx new file mode 100644 index 0000000..16ee33d --- /dev/null +++ b/components/ai-eval/EvalResultCard.tsx @@ -0,0 +1,169 @@ +import { useState } from "react"; +import { JudgeEvaluation, getScoreColorClass } from "@/components/utils/ai-eval-schemas"; +import { Loader2, AlertCircle, Copy, Check, Trophy, ChevronDown, ChevronUp } from "lucide-react"; + +interface EvalResultCardProps { + label: string; + sublabel?: string; + output: string | null; + evaluation: JudgeEvaluation | null; + isLoading: boolean; + error: string | null; + isWinner?: boolean; + latencyMs?: number; +} + +export function EvalResultCard({ + label, + sublabel, + output, + evaluation, + isLoading, + error, + isWinner, + latencyMs, +}: EvalResultCardProps) { + const [copied, setCopied] = useState(false); + const [showScoreDetails, setShowScoreDetails] = useState(false); + + const handleCopy = async () => { + if (!output) return; + await navigator.clipboard.writeText(output); + setCopied(true); + setTimeout(() => setCopied(false), 2000); + }; + + return ( +
+ {/* Header */} +
+
+ {isWinner && } +
+ {label} + {sublabel && ( + {sublabel} + )} +
+
+ +
+ {latencyMs !== undefined && ( + + {(latencyMs / 1000).toFixed(2)}s + + )} + {evaluation && ( +
+ {evaluation.overallScore.toFixed(1)} +
+ )} +
+
+ + {/* Content */} +
+ {isLoading ? ( +
+ + Generating response... +
+ ) : error ? ( +
+
+ +

{error}

+
+
+ ) : output ? ( +
+ {/* Output Text */} +
+
+
{output}
+
+ +
+ + {/* Evaluation Scores */} + {evaluation && ( +
+ + + {showScoreDetails && ( +
+ {[ + { key: "accuracy", label: "Accuracy" }, + { key: "relevance", label: "Relevance" }, + { key: "clarity", label: "Clarity" }, + { key: "completeness", label: "Completeness" }, + { key: "conciseness", label: "Conciseness" }, + ].map(({ key, label }) => { + const score = evaluation.scores[key as keyof typeof evaluation.scores]; + return ( +
+ {label} +
+
= 8 ? "bg-green-500" : score >= 5 ? "bg-yellow-500" : "bg-red-500" + }`} + style={{ width: `${score * 10}%` }} + /> +
+ + {score} + +
+ ); + })} + + {evaluation.reasoning && ( +

+ {evaluation.reasoning} +

+ )} +
+ )} +
+ )} +
+ ) : ( +
+ Run evaluation to see output +
+ )} +
+
+ ); +} diff --git a/components/hooks/useApiKeys.ts b/components/hooks/useApiKeys.ts new file mode 100644 index 0000000..3f5cb0e --- /dev/null +++ b/components/hooks/useApiKeys.ts @@ -0,0 +1,99 @@ +import { useState, useEffect, useCallback } from "react"; +import { + StoredApiKeys, + ProviderId, + StoredApiKeysSchema, +} from "@/components/utils/ai-eval-schemas"; +import { validateApiKey } from "@/components/utils/ai-eval-providers"; + +const STORAGE_KEY = "jam-ai-eval-keys"; + +export interface UseApiKeysReturn { + keys: StoredApiKeys; + setKey: (providerId: ProviderId, key: string) => void; + removeKey: (providerId: ProviderId) => void; + hasKey: (providerId: ProviderId) => boolean; + getKey: (providerId: ProviderId) => string | undefined; + testKey: (providerId: ProviderId) => Promise; + isLoaded: boolean; +} + +export function useApiKeys(): UseApiKeysReturn { + const [keys, setKeys] = useState({}); + const [isLoaded, setIsLoaded] = useState(false); + + // Load from sessionStorage on mount + useEffect(() => { + try { + const stored = sessionStorage.getItem(STORAGE_KEY); + if (stored) { + const parsed = JSON.parse(stored); + const validated = StoredApiKeysSchema.safeParse(parsed); + if (validated.success) { + setKeys(validated.data); + } + } + } catch (error) { + console.error("Failed to load API keys from sessionStorage:", error); + } + setIsLoaded(true); + }, []); + + // Save to sessionStorage whenever keys change + useEffect(() => { + if (!isLoaded) return; + try { + sessionStorage.setItem(STORAGE_KEY, JSON.stringify(keys)); + } catch (error) { + console.error("Failed to save API keys to sessionStorage:", error); + } + }, [keys, isLoaded]); + + const setKey = useCallback((providerId: ProviderId, key: string) => { + setKeys((prev) => ({ + ...prev, + [providerId]: key, + })); + }, []); + + const removeKey = useCallback((providerId: ProviderId) => { + setKeys((prev) => { + const next = { ...prev }; + delete next[providerId]; + return next; + }); + }, []); + + const hasKey = useCallback( + (providerId: ProviderId) => { + return Boolean(keys[providerId]); + }, + [keys] + ); + + const getKey = useCallback( + (providerId: ProviderId) => { + return keys[providerId]; + }, + [keys] + ); + + const testKey = useCallback( + async (providerId: ProviderId): Promise => { + const key = keys[providerId]; + if (!key) return false; + return validateApiKey(providerId, key); + }, + [keys] + ); + + return { + keys, + setKey, + removeKey, + hasKey, + getKey, + testKey, + isLoaded, + }; +} diff --git a/components/utils/ai-eval-judge.ts b/components/utils/ai-eval-judge.ts new file mode 100644 index 0000000..9063097 --- /dev/null +++ b/components/utils/ai-eval-judge.ts @@ -0,0 +1,362 @@ +import { + JudgeEvaluation, + JudgeEvaluationSchema, + CriteriaWeights, + calculateWeightedScore, + ChatMessage, +} from "./ai-eval-schemas"; +import { chat } from "./ai-eval-providers"; + +// ============================================================================ +// Judge System Prompts +// ============================================================================ + +const SINGLE_RESPONSE_JUDGE_PROMPT = `You are an expert AI output evaluator. Your task is to analyze an AI response and score it objectively. + +Evaluate the response on these criteria (1-10 scale): + +1. ACCURACY (1-10): Is the information factually correct and reliable? + - 1-3: Contains significant errors or misinformation + - 4-6: Mostly accurate with minor issues + - 7-10: Highly accurate and reliable + +2. 
RELEVANCE (1-10): Does it directly address what was asked?
+   - 1-3: Off-topic or misses the point
+   - 4-6: Partially addresses the question
+   - 7-10: Directly and fully addresses the query
+
+3. CLARITY (1-10): Is it well-organized and easy to understand?
+   - 1-3: Confusing or poorly structured
+   - 4-6: Understandable but could be clearer
+   - 7-10: Crystal clear and well-organized
+
+4. COMPLETENESS (1-10): Does it cover all important aspects?
+   - 1-3: Missing critical information
+   - 4-6: Covers basics but lacks depth
+   - 7-10: Comprehensive and thorough
+
+5. CONCISENESS (1-10): Is it appropriately detailed without being verbose?
+   - 1-3: Extremely verbose or too brief
+   - 4-6: Could be more concise or needs more detail
+   - 7-10: Perfectly balanced length
+
+You MUST respond with valid JSON matching this exact schema:
+{
+  "scores": {
+    "accuracy": <number 1-10>,
+    "relevance": <number 1-10>,
+    "clarity": <number 1-10>,
+    "completeness": <number 1-10>,
+    "conciseness": <number 1-10>
+  },
+  "overallScore": <number 1-10>,
+  "reasoning": "<brief explanation of the scores>"
+}
+
+Be fair, objective, and consistent in your scoring.`;
+
+const COMPARISON_JUDGE_PROMPT = `You are an expert AI output evaluator. Your task is to compare two AI responses (A and B) to the same prompt and determine which is better.
+
+Evaluate BOTH responses on these criteria (1-10 scale):
+
+1. ACCURACY: Is the information factually correct?
+2. RELEVANCE: Does it directly address what was asked?
+3. CLARITY: Is it well-organized and easy to understand?
+4. COMPLETENESS: Does it cover all important aspects?
+5. CONCISENESS: Is it appropriately detailed without being verbose?
+
+You MUST respond with valid JSON matching this exact schema:
+{
+  "responseA": {
+    "scores": {
+      "accuracy": <number 1-10>,
+      "relevance": <number 1-10>,
+      "clarity": <number 1-10>,
+      "completeness": <number 1-10>,
+      "conciseness": <number 1-10>
+    },
+    "overallScore": <number 1-10>,
+    "reasoning": "<brief explanation>"
+  },
+  "responseB": {
+    "scores": {
+      "accuracy": <number 1-10>,
+      "relevance": <number 1-10>,
+      "clarity": <number 1-10>,
+      "completeness": <number 1-10>,
+      "conciseness": <number 1-10>
+    },
+    "overallScore": <number 1-10>,
+    "reasoning": "<brief explanation>"
+  },
+  "winner": "<'A', 'B', or 'tie'>",
+  "comparisonReasoning": "<explanation of which response is better and why>"
+}
+
+Be fair, objective, and explain your reasoning clearly.`;
+
+// ============================================================================
+// Judge Evaluation Functions
+// ============================================================================
+
+interface JudgeSingleParams {
+  apiKey: string;
+  judgeModel: string;
+  originalPrompt: string;
+  response: string;
+  weights: CriteriaWeights;
+}
+
+interface JudgeCompareParams {
+  apiKey: string;
+  judgeModel: string;
+  originalPrompt: string;
+  responseA: string;
+  responseB: string;
+  weights: CriteriaWeights;
+}
+
+interface ComparisonResult {
+  evaluationA: JudgeEvaluation;
+  evaluationB: JudgeEvaluation;
+  winner: "A" | "B" | "tie";
+  comparisonReasoning: string;
+}
+
+/**
+ * Evaluate a single response using LLM-as-judge
+ */
+export async function judgeSingleResponse(
+  params: JudgeSingleParams
+): Promise<JudgeEvaluation> {
+  const { apiKey, judgeModel, originalPrompt, response, weights } = params;
+
+  const userMessage = `## Original Prompt
+${originalPrompt}
+
+## AI Response to Evaluate
+${response}
+
+Evaluate this response now.`;
+
+  const messages: ChatMessage[] = [
+    { role: "system", content: SINGLE_RESPONSE_JUDGE_PROMPT },
+    { role: "user", content: userMessage },
+  ];
+
+  const result = await chat(apiKey, {
+    model: judgeModel,
+    messages,
+    jsonMode: true,
+    temperature: 0.3, // Lower temperature for more consistent scoring
+  });
+
+  // Parse and validate the response
+  const parsed = parseJudgeResponse(result.content);
+
+  // Recalculate
overall score with user's weights + parsed.overallScore = calculateWeightedScore(parsed.scores, weights); + + return parsed; +} + +/** + * Compare two responses using LLM-as-judge (pairwise comparison) + */ +export async function judgeCompareResponses( + params: JudgeCompareParams +): Promise { + const { apiKey, judgeModel, originalPrompt, responseA, responseB, weights } = + params; + + const userMessage = `## Original Prompt +${originalPrompt} + +## Response A +${responseA} + +## Response B +${responseB} + +Compare these responses and provide your evaluation.`; + + const messages: ChatMessage[] = [ + { role: "system", content: COMPARISON_JUDGE_PROMPT }, + { role: "user", content: userMessage }, + ]; + + const result = await chat(apiKey, { + model: judgeModel, + messages, + jsonMode: true, + temperature: 0.3, + }); + + // Parse the comparison response + const parsed = parseComparisonResponse(result.content); + + // Recalculate overall scores with user's weights + parsed.evaluationA.overallScore = calculateWeightedScore( + parsed.evaluationA.scores, + weights + ); + parsed.evaluationB.overallScore = calculateWeightedScore( + parsed.evaluationB.scores, + weights + ); + + // Re-determine winner based on weighted scores + if (parsed.evaluationA.overallScore > parsed.evaluationB.overallScore + 0.5) { + parsed.winner = "A"; + } else if ( + parsed.evaluationB.overallScore > + parsed.evaluationA.overallScore + 0.5 + ) { + parsed.winner = "B"; + } else { + parsed.winner = "tie"; + } + + return parsed; +} + +// ============================================================================ +// Response Parsing +// ============================================================================ + +function parseJudgeResponse(content: string): JudgeEvaluation { + try { + // Try to extract JSON from the response + const jsonMatch = content.match(/\{[\s\S]*\}/); + if (!jsonMatch) { + throw new Error("No JSON found in response"); + } + + const parsed = JSON.parse(jsonMatch[0]); + const validated = JudgeEvaluationSchema.parse(parsed); + return validated; + } catch (error) { + console.error("Failed to parse judge response:", content, error); + + // Return a default evaluation on parse failure + return { + scores: { + accuracy: 5, + relevance: 5, + clarity: 5, + completeness: 5, + conciseness: 5, + }, + overallScore: 5, + reasoning: "Failed to parse evaluation. 
Using default scores.", + }; + } +} + +function parseComparisonResponse(content: string): ComparisonResult { + try { + const jsonMatch = content.match(/\{[\s\S]*\}/); + if (!jsonMatch) { + throw new Error("No JSON found in response"); + } + + const parsed = JSON.parse(jsonMatch[0]); + + // Validate both evaluations + const evalA = JudgeEvaluationSchema.parse({ + scores: parsed.responseA.scores, + overallScore: parsed.responseA.overallScore, + reasoning: parsed.responseA.reasoning, + }); + + const evalB = JudgeEvaluationSchema.parse({ + scores: parsed.responseB.scores, + overallScore: parsed.responseB.overallScore, + reasoning: parsed.responseB.reasoning, + }); + + return { + evaluationA: evalA, + evaluationB: evalB, + winner: parsed.winner as "A" | "B" | "tie", + comparisonReasoning: parsed.comparisonReasoning || "", + }; + } catch (error) { + console.error("Failed to parse comparison response:", content, error); + + // Return default evaluations on parse failure + const defaultEval: JudgeEvaluation = { + scores: { + accuracy: 5, + relevance: 5, + clarity: 5, + completeness: 5, + conciseness: 5, + }, + overallScore: 5, + reasoning: "Failed to parse evaluation.", + }; + + return { + evaluationA: { ...defaultEval }, + evaluationB: { ...defaultEval }, + winner: "tie", + comparisonReasoning: "Failed to parse comparison. Using default scores.", + }; + } +} + +// ============================================================================ +// Batch Evaluation +// ============================================================================ + +interface BatchEvalParams { + apiKey: string; + judgeModel: string; + evaluations: Array<{ + id: string; + originalPrompt: string; + response: string; + }>; + weights: CriteriaWeights; +} + +/** + * Evaluate multiple responses in batch (sequential to respect rate limits) + */ +export async function judgeBatchResponses( + params: BatchEvalParams +): Promise> { + const results = new Map(); + + for (const item of params.evaluations) { + try { + const evaluation = await judgeSingleResponse({ + apiKey: params.apiKey, + judgeModel: params.judgeModel, + originalPrompt: item.originalPrompt, + response: item.response, + weights: params.weights, + }); + results.set(item.id, evaluation); + } catch (error) { + console.error(`Failed to evaluate ${item.id}:`, error); + results.set(item.id, { + scores: { + accuracy: 0, + relevance: 0, + clarity: 0, + completeness: 0, + conciseness: 0, + }, + overallScore: 0, + reasoning: + error instanceof Error + ? 
error.message + : "Evaluation failed", + }); + } + } + + return results; +} diff --git a/components/utils/ai-eval-providers.ts b/components/utils/ai-eval-providers.ts new file mode 100644 index 0000000..9568cc6 --- /dev/null +++ b/components/utils/ai-eval-providers.ts @@ -0,0 +1,309 @@ +import { + ChatParams, + ChatResponse, + ProviderId, + getProviderById, + getModelById, +} from "./ai-eval-schemas"; + +// ============================================================================ +// Provider Adapter Interface +// ============================================================================ + +export interface ProviderAdapter { + id: ProviderId; + name: string; + chat(apiKey: string, params: ChatParams): Promise; + validateKey(apiKey: string): Promise; +} + +// ============================================================================ +// OpenAI Adapter +// ============================================================================ + +const openaiAdapter: ProviderAdapter = { + id: "openai", + name: "OpenAI", + + async chat(apiKey: string, params: ChatParams): Promise { + const provider = getProviderById("openai"); + if (!provider) throw new Error("OpenAI provider not found"); + + const body: Record = { + model: params.model, + messages: params.messages, + max_tokens: params.maxTokens ?? 4096, + temperature: params.temperature ?? 0.7, + }; + + if (params.jsonMode) { + body.response_format = { type: "json_object" }; + } + + const response = await fetch(provider.apiEndpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const error = await response.json().catch(() => ({})); + throw new Error( + error.error?.message || `OpenAI API error: ${response.status}` + ); + } + + const data = await response.json(); + const choice = data.choices?.[0]; + + return { + content: choice?.message?.content || "", + model: data.model, + usage: data.usage + ? { + promptTokens: data.usage.prompt_tokens, + completionTokens: data.usage.completion_tokens, + totalTokens: data.usage.total_tokens, + } + : undefined, + finishReason: choice?.finish_reason, + }; + }, + + async validateKey(apiKey: string): Promise { + try { + const response = await fetch("https://api.openai.com/v1/models", { + headers: { + Authorization: `Bearer ${apiKey}`, + }, + }); + return response.ok; + } catch { + return false; + } + }, +}; + +// ============================================================================ +// Anthropic Adapter +// ============================================================================ + +const anthropicAdapter: ProviderAdapter = { + id: "anthropic", + name: "Anthropic", + + async chat(apiKey: string, params: ChatParams): Promise { + const provider = getProviderById("anthropic"); + if (!provider) throw new Error("Anthropic provider not found"); + + // Anthropic uses a different message format + // System message is separate from the messages array + const systemMessage = params.messages.find((m) => m.role === "system"); + const otherMessages = params.messages.filter((m) => m.role !== "system"); + + const body: Record = { + model: params.model, + max_tokens: params.maxTokens ?? 4096, + messages: otherMessages.map((m) => ({ + role: m.role === "assistant" ? 
"assistant" : "user", + content: m.content, + })), + }; + + if (systemMessage) { + body.system = systemMessage.content; + } + + const response = await fetch(provider.apiEndpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + "x-api-key": apiKey, + "anthropic-version": "2023-06-01", + "anthropic-dangerous-direct-browser-access": "true", + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const error = await response.json().catch(() => ({})); + throw new Error( + error.error?.message || `Anthropic API error: ${response.status}` + ); + } + + const data = await response.json(); + + return { + content: data.content?.[0]?.text || "", + model: data.model, + usage: data.usage + ? { + promptTokens: data.usage.input_tokens, + completionTokens: data.usage.output_tokens, + totalTokens: data.usage.input_tokens + data.usage.output_tokens, + } + : undefined, + finishReason: data.stop_reason, + }; + }, + + async validateKey(apiKey: string): Promise { + try { + // Anthropic doesn't have a simple models endpoint, so we make a minimal request + const response = await fetch("https://api.anthropic.com/v1/messages", { + method: "POST", + headers: { + "Content-Type": "application/json", + "x-api-key": apiKey, + "anthropic-version": "2023-06-01", + "anthropic-dangerous-direct-browser-access": "true", + }, + body: JSON.stringify({ + model: "claude-3-5-haiku-20241022", + max_tokens: 1, + messages: [{ role: "user", content: "Hi" }], + }), + }); + return response.ok; + } catch { + return false; + } + }, +}; + +// ============================================================================ +// Google AI Adapter +// ============================================================================ + +const googleAdapter: ProviderAdapter = { + id: "google", + name: "Google AI", + + async chat(apiKey: string, params: ChatParams): Promise { + const model = getModelById(params.model); + if (!model) throw new Error(`Model not found: ${params.model}`); + + const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${params.model}:generateContent?key=${apiKey}`; + + // Convert messages to Google format + const systemMessage = params.messages.find((m) => m.role === "system"); + const otherMessages = params.messages.filter((m) => m.role !== "system"); + + const contents = otherMessages.map((m) => ({ + role: m.role === "assistant" ? "model" : "user", + parts: [{ text: m.content }], + })); + + const body: Record = { + contents, + generationConfig: { + maxOutputTokens: params.maxTokens ?? 4096, + temperature: params.temperature ?? 0.7, + }, + }; + + if (systemMessage) { + body.systemInstruction = { + parts: [{ text: systemMessage.content }], + }; + } + + if (params.jsonMode) { + (body.generationConfig as Record).responseMimeType = + "application/json"; + } + + const response = await fetch(endpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const error = await response.json().catch(() => ({})); + throw new Error( + error.error?.message || `Google AI API error: ${response.status}` + ); + } + + const data = await response.json(); + const candidate = data.candidates?.[0]; + const content = candidate?.content?.parts?.[0]?.text || ""; + + return { + content, + model: params.model, + usage: data.usageMetadata + ? 
{ + promptTokens: data.usageMetadata.promptTokenCount || 0, + completionTokens: data.usageMetadata.candidatesTokenCount || 0, + totalTokens: data.usageMetadata.totalTokenCount || 0, + } + : undefined, + finishReason: candidate?.finishReason, + }; + }, + + async validateKey(apiKey: string): Promise { + try { + const response = await fetch( + `https://generativelanguage.googleapis.com/v1beta/models?key=${apiKey}` + ); + return response.ok; + } catch { + return false; + } + }, +}; + +// ============================================================================ +// Adapter Registry +// ============================================================================ + +const adapters: Record = { + openai: openaiAdapter, + anthropic: anthropicAdapter, + google: googleAdapter, +}; + +export function getAdapter(providerId: ProviderId): ProviderAdapter { + const adapter = adapters[providerId]; + if (!adapter) { + throw new Error(`No adapter found for provider: ${providerId}`); + } + return adapter; +} + +export function getAdapterForModel(modelId: string): ProviderAdapter { + const model = getModelById(modelId); + if (!model) { + throw new Error(`Model not found: ${modelId}`); + } + return getAdapter(model.providerId); +} + +// ============================================================================ +// Unified Chat Function +// ============================================================================ + +export async function chat( + apiKey: string, + params: ChatParams +): Promise { + const adapter = getAdapterForModel(params.model); + return adapter.chat(apiKey, params); +} + +export async function validateApiKey( + providerId: ProviderId, + apiKey: string +): Promise { + const adapter = getAdapter(providerId); + return adapter.validateKey(apiKey); +} diff --git a/components/utils/ai-eval-schemas.test.ts b/components/utils/ai-eval-schemas.test.ts new file mode 100644 index 0000000..746ab69 --- /dev/null +++ b/components/utils/ai-eval-schemas.test.ts @@ -0,0 +1,354 @@ +import { + extractVariables, + resolveTemplate, + calculateWeightedScore, + getScoreColorClass, + getScoreBgClass, + getModelById, + getProviderById, + getProviderForModel, + JudgeEvaluationSchema, + ScoreBreakdownSchema, + DEFAULT_CRITERIA_WEIGHTS, +} from "./ai-eval-schemas"; + +describe("extractVariables", () => { + it("extracts single variable", () => { + const result = extractVariables("Hello {{name}}!"); + expect(result).toEqual(["name"]); + }); + + it("extracts multiple variables", () => { + const result = extractVariables("{{greeting}} {{name}}, how are you?"); + expect(result).toEqual(["greeting", "name"]); + }); + + it("extracts duplicate variables only once", () => { + const result = extractVariables("{{name}} and {{name}} again"); + expect(result).toEqual(["name"]); + }); + + it("returns empty array when no variables", () => { + const result = extractVariables("Hello world!"); + expect(result).toEqual([]); + }); + + it("handles variables with underscores", () => { + const result = extractVariables("{{first_name}} {{last_name}}"); + expect(result).toEqual(["first_name", "last_name"]); + }); + + it("handles variables with numbers", () => { + const result = extractVariables("{{var1}} {{var2}}"); + expect(result).toEqual(["var1", "var2"]); + }); +}); + +describe("resolveTemplate", () => { + it("resolves single variable", () => { + const result = resolveTemplate("Hello {{name}}!", { name: "World" }); + expect(result).toBe("Hello World!"); + }); + + it("resolves multiple variables", () => { + const result = 
resolveTemplate("{{greeting}} {{name}}!", { + greeting: "Hi", + name: "Alice", + }); + expect(result).toBe("Hi Alice!"); + }); + + it("keeps unreplaced variables as-is", () => { + const result = resolveTemplate("Hello {{name}} and {{other}}!", { + name: "World", + }); + expect(result).toBe("Hello World and {{other}}!"); + }); + + it("handles empty variables object", () => { + const result = resolveTemplate("Hello {{name}}!", {}); + expect(result).toBe("Hello {{name}}!"); + }); + + it("handles template without variables", () => { + const result = resolveTemplate("Hello World!", { name: "Test" }); + expect(result).toBe("Hello World!"); + }); +}); + +describe("calculateWeightedScore", () => { + it("calculates weighted score correctly", () => { + const scores = { + accuracy: 8, + relevance: 9, + clarity: 7, + completeness: 8, + conciseness: 6, + }; + + const weights = { + accuracy: 0.2, + relevance: 0.2, + clarity: 0.2, + completeness: 0.2, + conciseness: 0.2, + }; + + const result = calculateWeightedScore(scores, weights); + // (8*0.2 + 9*0.2 + 7*0.2 + 8*0.2 + 6*0.2) = 7.6 + expect(result).toBe(7.6); + }); + + it("handles uneven weights", () => { + const scores = { + accuracy: 10, + relevance: 5, + clarity: 5, + completeness: 5, + conciseness: 5, + }; + + const weights = { + accuracy: 0.5, + relevance: 0.125, + clarity: 0.125, + completeness: 0.125, + conciseness: 0.125, + }; + + const result = calculateWeightedScore(scores, weights); + // (10*0.5 + 5*0.125 + 5*0.125 + 5*0.125 + 5*0.125) = 7.5 + expect(result).toBe(7.5); + }); + + it("uses default weights", () => { + const scores = { + accuracy: 8, + relevance: 8, + clarity: 8, + completeness: 8, + conciseness: 8, + }; + + const result = calculateWeightedScore(scores, DEFAULT_CRITERIA_WEIGHTS); + expect(result).toBe(8); + }); +}); + +describe("getScoreColorClass", () => { + it("returns green for high scores (8-10)", () => { + expect(getScoreColorClass(8)).toContain("green"); + expect(getScoreColorClass(9)).toContain("green"); + expect(getScoreColorClass(10)).toContain("green"); + }); + + it("returns yellow for medium scores (5-7)", () => { + expect(getScoreColorClass(5)).toContain("yellow"); + expect(getScoreColorClass(6)).toContain("yellow"); + expect(getScoreColorClass(7)).toContain("yellow"); + }); + + it("returns red for low scores (1-4)", () => { + expect(getScoreColorClass(1)).toContain("red"); + expect(getScoreColorClass(4)).toContain("red"); + }); +}); + +describe("getScoreBgClass", () => { + it("returns green background for high scores", () => { + expect(getScoreBgClass(8)).toContain("green"); + }); + + it("returns yellow background for medium scores", () => { + expect(getScoreBgClass(6)).toContain("yellow"); + }); + + it("returns red background for low scores", () => { + expect(getScoreBgClass(3)).toContain("red"); + }); +}); + +describe("getModelById", () => { + it("finds OpenAI model", () => { + const model = getModelById("gpt-4o"); + expect(model).toBeDefined(); + expect(model?.name).toBe("GPT-4o"); + expect(model?.providerId).toBe("openai"); + }); + + it("finds Anthropic model", () => { + const model = getModelById("claude-3-5-sonnet-20241022"); + expect(model).toBeDefined(); + expect(model?.name).toBe("Claude 3.5 Sonnet"); + expect(model?.providerId).toBe("anthropic"); + }); + + it("finds Google model", () => { + const model = getModelById("gemini-2.0-flash-exp"); + expect(model).toBeDefined(); + expect(model?.name).toBe("Gemini 2.0 Flash"); + expect(model?.providerId).toBe("google"); + }); + + it("returns undefined for unknown 
model", () => { + const model = getModelById("unknown-model"); + expect(model).toBeUndefined(); + }); +}); + +describe("getProviderById", () => { + it("finds OpenAI provider", () => { + const provider = getProviderById("openai"); + expect(provider).toBeDefined(); + expect(provider?.name).toBe("OpenAI"); + }); + + it("finds Anthropic provider", () => { + const provider = getProviderById("anthropic"); + expect(provider).toBeDefined(); + expect(provider?.name).toBe("Anthropic"); + }); + + it("finds Google provider", () => { + const provider = getProviderById("google"); + expect(provider).toBeDefined(); + expect(provider?.name).toBe("Google AI"); + }); +}); + +describe("getProviderForModel", () => { + it("returns correct provider for model", () => { + const provider = getProviderForModel("gpt-4o"); + expect(provider?.id).toBe("openai"); + }); + + it("returns undefined for unknown model", () => { + const provider = getProviderForModel("unknown"); + expect(provider).toBeUndefined(); + }); +}); + +describe("Zod Schemas", () => { + describe("ScoreBreakdownSchema", () => { + it("validates valid scores", () => { + const valid = { + accuracy: 8, + relevance: 7, + clarity: 9, + completeness: 6, + conciseness: 8, + }; + + const result = ScoreBreakdownSchema.safeParse(valid); + expect(result.success).toBe(true); + }); + + it("rejects scores below 1", () => { + const invalid = { + accuracy: 0, + relevance: 7, + clarity: 9, + completeness: 6, + conciseness: 8, + }; + + const result = ScoreBreakdownSchema.safeParse(invalid); + expect(result.success).toBe(false); + }); + + it("rejects scores above 10", () => { + const invalid = { + accuracy: 11, + relevance: 7, + clarity: 9, + completeness: 6, + conciseness: 8, + }; + + const result = ScoreBreakdownSchema.safeParse(invalid); + expect(result.success).toBe(false); + }); + + it("rejects missing fields", () => { + const invalid = { + accuracy: 8, + relevance: 7, + }; + + const result = ScoreBreakdownSchema.safeParse(invalid); + expect(result.success).toBe(false); + }); + }); + + describe("JudgeEvaluationSchema", () => { + it("validates valid evaluation", () => { + const valid = { + scores: { + accuracy: 8, + relevance: 7, + clarity: 9, + completeness: 6, + conciseness: 8, + }, + overallScore: 7.6, + reasoning: "Good response overall.", + }; + + const result = JudgeEvaluationSchema.safeParse(valid); + expect(result.success).toBe(true); + }); + + it("validates evaluation with winner", () => { + const valid = { + scores: { + accuracy: 8, + relevance: 7, + clarity: 9, + completeness: 6, + conciseness: 8, + }, + overallScore: 7.6, + reasoning: "Good response overall.", + winner: "A", + }; + + const result = JudgeEvaluationSchema.safeParse(valid); + expect(result.success).toBe(true); + }); + + it("rejects invalid winner value", () => { + const invalid = { + scores: { + accuracy: 8, + relevance: 7, + clarity: 9, + completeness: 6, + conciseness: 8, + }, + overallScore: 7.6, + reasoning: "Good response overall.", + winner: "C", + }; + + const result = JudgeEvaluationSchema.safeParse(invalid); + expect(result.success).toBe(false); + }); + + it("rejects reasoning over 1000 chars", () => { + const invalid = { + scores: { + accuracy: 8, + relevance: 7, + clarity: 9, + completeness: 6, + conciseness: 8, + }, + overallScore: 7.6, + reasoning: "a".repeat(1001), + }; + + const result = JudgeEvaluationSchema.safeParse(invalid); + expect(result.success).toBe(false); + }); + }); +}); diff --git a/components/utils/ai-eval-schemas.ts b/components/utils/ai-eval-schemas.ts new file 
mode 100644 index 0000000..a34385f --- /dev/null +++ b/components/utils/ai-eval-schemas.ts @@ -0,0 +1,354 @@ +import { z } from "zod"; + +// ============================================================================ +// Provider & Model Types +// ============================================================================ + +export type ProviderId = "openai" | "anthropic" | "google"; + +export interface ModelConfig { + id: string; + name: string; + providerId: ProviderId; + maxTokens: number; + supportsJsonMode: boolean; +} + +export interface ProviderConfig { + id: ProviderId; + name: string; + models: ModelConfig[]; + apiEndpoint: string; +} + +// ============================================================================ +// API Key Management +// ============================================================================ + +export interface StoredApiKeys { + openai?: string; + anthropic?: string; + google?: string; +} + +export const StoredApiKeysSchema = z.object({ + openai: z.string().optional(), + anthropic: z.string().optional(), + google: z.string().optional(), +}); + +// ============================================================================ +// Chat Message Types +// ============================================================================ + +export type MessageRole = "system" | "user" | "assistant"; + +export interface ChatMessage { + role: MessageRole; + content: string; +} + +export interface ChatParams { + model: string; + messages: ChatMessage[]; + maxTokens?: number; + temperature?: number; + jsonMode?: boolean; +} + +export interface ChatResponse { + content: string; + model: string; + usage?: { + promptTokens: number; + completionTokens: number; + totalTokens: number; + }; + finishReason?: string; +} + +// ============================================================================ +// Evaluation Types +// ============================================================================ + +export type ComparisonMode = "model-vs-model" | "prompt-vs-prompt"; + +export interface PromptConfig { + id: string; + systemPrompt: string; + userPrompt: string; + variables: Record; +} + +export interface EvaluationInput { + mode: ComparisonMode; + prompts: PromptConfig[]; + models: ModelConfig[]; + judgeModel: ModelConfig; + criteriaWeights: CriteriaWeights; +} + +export interface EvaluationResult { + id: string; + promptId: string; + modelId: string; + input: { + systemPrompt: string; + userPrompt: string; + resolvedPrompt: string; + }; + output: string; + evaluation?: JudgeEvaluation; + error?: string; + latencyMs: number; + timestamp: number; +} + +// ============================================================================ +// LLM-as-Judge Types & Schemas +// ============================================================================ + +export interface ScoreBreakdown { + accuracy: number; + relevance: number; + clarity: number; + completeness: number; + conciseness: number; +} + +export interface JudgeEvaluation { + scores: ScoreBreakdown; + overallScore: number; + reasoning: string; + winner?: "A" | "B" | "tie"; +} + +export interface CriteriaWeights { + accuracy: number; + relevance: number; + clarity: number; + completeness: number; + conciseness: number; +} + +// Zod schema for validating judge responses +export const ScoreBreakdownSchema = z.object({ + accuracy: z.number().min(1).max(10), + relevance: z.number().min(1).max(10), + clarity: z.number().min(1).max(10), + completeness: z.number().min(1).max(10), + conciseness: z.number().min(1).max(10), +}); + 
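A minimal sketch of how a judge reply flows through these schemas once `JudgeEvaluationSchema` (defined just below) is applied; the sample scores are illustrative:

```ts
import {
  JudgeEvaluationSchema,
  calculateWeightedScore,
  DEFAULT_CRITERIA_WEIGHTS,
} from "@/components/utils/ai-eval-schemas";

// A raw judge reply, e.g. extracted from the judge model's JSON output.
const raw = {
  scores: { accuracy: 8, relevance: 9, clarity: 7, completeness: 8, conciseness: 7 },
  overallScore: 7.8,
  reasoning: "Accurate and relevant; slightly verbose.",
};

const parsed = JudgeEvaluationSchema.safeParse(raw);
if (parsed.success) {
  // ai-eval-judge.ts recomputes the overall score with the user's weights:
  const weighted = calculateWeightedScore(parsed.data.scores, DEFAULT_CRITERIA_WEIGHTS);
  console.log(weighted); // 7.9 with the default 0.25/0.25/0.2/0.15/0.15 weights
}
```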
+export const JudgeEvaluationSchema = z.object({ + scores: ScoreBreakdownSchema, + overallScore: z.number().min(1).max(10), + reasoning: z.string().max(1000), + winner: z.enum(["A", "B", "tie"]).optional(), +}); + +// ============================================================================ +// Default Values +// ============================================================================ + +export const DEFAULT_CRITERIA_WEIGHTS: CriteriaWeights = { + accuracy: 0.25, + relevance: 0.25, + clarity: 0.2, + completeness: 0.15, + conciseness: 0.15, +}; + +export const DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant."; + +export const DEFAULT_USER_PROMPT = "{{question}}"; + +// ============================================================================ +// Provider Configurations +// ============================================================================ + +export const PROVIDERS: ProviderConfig[] = [ + { + id: "openai", + name: "OpenAI", + apiEndpoint: "https://api.openai.com/v1/chat/completions", + models: [ + { + id: "gpt-4o", + name: "GPT-4o", + providerId: "openai", + maxTokens: 4096, + supportsJsonMode: true, + }, + { + id: "gpt-4o-mini", + name: "GPT-4o Mini", + providerId: "openai", + maxTokens: 4096, + supportsJsonMode: true, + }, + { + id: "gpt-4-turbo", + name: "GPT-4 Turbo", + providerId: "openai", + maxTokens: 4096, + supportsJsonMode: true, + }, + { + id: "gpt-3.5-turbo", + name: "GPT-3.5 Turbo", + providerId: "openai", + maxTokens: 4096, + supportsJsonMode: true, + }, + ], + }, + { + id: "anthropic", + name: "Anthropic", + apiEndpoint: "https://api.anthropic.com/v1/messages", + models: [ + { + id: "claude-3-5-sonnet-20241022", + name: "Claude 3.5 Sonnet", + providerId: "anthropic", + maxTokens: 4096, + supportsJsonMode: false, + }, + { + id: "claude-3-5-haiku-20241022", + name: "Claude 3.5 Haiku", + providerId: "anthropic", + maxTokens: 4096, + supportsJsonMode: false, + }, + { + id: "claude-3-opus-20240229", + name: "Claude 3 Opus", + providerId: "anthropic", + maxTokens: 4096, + supportsJsonMode: false, + }, + ], + }, + { + id: "google", + name: "Google AI", + apiEndpoint: "https://generativelanguage.googleapis.com/v1beta/models", + models: [ + { + id: "gemini-2.0-flash-exp", + name: "Gemini 2.0 Flash", + providerId: "google", + maxTokens: 8192, + supportsJsonMode: true, + }, + { + id: "gemini-1.5-pro", + name: "Gemini 1.5 Pro", + providerId: "google", + maxTokens: 8192, + supportsJsonMode: true, + }, + { + id: "gemini-1.5-flash", + name: "Gemini 1.5 Flash", + providerId: "google", + maxTokens: 8192, + supportsJsonMode: true, + }, + ], + }, +]; + +// Helper to get all models flat +export const ALL_MODELS: ModelConfig[] = PROVIDERS.flatMap((p) => p.models); + +// Helper to find model by id +export function getModelById(modelId: string): ModelConfig | undefined { + return ALL_MODELS.find((m) => m.id === modelId); +} + +// Helper to find provider by id +export function getProviderById(providerId: ProviderId): ProviderConfig | undefined { + return PROVIDERS.find((p) => p.id === providerId); +} + +// Helper to get provider for a model +export function getProviderForModel(modelId: string): ProviderConfig | undefined { + const model = getModelById(modelId); + if (!model) return undefined; + return getProviderById(model.providerId); +} + +// ============================================================================ +// Variable Extraction +// ============================================================================ + +/** + * Extract variable names from a prompt 
template + * Variables are in the format {{variableName}} + */ +export function extractVariables(template: string): string[] { + const regex = /\{\{(\w+)\}\}/g; + const variables: string[] = []; + let match; + while ((match = regex.exec(template)) !== null) { + if (!variables.includes(match[1])) { + variables.push(match[1]); + } + } + return variables; +} + +/** + * Resolve variables in a template + */ +export function resolveTemplate( + template: string, + variables: Record +): string { + return template.replace(/\{\{(\w+)\}\}/g, (_, varName) => { + return variables[varName] ?? `{{${varName}}}`; + }); +} + +// ============================================================================ +// Score Utilities +// ============================================================================ + +/** + * Calculate weighted overall score from individual scores + */ +export function calculateWeightedScore( + scores: ScoreBreakdown, + weights: CriteriaWeights +): number { + const total = + scores.accuracy * weights.accuracy + + scores.relevance * weights.relevance + + scores.clarity * weights.clarity + + scores.completeness * weights.completeness + + scores.conciseness * weights.conciseness; + + // Round to 1 decimal place + return Math.round(total * 10) / 10; +} + +/** + * Get score color class based on score value + */ +export function getScoreColorClass(score: number): string { + if (score >= 8) return "text-green-600 dark:text-green-400"; + if (score >= 5) return "text-yellow-600 dark:text-yellow-400"; + return "text-red-600 dark:text-red-400"; +} + +/** + * Get score background color class based on score value + */ +export function getScoreBgClass(score: number): string { + if (score >= 8) return "bg-green-100 dark:bg-green-900/30"; + if (score >= 5) return "bg-yellow-100 dark:bg-yellow-900/30"; + return "bg-red-100 dark:bg-red-900/30"; +} diff --git a/components/utils/tools-list.ts b/components/utils/tools-list.ts index aa8ef64..f0055d5 100644 --- a/components/utils/tools-list.ts +++ b/components/utils/tools-list.ts @@ -1,4 +1,10 @@ export const tools = [ + { + title: "AI Eval Playground", + description: + "Compare AI models and prompts side-by-side with LLM-as-judge scoring. Evaluate GPT-4, Claude, Gemini responses. 
BYOK - keys stay in your browser.", + link: "/utilities/ai-eval", + }, { title: "CSV to JSON", description: diff --git a/package-lock.json b/package-lock.json index 9879cb3..2777321 100644 --- a/package-lock.json +++ b/package-lock.json @@ -34,7 +34,8 @@ "react-dom": "^18", "react-syntax-highlighter": "^15.5.0", "tailwind-merge": "^2.4.0", - "tailwindcss-animate": "^1.0.7" + "tailwindcss-animate": "^1.0.7", + "zod": "^4.3.6" }, "devDependencies": { "@testing-library/jest-dom": "^6.4.8", @@ -13697,6 +13698,15 @@ "funding": { "url": "https://github.com/sponsors/sindresorhus" } + }, + "node_modules/zod": { + "version": "4.3.6", + "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz", + "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } } } } diff --git a/package.json b/package.json index a6b44bd..408f439 100644 --- a/package.json +++ b/package.json @@ -38,7 +38,8 @@ "react-dom": "^18", "react-syntax-highlighter": "^15.5.0", "tailwind-merge": "^2.4.0", - "tailwindcss-animate": "^1.0.7" + "tailwindcss-animate": "^1.0.7", + "zod": "^4.3.6" }, "devDependencies": { "@testing-library/jest-dom": "^6.4.8", diff --git a/pages/utilities/ai-eval.tsx b/pages/utilities/ai-eval.tsx new file mode 100644 index 0000000..144363f --- /dev/null +++ b/pages/utilities/ai-eval.tsx @@ -0,0 +1,557 @@ +import { useState, useCallback, useMemo } from "react"; +import PageHeader from "@/components/PageHeader"; +import { Button } from "@/components/ds/ButtonComponent"; +import Header from "@/components/Header"; +import { CMDK } from "@/components/CMDK"; +import Meta from "@/components/Meta"; +import CallToActionGrid from "@/components/CallToActionGrid"; +import { + ComparisonMode, + ModelConfig, + CriteriaWeights, + DEFAULT_CRITERIA_WEIGHTS, + JudgeEvaluation, +} from "@/components/utils/ai-eval-schemas"; +import { chat } from "@/components/utils/ai-eval-providers"; +import { + judgeSingleResponse, + judgeCompareResponses, +} from "@/components/utils/ai-eval-judge"; +import { useApiKeys } from "@/components/hooks/useApiKeys"; +import { ApiKeyDialog } from "@/components/ai-eval/ApiKeyDialog"; +import { EvalModelSelector } from "@/components/ai-eval/EvalModelSelector"; +import { EvalResultCard } from "@/components/ai-eval/EvalResultCard"; +import { EvalJudgeConfig } from "@/components/ai-eval/EvalJudgeConfig"; +import { Play, Loader2, Settings2 } from "lucide-react"; + +interface ResultData { + id: string; + modelId: string; + promptId: string; + output: string | null; + evaluation: JudgeEvaluation | null; + isLoading: boolean; + error: string | null; + latencyMs?: number; +} + +export default function AIEval() { + const apiKeys = useApiKeys(); + + // Mode toggle + const [mode, setMode] = useState("model-vs-model"); + + // Prompt configuration + const [systemPrompt, setSystemPrompt] = useState("You are a helpful assistant."); + const [userPrompt, setUserPrompt] = useState(""); + + // For prompt-vs-prompt mode + const [promptVariants, setPromptVariants] = useState(["", ""]); + + // Model selection + const [selectedModels, setSelectedModels] = useState([]); + const [singleModel, setSingleModel] = useState(null); + + // Judge settings + const [judgeModel, setJudgeModel] = useState(null); + const [criteriaWeights, setCriteriaWeights] = useState( + DEFAULT_CRITERIA_WEIGHTS + ); + const [autoEvaluate, setAutoEvaluate] = useState(true); + const [showJudgeSettings, 
setShowJudgeSettings] = useState(false); + + // Results state + const [results, setResults] = useState([]); + const [isRunning, setIsRunning] = useState(false); + const [comparisonReasoning, setComparisonReasoning] = useState(""); + + // Determine winners + const winnerIds = useMemo(() => { + const completed = results.filter((r) => r.evaluation && !r.isLoading && !r.error); + if (completed.length < 2) return []; + const maxScore = Math.max(...completed.map((r) => r.evaluation?.overallScore ?? 0)); + return completed + .filter((r) => (r.evaluation?.overallScore ?? 0) >= maxScore - 0.5) + .map((r) => r.id); + }, [results]); + + // Validate if we can run + const canRun = useMemo(() => { + if (isRunning) return false; + if (mode === "model-vs-model") { + if (!userPrompt.trim()) return false; + if (selectedModels.length < 2) return false; + for (const model of selectedModels) { + if (!apiKeys.hasKey(model.providerId)) return false; + } + } else { + if (!singleModel) return false; + if (!apiKeys.hasKey(singleModel.providerId)) return false; + if (promptVariants.filter((p) => p.trim()).length < 2) return false; + } + return true; + }, [isRunning, mode, userPrompt, selectedModels, singleModel, promptVariants, apiKeys]); + + // Run evaluation + const runEvaluation = useCallback(async () => { + if (!canRun) return; + setIsRunning(true); + setComparisonReasoning(""); + + const initialResults: ResultData[] = []; + + if (mode === "model-vs-model") { + for (const model of selectedModels) { + initialResults.push({ + id: model.id, + modelId: model.id, + promptId: "main", + output: null, + evaluation: null, + isLoading: true, + error: null, + }); + } + } else { + promptVariants.forEach((_, index) => { + if (promptVariants[index].trim()) { + initialResults.push({ + id: `prompt-${index}`, + modelId: singleModel!.id, + promptId: `prompt-${index}`, + output: null, + evaluation: null, + isLoading: true, + error: null, + }); + } + }); + } + + setResults(initialResults); + + // Run generations + const generationResults: ResultData[] = []; + + for (const result of initialResults) { + const model = + mode === "model-vs-model" + ? selectedModels.find((m) => m.id === result.modelId) + : singleModel; + + const prompt = + mode === "model-vs-model" + ? userPrompt + : promptVariants[parseInt(result.promptId.split("-")[1])]; + + if (!model || !prompt) continue; + + const apiKey = apiKeys.getKey(model.providerId); + if (!apiKey) { + generationResults.push({ + ...result, + isLoading: false, + error: `No API key for ${model.providerId}`, + }); + continue; + } + + const startTime = Date.now(); + + try { + const response = await chat(apiKey, { + model: model.id, + messages: [ + { role: "system", content: systemPrompt }, + { role: "user", content: prompt }, + ], + }); + + const latencyMs = Date.now() - startTime; + + generationResults.push({ + ...result, + output: response.content, + isLoading: false, + latencyMs, + }); + + setResults((prev) => + prev.map((r) => + r.id === result.id + ? { ...r, output: response.content, isLoading: false, latencyMs } + : r + ) + ); + } catch (error) { + generationResults.push({ + ...result, + isLoading: false, + error: error instanceof Error ? error.message : "Generation failed", + }); + + setResults((prev) => + prev.map((r) => + r.id === result.id + ? { + ...r, + isLoading: false, + error: error instanceof Error ? 
error.message : "Generation failed", + } + : r + ) + ); + } + } + + // Run judge evaluation + if (autoEvaluate && judgeModel) { + const judgeApiKey = apiKeys.getKey(judgeModel.providerId); + if (judgeApiKey) { + const successful = generationResults.filter((r) => r.output && !r.error); + + if (successful.length >= 2) { + try { + const fullPrompt = `${systemPrompt}\n\n${mode === "model-vs-model" ? userPrompt : promptVariants[0]}`; + + const comparison = await judgeCompareResponses({ + apiKey: judgeApiKey, + judgeModel: judgeModel.id, + originalPrompt: fullPrompt, + responseA: successful[0].output!, + responseB: successful[1].output!, + weights: criteriaWeights, + }); + + setComparisonReasoning(comparison.comparisonReasoning); + + setResults((prev) => + prev.map((r) => { + if (r.id === successful[0].id) return { ...r, evaluation: comparison.evaluationA }; + if (r.id === successful[1].id) return { ...r, evaluation: comparison.evaluationB }; + return r; + }) + ); + + // Evaluate remaining + for (let i = 2; i < successful.length; i++) { + const result = successful[i]; + try { + const evaluation = await judgeSingleResponse({ + apiKey: judgeApiKey, + judgeModel: judgeModel.id, + originalPrompt: fullPrompt, + response: result.output!, + weights: criteriaWeights, + }); + setResults((prev) => + prev.map((r) => (r.id === result.id ? { ...r, evaluation } : r)) + ); + } catch (e) { + console.error("Judge failed:", e); + } + } + } catch (e) { + console.error("Comparison failed:", e); + } + } + } + } + + setIsRunning(false); + }, [ + canRun, + mode, + selectedModels, + singleModel, + systemPrompt, + userPrompt, + promptVariants, + apiKeys, + autoEvaluate, + judgeModel, + criteriaWeights, + ]); + + const addPromptVariant = () => { + if (promptVariants.length < 4) { + setPromptVariants([...promptVariants, ""]); + } + }; + + const removePromptVariant = (index: number) => { + if (promptVariants.length > 2) { + setPromptVariants(promptVariants.filter((_, i) => i !== index)); + } + }; + + const updatePromptVariant = (index: number, value: string) => { + setPromptVariants(promptVariants.map((p, i) => (i === index ? value : p))); + }; + + return ( +
+ +
+ + +
+ {/* Header */} +
+ +
+ + {/* Toolbar */} +
+
+ + +
+ +
+ + +
+
+ + {/* Judge Settings Panel (collapsible) */} + {showJudgeSettings && ( +
+ +
+ )} + + {/* Main Content */} +
+ {/* System Prompt - Full Width */} +
+
+ + System Prompt + +
+
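For context on how the pieces compose: the page drives every provider through the unified `chat()` entry point in `ai-eval-providers.ts`, which routes each model id to its adapter via `getAdapterForModel`. A minimal sketch of a model-vs-model call outside the UI (placeholder keys; the page itself runs generations sequentially and then invokes the judge):

```ts
import { chat } from "@/components/utils/ai-eval-providers";
import { ChatMessage } from "@/components/utils/ai-eval-schemas";

async function compareOnce(openaiKey: string, anthropicKey: string) {
  const messages: ChatMessage[] = [
    { role: "system", content: "You are a helpful assistant." },
    { role: "user", content: "Explain CORS in two sentences." },
  ];

  // Same messages, two models; each call is routed to the matching
  // provider adapter, so the call site never branches on request formats.
  const a = await chat(openaiKey, { model: "gpt-4o-mini", messages });
  const b = await chat(anthropicKey, { model: "claude-3-5-haiku-20241022", messages });
  return [a.content, b.content];
}
```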