From 5244c2a9bfdad587348616902385038893444750 Mon Sep 17 00:00:00 2001
From: Berk Durmus <berkdurmus@yahoo.com>
Date: Sun, 25 Jan 2026 05:07:44 +0300
Subject: [PATCH 1/2] feat(evals): add AI Eval Playground for comparing models
 and prompts

---
 components/ai-eval/ApiKeyDialog.tsx       | 231 +++++++++++
 components/ai-eval/EvalComparisonGrid.tsx | 154 ++++++++
 components/ai-eval/EvalConfigPanel.tsx    | 265 +++++++++++++
 components/ai-eval/EvalJudgePanel.tsx     | 186 +++++++++
 components/ai-eval/EvalModelSelector.tsx  | 204 ++++++++++
 components/ai-eval/EvalResultCell.tsx     | 149 +++++++
 components/ai-eval/EvalScoreDisplay.tsx   | 151 +++++++
 components/hooks/useApiKeys.ts            |  99 +++++
 components/utils/ai-eval-judge.ts         | 362 +++++++++++++++++
 components/utils/ai-eval-providers.ts     | 309 +++++++++++++++
 components/utils/ai-eval-schemas.test.ts  | 354 +++++++++++++++++
 components/utils/ai-eval-schemas.ts       | 354 +++++++++++++++++
 components/utils/tools-list.ts            |   6 +
 package-lock.json                         |  12 +-
 package.json                              |   3 +-
 pages/utilities/ai-eval.tsx               | 460 ++++++++++++++++++++++
 16 files changed, 3297 insertions(+), 2 deletions(-)
 create mode 100644 components/ai-eval/ApiKeyDialog.tsx
 create mode 100644 components/ai-eval/EvalComparisonGrid.tsx
 create mode 100644 components/ai-eval/EvalConfigPanel.tsx
 create mode 100644 components/ai-eval/EvalJudgePanel.tsx
 create mode 100644 components/ai-eval/EvalModelSelector.tsx
 create mode 100644 components/ai-eval/EvalResultCell.tsx
 create mode 100644 components/ai-eval/EvalScoreDisplay.tsx
 create mode 100644 components/hooks/useApiKeys.ts
 create mode 100644 components/utils/ai-eval-judge.ts
 create mode 100644 components/utils/ai-eval-providers.ts
 create mode 100644 components/utils/ai-eval-schemas.test.ts
 create mode 100644 components/utils/ai-eval-schemas.ts
 create mode 100644 pages/utilities/ai-eval.tsx
diff --git a/components/ai-eval/ApiKeyDialog.tsx b/components/ai-eval/ApiKeyDialog.tsx
new file mode 100644
index 0000000..d68c2ed
--- /dev/null
+++ b/components/ai-eval/ApiKeyDialog.tsx
@@ -0,0 +1,231 @@
+import { useState, useCallback } from "react";
+import {
+  Dialog,
+  DialogContent,
+  DialogHeader,
+  DialogTitle,
+  DialogDescription,
+  DialogTrigger,
+} from "@/components/ds/DialogComponent";
+import { Button } from "@/components/ds/ButtonComponent";
+import { Input } from "@/components/ds/InputComponent";
+import { Label } from "@/components/ds/LabelComponent";
+import { PROVIDERS, ProviderId } from "@/components/utils/ai-eval-schemas";
+import { UseApiKeysReturn } from "@/components/hooks/useApiKeys";
+import { Key, Check, X, Loader2, Eye, EyeOff } from "lucide-react";
+
+interface ApiKeyDialogProps {
+  apiKeys: UseApiKeysReturn; // key store: queried with hasKey() and handed to every provider row
+  children?: React.ReactNode; // optional custom trigger; the default "API Keys" button is used when omitted
+}
+
+interface ProviderKeyInputProps {
+  providerId: ProviderId; // identifier passed to every apiKeys call made by this row
+  providerName: string; // display name used for the label and the input placeholder
+  apiKeys: UseApiKeysReturn; // key store; this row uses getKey/hasKey/setKey/removeKey/testKey
+}
+
+/**
+ * One row of the API-key dialog: a masked input plus Test/Remove actions for a
+ * single provider. The trimmed key is stored through `apiKeys` on blur and
+ * before each test; the badge shows Configured, Valid or Invalid.
+ */
+function ProviderKeyInput({
+  providerId,
+  providerName,
+  apiKeys,
+}: ProviderKeyInputProps) {
+  const [value, setValue] = useState(apiKeys.getKey(providerId) || "");
+  const [showKey, setShowKey] = useState(false);
+  const [testing, setTesting] = useState(false);
+  const [testResult, setTestResult] = useState<boolean | null>(null);
+
+  const hasKey = apiKeys.hasKey(providerId);
+
+  const handleSave = useCallback(() => {
+    if (value.trim()) {
+      apiKeys.setKey(providerId, value.trim());
+      setTestResult(null);
+    }
+  }, [apiKeys, providerId, value]);
+
+  const handleRemove = useCallback(() => {
+    apiKeys.removeKey(providerId);
+    setValue("");
+    setTestResult(null);
+  }, [apiKeys, providerId]);
+
+  const handleTest = useCallback(async () => {
+    if (!value.trim()) return;
+
+    // Save first so the test runs against what is currently typed.
+    // NOTE(review): assumes testKey() already sees the key set just above - confirm.
+    apiKeys.setKey(providerId, value.trim());
+
+    setTesting(true);
+    setTestResult(null);
+
+    try {
+      const result = await apiKeys.testKey(providerId);
+      setTestResult(result);
+    } catch {
+      setTestResult(false);
+    } finally {
+      setTesting(false);
+    }
+  }, [apiKeys, providerId, value]);
+
+  return (
+    <div className="space-y-2">
+      <div className="flex items-center justify-between">
+        <Label className="text-sm font-medium">{providerName}</Label>
+        <div className="flex items-center gap-1">
+          {hasKey && testResult === null && (
+            <span className="text-xs text-muted-foreground flex items-center gap-1">
+              <Check className="h-3 w-3 text-green-500" />
+              Configured
+            </span>
+          )}
+          {testResult === true && (
+            <span className="text-xs text-green-600 dark:text-green-400 flex items-center gap-1">
+              <Check className="h-3 w-3" />
+              Valid
+            </span>
+          )}
+          {testResult === false && (
+            <span className="text-xs text-red-600 dark:text-red-400 flex items-center gap-1">
+              <X className="h-3 w-3" />
+              Invalid
+            </span>
+          )}
+        </div>
+      </div>
+
+      <div className="flex gap-2">
+        <div className="relative flex-1">
+          <Input
+            type={showKey ? "text" : "password"}
+            placeholder={`Enter ${providerName} API key`}
+            value={value}
+            onChange={(e) => {
+              setValue(e.target.value);
+              setTestResult(null);
+            }}
+            onBlur={handleSave}
+            className="pr-10 font-mono text-xs"
+          />
+          <button
+            type="button"
+            aria-label="Show API key"
+            aria-pressed={showKey}
+            onClick={() => setShowKey((shown) => !shown)}
+            className="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground transition-colors"
+          >
+            {showKey ? <EyeOff className="h-4 w-4" /> : <Eye className="h-4 w-4" />}
+          </button>
+        </div>
+
+        <Button
+          variant="outline"
+          size="sm"
+          onClick={handleTest}
+          disabled={!value.trim() || testing}
+          className="shrink-0"
+        >
+          {testing ? <Loader2 className="h-4 w-4 animate-spin" /> : "Test"}
+        </Button>
+
+        {hasKey && (
+          <Button
+            variant="outline"
+            size="sm"
+            onClick={handleRemove}
+            className="shrink-0 text-red-600 hover:text-red-700 hover:bg-red-50 dark:hover:bg-red-950"
+          >
+            Remove
+          </Button>
+        )}
+      </div>
+    </div>
+  );
+}
+/** Dialog for entering per-provider API keys; the default trigger shows how many providers have a key. */
+export function ApiKeyDialog({ apiKeys, children }: ApiKeyDialogProps) {
+  const configuredCount = PROVIDERS.filter((p) =>
+    apiKeys.hasKey(p.id)
+  ).length;
+  // A caller-supplied trigger (children) replaces the default button and its count badge.
+  return (
+    <Dialog>
+      <DialogTrigger asChild>
+        {children || (
+          <Button variant="outline" size="sm" className="gap-2">
+            <Key className="h-4 w-4" />
+            API Keys
+            {configuredCount > 0 && (
+              <span className="ml-1 bg-green-100 dark:bg-green-900/50 text-green-700 dark:text-green-300 text-xs px-1.5 py-0.5 rounded-full">
+                {configuredCount}
+              </span>
+            )}
+          </Button>
+        )}
+      </DialogTrigger>
+
+      <DialogContent className="sm:max-w-md">
+        <DialogHeader>
+          <DialogTitle className="flex items-center gap-2">
+            <Key className="h-5 w-5" />
+            API Keys
+          </DialogTitle>
+          <DialogDescription>
+            Your API keys are stored in session storage and cleared when you
+            close the browser. Keys never leave your browser.
+          </DialogDescription>
+        </DialogHeader>
+
+        <div className="space-y-6 mt-4">
+          {PROVIDERS.map((provider) => (
+            <ProviderKeyInput
+              key={provider.id}
+              providerId={provider.id}
+              providerName={provider.name}
+              apiKeys={apiKeys}
+            />
+          ))}
+        </div>
+
+        <div className="mt-6 pt-4 border-t border-border">
+          <p className="text-xs text-muted-foreground">
+            Need API keys?{" "}
+            <a
+              href="https://platform.openai.com/api-keys"
+              target="_blank"
+              rel="noopener noreferrer"
+              className="text-primary hover:underline"
+            >
+              OpenAI
+            </a>{" "}
+            ·{" "}
+            <a
+              href="https://console.anthropic.com/settings/keys"
+              target="_blank"
+              rel="noopener noreferrer"
+              className="text-primary hover:underline"
+            >
+              Anthropic
+            </a>{" "}
+            ·{" "}
+            <a
+              href="https://aistudio.google.com/app/apikey"
+              target="_blank"
+              rel="noopener noreferrer"
+              className="text-primary hover:underline"
+            >
+              Google AI
+            </a>
+          </p>
+        </div>
+      </DialogContent>
+    </Dialog>
+  );
+}
diff --git a/components/ai-eval/EvalComparisonGrid.tsx b/components/ai-eval/EvalComparisonGrid.tsx
new file mode 100644
index 0000000..ca4ef7b
--- /dev/null
+++ b/components/ai-eval/EvalComparisonGrid.tsx
@@ -0,0 +1,154 @@
+import {
+  ModelConfig,
+  JudgeEvaluation,
+  ComparisonMode,
+} from "@/components/utils/ai-eval-schemas";
+import { EvalResultCell } from "./EvalResultCell";
+
+interface ResultData {
+  id: string; // compared against winnerIds to flag winning results
+  modelId: string; // matched to ModelConfig.id in model-vs-model mode; shown in the summary
+  promptId: string; // distinct values become the columns outside model-vs-model mode
+  output: string | null;
+  evaluation: JudgeEvaluation | null;
+  isLoading: boolean;
+  error: string | null;
+  latencyMs?: number;
+}
+
+interface EvalComparisonGridProps {
+  mode: ComparisonMode; // "model-vs-model" => one column per model; otherwise one per promptId
+  models: ModelConfig[]; // only models[0] is used outside model-vs-model mode
+  results: ResultData[];
+  winnerIds: string[]; // ResultData.id values rendered as winners
+}
+
+// Tailwind column classes for 2 or 3 cells; any other count uses the 2/4 layout.
+const GRID_COLUMNS: Record<number, string> = { 2: "grid-cols-2", 3: "grid-cols-3" };
+const gridColumnClasses = (count: number): string =>
+  GRID_COLUMNS[count] ?? "grid-cols-2 lg:grid-cols-4";
+
+/**
+ * Lays out one EvalResultCell per model ("model-vs-model") or per distinct
+ * promptId found in `results` (any other mode).
+ */
+export function EvalComparisonGrid({
+  mode,
+  models,
+  results,
+  winnerIds,
+}: EvalComparisonGridProps) {
+  // Shared cell renderer; a missing result renders as an empty, idle cell.
+  const renderCell = (
+    key: string,
+    label: string,
+    model: ModelConfig,
+    result: ResultData | undefined
+  ) => (
+    <EvalResultCell
+      key={key}
+      label={label}
+      model={model}
+      output={result?.output ?? null}
+      evaluation={result?.evaluation ?? null}
+      isLoading={result?.isLoading ?? false}
+      error={result?.error ?? null}
+      isWinner={result ? winnerIds.includes(result.id) : false}
+      latencyMs={result?.latencyMs}
+    />
+  );
+
+  if (mode === "model-vs-model") {
+    // One column per model, labelled "Model A", "Model B", ...
+    return (
+      <div className={`grid ${gridColumnClasses(models.length)} gap-4`}>
+        {models.map((model, index) =>
+          renderCell(
+            model.id,
+            `Model ${String.fromCharCode(65 + index)}`,
+            model,
+            results.find((r) => r.modelId === model.id)
+          )
+        )}
+      </div>
+    );
+  }
+
+  // One column per distinct promptId, all rendered against the first model.
+  const promptIds = Array.from(new Set(results.map((r) => r.promptId)));
+  const sharedModel = models[0];
+
+  return (
+    <div className={`grid ${gridColumnClasses(promptIds.length)} gap-4`}>
+      {promptIds.map((promptId, index) =>
+        renderCell(
+          promptId,
+          `Prompt ${String.fromCharCode(65 + index)}`,
+          sharedModel,
+          results.find((r) => r.promptId === promptId)
+        )
+      )}
+    </div>
+  );
+}
+
+// Summary component for showing overall comparison results
+interface ComparisonSummaryProps {
+  results: ResultData[]; // only entries with output and evaluation that are not loading are ranked
+  winnerIds: string[]; // ResultData.id values highlighted in green
+  comparisonReasoning?: string; // optional text shown under the ranking
+}
+
+/**
+ * Ranks finished results by overall score (highest first) and highlights the
+ * winners. A result counts as finished when it has output and an evaluation
+ * and is not loading; nothing is rendered until two results are finished.
+ */
+export function ComparisonSummary({
+  results,
+  winnerIds,
+  comparisonReasoning,
+}: ComparisonSummaryProps) {
+  const scoreOf = (r: ResultData) => r.evaluation?.overallScore ?? 0;
+  // filter() returns a fresh array, so sorting it in place leaves `results` alone.
+  const ranked = results
+    .filter((r) => r.output && r.evaluation && !r.isLoading)
+    .sort((a, b) => scoreOf(b) - scoreOf(a));
+
+  if (ranked.length < 2) {
+    return null;
+  }
+
+  return (
+    <div className="bg-muted/50 rounded-xl p-4 space-y-3">
+      <h3 className="text-sm font-semibold">Comparison Summary</h3>
+
+      <div className="flex items-center gap-4 text-sm">
+        {ranked.map((result, rank) => {
+          // Winners are tinted green; everything else stays muted.
+          const tone = winnerIds.includes(result.id)
+            ? "text-green-600 dark:text-green-400 font-medium"
+            : "text-muted-foreground";
+          return (
+            <div
+              key={result.id}
+              className={`flex items-center gap-2 ${tone}`}
+            >
+              <span className="font-mono">#{rank + 1}</span>
+              <span>{result.modelId}</span>
+              <span className="font-semibold">
+                {result.evaluation?.overallScore.toFixed(1)}
+              </span>
+            </div>
+          );
+        })}
+      </div>
+
+      {comparisonReasoning && (
+        <p className="text-xs text-muted-foreground border-t border-border pt-3">
+          {comparisonReasoning}
+        </p>
+      )}
+    </div>
+  );
+}
diff --git a/components/ai-eval/EvalConfigPanel.tsx b/components/ai-eval/EvalConfigPanel.tsx
new file mode 100644
index 0000000..d36ee88
--- /dev/null
+++ b/components/ai-eval/EvalConfigPanel.tsx
@@ -0,0 +1,265 @@
+import { useState, useEffect, useCallback, useMemo } from "react";
+import { Card } from "@/components/ds/CardComponent";
+import { Label } from "@/components/ds/LabelComponent";
+import { Textarea } from "@/components/ds/TextareaComponent";
+import { Input } from "@/components/ds/InputComponent";
+import {
+  extractVariables,
+  DEFAULT_SYSTEM_PROMPT,
+  DEFAULT_USER_PROMPT,
+  ComparisonMode,
+} from "@/components/utils/ai-eval-schemas";
+import { Plus, Trash2 } from "lucide-react";
+import { Button } from "@/components/ds/ButtonComponent";
+
+interface PromptVariant {
+  id: string; // "prompt-1" for the initial variant, `prompt-${Date.now()}` for added ones
+  systemPrompt: string; // scanned by extractVariables for variable names
+  userPrompt: string; // scanned by extractVariables for variable names
+}
+
+interface EvalConfigPanelProps {
+  mode: ComparisonMode; // "model-vs-model" shows one prompt editor; otherwise one editor per variant
+  onPromptsChange: (prompts: PromptVariant[]) => void; // invoked from an effect keyed on [prompts, onPromptsChange]
+  onVariablesChange: (variables: Record<string, string>) => void; // invoked from an effect keyed on [variables, onVariablesChange]
+}
+
+/**
+ * Prompt editor: a single prompt in "model-vs-model" mode, otherwise up to four
+ * variants. Reports prompts and {{variable}} values to the parent via effects.
+ */
+export function EvalConfigPanel({
+  mode,
+  onPromptsChange,
+  onVariablesChange,
+}: EvalConfigPanelProps) {
+  const [prompts, setPrompts] = useState<PromptVariant[]>([
+    {
+      id: "prompt-1",
+      systemPrompt: DEFAULT_SYSTEM_PROMPT,
+      userPrompt: DEFAULT_USER_PROMPT,
+    },
+  ]);
+
+  const [variables, setVariables] = useState<Record<string, string>>({
+    question: "",
+  });
+
+  // Extract all variables from all prompts
+  const allVariables = useMemo(() => {
+    const vars = new Set<string>();
+    prompts.forEach((p) => {
+      extractVariables(p.systemPrompt).forEach((v) => vars.add(v));
+      extractVariables(p.userPrompt).forEach((v) => vars.add(v));
+    });
+    return Array.from(vars);
+  }, [prompts]);
+
+  // Sync variable values with the detected names; keep `prev` when nothing changed
+  useEffect(() => {
+    setVariables((prev) => {
+      const owns = (name: string) =>
+        Object.prototype.hasOwnProperty.call(prev, name);
+      const unchanged =
+        Object.keys(prev).length === allVariables.length &&
+        allVariables.every(owns);
+      if (unchanged) return prev;
+      const next: Record<string, string> = {};
+      allVariables.forEach((v) => {
+        next[v] = owns(v) ? prev[v] : "";
+      });
+      return next;
+    });
+  }, [allVariables]);
+
+  // Notify parent of changes
+  useEffect(() => {
+    onPromptsChange(prompts);
+  }, [prompts, onPromptsChange]);
+
+  useEffect(() => {
+    onVariablesChange(variables);
+  }, [variables, onVariablesChange]);
+
+  const updatePrompt = useCallback(
+    (id: string, field: "systemPrompt" | "userPrompt", value: string) => {
+      setPrompts((prev) =>
+        prev.map((p) => (p.id === id ? { ...p, [field]: value } : p))
+      );
+    },
+    []
+  );
+
+  const addPromptVariant = useCallback(() => {
+    const newId = `prompt-${Date.now()}`;
+    setPrompts((prev) => [
+      ...prev,
+      {
+        id: newId,
+        // `??` so an intentionally empty prompt in the first variant is copied as-is
+        systemPrompt: prev[0]?.systemPrompt ?? DEFAULT_SYSTEM_PROMPT,
+        userPrompt: prev[0]?.userPrompt ?? DEFAULT_USER_PROMPT,
+      },
+    ]);
+  }, []);
+
+  const removePromptVariant = useCallback((id: string) => {
+    setPrompts((prev) => {
+      if (prev.length <= 1) return prev;
+      return prev.filter((p) => p.id !== id);
+    });
+  }, []);
+
+  const updateVariable = useCallback((name: string, value: string) => {
+    setVariables((prev) => ({ ...prev, [name]: value }));
+  }, []);
+
+  // Editor rows for every detected {{variable}}; rendered by both layouts
+  const variableInputs = (
+    <div className="space-y-3">
+      {allVariables.map((varName) => (
+        <div key={varName} className="flex items-center gap-3">
+          <code className="text-xs bg-muted px-2 py-1 rounded font-mono min-w-[100px]">
+            {"{{"}{varName}{"}}"}
+          </code>
+          <Input
+            placeholder={`Enter value for ${varName}`}
+            value={variables[varName] || ""}
+            onChange={(e) => updateVariable(varName, e.target.value)}
+            className="flex-1"
+          />
+        </div>
+      ))}
+    </div>
+  );
+
+  // In model-vs-model mode, show single prompt config
+  if (mode === "model-vs-model") {
+    const prompt = prompts[0];
+    return (
+      <Card className="p-6 hover:shadow-none shadow-none rounded-xl">
+        <div className="space-y-6">
+          <div>
+            <Label className="mb-2 block text-sm font-medium">
+              System Prompt
+            </Label>
+            <Textarea
+              rows={3}
+              placeholder="You are a helpful assistant..."
+              value={prompt.systemPrompt}
+              onChange={(e) =>
+                updatePrompt(prompt.id, "systemPrompt", e.target.value)
+              }
+              className="font-mono text-sm"
+            />
+          </div>
+
+          <div>
+            <Label className="mb-2 block text-sm font-medium">
+              User Prompt
+            </Label>
+            <Textarea
+              rows={4}
+              placeholder="Enter your prompt here. Use {{variable}} for dynamic values."
+              value={prompt.userPrompt}
+              onChange={(e) =>
+                updatePrompt(prompt.id, "userPrompt", e.target.value)
+              }
+              className="font-mono text-sm"
+            />
+            <p className="mt-1 text-xs text-muted-foreground">
+              Use {"{{variableName}}"} to create dynamic inputs
+            </p>
+          </div>
+
+          {allVariables.length > 0 && (
+            <div>
+              <Label className="mb-3 block text-sm font-medium">Variables</Label>
+              {variableInputs}
+            </div>
+          )}
+        </div>
+      </Card>
+    );
+  }
+
+  // In prompt-vs-prompt mode, show multiple prompt editors
+  return (
+    <div className="space-y-4">
+      {prompts.map((prompt, index) => (
+        <Card
+          key={prompt.id}
+          className="p-6 hover:shadow-none shadow-none rounded-xl"
+        >
+          <div className="flex items-center justify-between mb-4">
+            <h3 className="text-sm font-semibold">
+              Prompt Variant {String.fromCharCode(65 + index)}
+            </h3>
+            {prompts.length > 1 && (
+              <Button
+                variant="ghost"
+                size="sm"
+                onClick={() => removePromptVariant(prompt.id)}
+                className="text-muted-foreground hover:text-destructive"
+              >
+                <Trash2 className="h-4 w-4" />
+              </Button>
+            )}
+          </div>
+
+          <div className="space-y-4">
+            <div>
+              <Label className="mb-2 block text-xs font-medium text-muted-foreground">
+                System Prompt
+              </Label>
+              <Textarea
+                rows={2}
+                placeholder="You are a helpful assistant..."
+                value={prompt.systemPrompt}
+                onChange={(e) =>
+                  updatePrompt(prompt.id, "systemPrompt", e.target.value)
+                }
+                className="font-mono text-sm"
+              />
+            </div>
+
+            <div>
+              <Label className="mb-2 block text-xs font-medium text-muted-foreground">
+                User Prompt
+              </Label>
+              <Textarea
+                rows={3}
+                placeholder="Enter your prompt variant here..."
+                value={prompt.userPrompt}
+                onChange={(e) =>
+                  updatePrompt(prompt.id, "userPrompt", e.target.value)
+                }
+                className="font-mono text-sm"
+              />
+            </div>
+          </div>
+        </Card>
+      ))}
+
+      {prompts.length < 4 && (
+        <Button
+          variant="outline"
+          onClick={addPromptVariant}
+          className="w-full gap-2"
+        >
+          <Plus className="h-4 w-4" />
+          Add Prompt Variant
+        </Button>
+      )}
+
+      {allVariables.length > 0 && (
+        <Card className="p-6 hover:shadow-none shadow-none rounded-xl">
+          <Label className="mb-3 block text-sm font-medium">
+            Shared Variables
+          </Label>
+          {variableInputs}
+        </Card>
+      )}
+    </div>
+  );
+}
diff --git a/components/ai-eval/EvalJudgePanel.tsx b/components/ai-eval/EvalJudgePanel.tsx
new file mode 100644
index 0000000..953e494
--- /dev/null
+++ b/components/ai-eval/EvalJudgePanel.tsx
@@ -0,0 +1,186 @@
+import { useCallback, useMemo } from "react";
+import { Card } from "@/components/ds/CardComponent";
+import { Label } from "@/components/ds/LabelComponent";
+import { Checkbox } from "@/components/ds/CheckboxComponent";
+import { Slider } from "@/components/ds/SliderComponent";
+import {
+  ModelConfig,
+  CriteriaWeights,
+  DEFAULT_CRITERIA_WEIGHTS,
+} from "@/components/utils/ai-eval-schemas";
+import { EvalModelSelector } from "./EvalModelSelector";
+import { UseApiKeysReturn } from "@/components/hooks/useApiKeys";
+import { AlertCircle, Scale } from "lucide-react";
+
+interface EvalJudgePanelProps {
+  judgeModel: ModelConfig | null;
+  onJudgeModelChange: (model: ModelConfig | null) => void;
+  comparedModelIds: string[]; // EvalModelSelector's showWarning is set when this contains judgeModel.id
+  weights: CriteriaWeights; // fractions; displayed as percentages (value * 100)
+  onWeightsChange: (weights: CriteriaWeights) => void;
+  autoEvaluate: boolean;
+  onAutoEvaluateChange: (value: boolean) => void;
+  apiKeys: UseApiKeysReturn; // forwarded to EvalModelSelector
+}
+// Display metadata for each CriteriaWeights key, in the order the sliders are rendered.
+const CRITERIA_INFO: {
+  key: keyof CriteriaWeights;
+  label: string;
+  description: string;
+}[] = [
+  {
+    key: "accuracy",
+    label: "Accuracy",
+    description: "Factual correctness",
+  },
+  {
+    key: "relevance",
+    label: "Relevance",
+    description: "Addresses the prompt",
+  },
+  {
+    key: "clarity",
+    label: "Clarity",
+    description: "Well-organized and clear",
+  },
+  {
+    key: "completeness",
+    label: "Completeness",
+    description: "Comprehensive coverage",
+  },
+  {
+    key: "conciseness",
+    label: "Conciseness",
+    description: "Appropriately detailed",
+  },
+];
+/** Judge configuration card: judge model picker, auto-evaluate toggle and criteria weight sliders. */
+export function EvalJudgePanel({
+  judgeModel,
+  onJudgeModelChange,
+  comparedModelIds,
+  weights,
+  onWeightsChange,
+  autoEvaluate,
+  onAutoEvaluateChange,
+  apiKeys,
+}: EvalJudgePanelProps) {
+  // Check if judge model is same as compared models
+  const judgeMatchesCompared = useMemo(() => {
+    if (!judgeModel) return false;
+    return comparedModelIds.includes(judgeModel.id);
+  }, [judgeModel, comparedModelIds]);
+
+  // Calculate total weight percentage
+  const totalWeight = useMemo(() => {
+    return Object.values(weights).reduce((sum, w) => sum + w, 0);
+  }, [weights]);
+
+  const updateWeight = useCallback(
+    (key: keyof CriteriaWeights, value: number) => {
+      // Pin the dragged criterion at its chosen share and rescale only the
+      // others to fill the rest; renormalizing all of them moved the thumb.
+      const target = Math.min(Math.max(value / 100, 0), 1);
+      const newWeights = { ...weights, [key]: target };
+      const others = (Object.keys(weights) as (keyof CriteriaWeights)[]).filter(
+        (k) => k !== key
+      );
+      const othersTotal = others.reduce((sum, k) => sum + weights[k], 0);
+      others.forEach((k) => {
+        // With nothing to scale (all others 0), split the remainder evenly.
+        const share = othersTotal > 0 ? weights[k] / othersTotal : 1 / others.length;
+        newWeights[k] = share * (1 - target);
+      });
+
+      onWeightsChange(newWeights);
+    },
+    [weights, onWeightsChange]
+  );
+
+  const resetWeights = useCallback(() => {
+    onWeightsChange(DEFAULT_CRITERIA_WEIGHTS);
+  }, [onWeightsChange]);
+
+  return (
+    <Card className="p-6 hover:shadow-none shadow-none rounded-xl">
+      <div className="flex items-center gap-2 mb-4">
+        <Scale className="h-4 w-4 text-muted-foreground" />
+        <h3 className="text-sm font-semibold">Judge Settings</h3>
+      </div>
+
+      <div className="space-y-6">
+        {/* Judge Model Selection */}
+        <div>
+          <EvalModelSelector
+            value={judgeModel}
+            onChange={onJudgeModelChange}
+            apiKeys={apiKeys}
+            label="Judge Model"
+            showWarning={judgeMatchesCompared}
+            warningText="Using same model as compared - consider using a different judge"
+          />
+        </div>
+
+        {/* Auto-evaluate toggle */}
+        <div className="flex items-center justify-between">
+          <div>
+            <Label className="text-sm">Auto-evaluate</Label>
+            <p className="text-xs text-muted-foreground">
+              Automatically score responses after generation
+            </p>
+          </div>
+          <Checkbox
+            checked={autoEvaluate}
+            onCheckedChange={(checked) => onAutoEvaluateChange(checked === true)}
+          />
+        </div>
+
+        {/* Criteria Weights */}
+        <div>
+          <div className="flex items-center justify-between mb-3">
+            <Label className="text-sm">Criteria Weights</Label>
+            <button
+              onClick={resetWeights}
+              className="text-xs text-muted-foreground hover:text-foreground transition-colors"
+            >
+              Reset
+            </button>
+          </div>
+
+          <div className="space-y-4">
+            {CRITERIA_INFO.map(({ key, label, description }) => (
+              <div key={key} className="space-y-2">
+                <div className="flex items-center justify-between">
+                  <div>
+                    <span className="text-sm font-medium">{label}</span>
+                    <span className="text-xs text-muted-foreground ml-2">
+                      {description}
+                    </span>
+                  </div>
+                  <span className="text-sm font-mono text-muted-foreground">
+                    {Math.round(weights[key] * 100)}%
+                  </span>
+                </div>
+                <Slider
+                  value={[Math.round(weights[key] * 100)]}
+                  onValueChange={(values) => updateWeight(key, values[0])}
+                  min={0}
+                  max={100}
+                  step={5}
+                  className="w-full"
+                />
+              </div>
+            ))}
+          </div>
+
+          {Math.abs(totalWeight - 1) > 0.01 && (
+            <p className="mt-3 text-xs text-amber-600 dark:text-amber-400 flex items-center gap-1">
+              <AlertCircle className="h-3 w-3" />
+              Weights will be normalized to sum to 100%
+            </p>
+          )}
+        </div>
+      </div>
+    </Card>
+  );
+}
diff --git a/components/ai-eval/EvalModelSelector.tsx b/components/ai-eval/EvalModelSelector.tsx
new file mode 100644
index 0000000..6668e7c
--- /dev/null
+++ b/components/ai-eval/EvalModelSelector.tsx
@@ -0,0 +1,204 @@
+import { useCallback, useMemo } from "react";
+import {
+  PROVIDERS,
+  ModelConfig,
+} from "@/components/utils/ai-eval-schemas";
+import { UseApiKeysReturn } from "@/components/hooks/useApiKeys";
+import { ChevronDown, AlertCircle } from "lucide-react";
+
+interface EvalModelSelectorProps {
+  value: ModelConfig | null; // current selection; null shows the placeholder option
+  onChange: (model: ModelConfig | null) => void; // receives the picked model, or null for the placeholder
+  apiKeys: UseApiKeysReturn; // only hasKey() is used here, to flag providers without a key
+  label?: string; // optional caption rendered above the <select>
+  excludeModels?: string[]; // model ids to leave out of the option list
+  showWarning?: boolean; // when true and warningText is set, an extra amber notice is shown
+  warningText?: string; // text of that notice
+}
+// Single-choice model picker: a native <select> with one <optgroup> per provider.
+export function EvalModelSelector({
+  value,
+  onChange,
+  apiKeys,
+  label,
+  excludeModels = [],
+  showWarning = false,
+  warningText,
+}: EvalModelSelectorProps) {
+  // Per provider: whether a key is stored, plus its models minus any excluded ids.
+  const providerGroups = useMemo(() => {
+    return PROVIDERS.map((provider) => ({
+      ...provider,
+      hasKey: apiKeys.hasKey(provider.id),
+      models: provider.models.filter((m) => !excludeModels.includes(m.id)),
+    }));
+  }, [apiKeys, excludeModels]); // NOTE(review): the `[]` default is a new array per render, so this recomputes whenever excludeModels is omitted
+  // Translate the chosen <option> value (a model id) back into its ModelConfig.
+  const handleChange = useCallback(
+    (e: React.ChangeEvent<HTMLSelectElement>) => {
+      const modelId = e.target.value;
+      if (!modelId) {
+        onChange(null);
+        return;
+      }
+      // Look the id up in the full PROVIDERS list, not the filtered groups above.
+      for (const provider of PROVIDERS) {
+        const model = provider.models.find((m) => m.id === modelId);
+        if (model) {
+          onChange(model);
+          return;
+        }
+      }
+    },
+    [onChange]
+  );
+  // With nothing selected there is nothing to warn about, hence the `true` fallback.
+  const selectedProviderId = value?.providerId;
+  const hasKeyForSelected = selectedProviderId
+    ? apiKeys.hasKey(selectedProviderId)
+    : true;
+
+  return (
+    <div className="space-y-2">
+      {label && (
+        <label className="block text-sm font-medium text-foreground">
+          {label}
+        </label>
+      )}
+
+      <div className="relative">
+        <select
+          value={value?.id || ""}
+          onChange={handleChange}
+          className="w-full h-10 pl-3 pr-10 rounded-lg border border-input bg-muted text-sm ring-offset-background focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2 appearance-none cursor-pointer"
+        >
+          <option value="">Select a model...</option>
+          {providerGroups.map((provider) => (
+            <optgroup
+              key={provider.id}
+              label={`${provider.name}${!provider.hasKey ? " (no API key)" : ""}`}
+            >
+              {provider.models.map((model) => (
+                <option
+                  key={model.id}
+                  value={model.id}
+                  disabled={!provider.hasKey}
+                >
+                  {model.name}
+                </option>
+              ))}
+            </optgroup>
+          ))}
+        </select>
+        <ChevronDown className="absolute right-3 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground pointer-events-none" />
+      </div>
+
+      {!hasKeyForSelected && value && (
+        <p className="text-xs text-amber-600 dark:text-amber-400 flex items-center gap-1">
+          <AlertCircle className="h-3 w-3" />
+          Add API key for {value.providerId} to use this model
+        </p>
+      )}
+
+      {showWarning && warningText && (
+        <p className="text-xs text-amber-600 dark:text-amber-400 flex items-center gap-1">
+          <AlertCircle className="h-3 w-3" />
+          {warningText}
+        </p>
+      )}
+    </div>
+  );
+}
+
+// Multi-select variant: toggle buttons grouped by provider, capped at maxSelections.
+interface EvalMultiModelSelectorProps {
+  values: ModelConfig[]; // currently selected models
+  onChange: (models: ModelConfig[]) => void; // receives the next selection array
+  apiKeys: UseApiKeysReturn; // hasKey() decides which providers are enabled
+  label?: string; // optional caption; rendered with a "(selected/max)" counter
+  maxSelections?: number; // selection cap, defaults to 4
+}
+// One toggle button per model; a button is disabled when its provider has no key or the cap is reached.
+export function EvalMultiModelSelector({
+  values,
+  onChange,
+  apiKeys,
+  label,
+  maxSelections = 4,
+}: EvalMultiModelSelectorProps) {
+  const selectedIds = useMemo(() => new Set(values.map((m) => m.id)), [values]); // fast membership checks
+  const providerGroups = useMemo(() => {
+    return PROVIDERS.map((provider) => ({
+      ...provider,
+      hasKey: apiKeys.hasKey(provider.id),
+    }));
+  }, [apiKeys]);
+  const handleToggle = useCallback( // deselect if present, otherwise add while under the cap
+    (model: ModelConfig) => {
+      if (selectedIds.has(model.id)) {
+        onChange(values.filter((m) => m.id !== model.id));
+      } else if (values.length < maxSelections) {
+        onChange([...values, model]);
+      }
+    },
+    [values, onChange, selectedIds, maxSelections]
+  );
+
+  return (
+    <div className="space-y-3">
+      {label && (
+        <label className="block text-sm font-medium text-foreground">
+          {label}
+          <span className="text-muted-foreground font-normal ml-2">
+            ({values.length}/{maxSelections})
+          </span>
+        </label>
+      )}
+
+      <div className="space-y-4">
+        {providerGroups.map((provider) => (
+          <div key={provider.id}>
+            <div className="text-xs font-medium text-muted-foreground mb-2 flex items-center gap-2">
+              {provider.name}
+              {!provider.hasKey && (
+                <span className="text-amber-600 dark:text-amber-400">
+                  (no API key)
+                </span>
+              )}
+            </div>
+            <div className="flex flex-wrap gap-2">
+              {provider.models.map((model) => {
+                const isSelected = selectedIds.has(model.id);
+                const isDisabled =
+                  !provider.hasKey ||
+                  (!isSelected && values.length >= maxSelections);
+
+                return (
+                  <button
+                    key={model.id}
+                    type="button"
+                    aria-pressed={isSelected}
+                    onClick={() => !isDisabled && handleToggle(model)}
+                    disabled={isDisabled}
+                    className={`
+                      px-3 py-1.5 text-sm rounded-lg border transition-colors
+                      ${
+                        isSelected
+                          ? "bg-primary text-primary-foreground border-primary"
+                          : isDisabled
+                            ? "bg-muted text-muted-foreground border-border opacity-50 cursor-not-allowed"
+                            : "bg-background text-foreground border-border hover:border-primary hover:bg-accent"
+                      }
+                    `}
+                  >
+                    {model.name}
+                  </button>
+                );
+              })}
+            </div>
+          </div>
+        ))}
+      </div>
+    </div>
+  );
+}
diff --git a/components/ai-eval/EvalResultCell.tsx b/components/ai-eval/EvalResultCell.tsx
new file mode 100644
index 0000000..51abd1c
--- /dev/null
+++ b/components/ai-eval/EvalResultCell.tsx
@@ -0,0 +1,149 @@
+import { useState } from "react";
+import { Card } from "@/components/ds/CardComponent";
+import {
+  ModelConfig,
+  JudgeEvaluation,
+  getProviderById,
+} from "@/components/utils/ai-eval-schemas";
+import { EvalScoreDisplay, ScoreBadge } from "./EvalScoreDisplay";
+import { Loader2, AlertCircle, ChevronDown, ChevronUp, Copy, Check } from "lucide-react";
+
+interface EvalResultCellProps {
+  label: string; // cell heading shown in the header row
+  model: ModelConfig; // model that produced the output; its providerId resolves the provider name
+  output: string | null; // generated text, or null before a run
+  evaluation: JudgeEvaluation | null; // judge result, when available
+  isLoading: boolean; // shows the spinner instead of any content
+  error: string | null; // error message; takes precedence over output
+  isWinner?: boolean; // adds the green ring and the "Winner" pill
+  latencyMs?: number; // rendered as seconds with two decimals when defined
+}
+// Result card for one cell: header, model info, then a loading / error / output / empty state.
+export function EvalResultCell({
+  label,
+  model,
+  output,
+  evaluation,
+  isLoading,
+  error,
+  isWinner,
+  latencyMs,
+}: EvalResultCellProps) {
+  const [expanded, setExpanded] = useState(true);
+  const [copied, setCopied] = useState(false);
+  const provider = getProviderById(model.providerId);
+
+  const handleCopy = async () => {
+    if (!output) return;
+    try { // clipboard access can be unavailable or denied; log it rather than leave an unhandled rejection
+      await navigator.clipboard.writeText(output);
+      setCopied(true);
+      setTimeout(() => setCopied(false), 2000); // revert the check icon after 2s
+    } catch (err) {
+      console.error("Failed to copy output to clipboard:", err);
+    }
+  };
+
+  return (
+    <Card
+      className={`
+        p-4 hover:shadow-none shadow-none rounded-xl transition-all
+        ${isWinner ? "ring-2 ring-green-500/50" : ""}
+      `}
+    >
+      {/* Header */}
+      <div className="flex items-center justify-between mb-3">
+        <div className="flex items-center gap-2">
+          <span className="text-sm font-semibold">{label}</span>
+          {isWinner && (
+            <span className="text-xs bg-green-100 dark:bg-green-900/50 text-green-700 dark:text-green-300 px-2 py-0.5 rounded-full">
+              Winner
+            </span>
+          )}
+        </div>
+        {evaluation && (
+          <ScoreBadge
+            score={evaluation.overallScore}
+            isWinner={isWinner}
+            size="sm"
+          />
+        )}
+      </div>
+      {/* Model Info */}
+      <div className="flex items-center gap-2 mb-3 text-xs text-muted-foreground">
+        <span className="font-medium">{provider?.name}</span>
+        <span>·</span>
+        <span>{model.name}</span>
+        {latencyMs !== undefined && (
+          <>
+            <span>·</span>
+            <span>{(latencyMs / 1000).toFixed(2)}s</span>
+          </>
+        )}
+      </div>
+      {/* Content */}
+      {isLoading ? (
+        <div className="flex items-center justify-center py-8 text-muted-foreground">
+          <Loader2 className="h-5 w-5 animate-spin mr-2" />
+          <span className="text-sm">Generating...</span>
+        </div>
+      ) : error ? (
+        <div className="py-4 px-3 bg-red-50 dark:bg-red-950/30 rounded-lg">
+          <div className="flex items-start gap-2 text-red-600 dark:text-red-400">
+            <AlertCircle className="h-4 w-4 mt-0.5 shrink-0" />
+            <p className="text-sm">{error}</p>
+          </div>
+        </div>
+      ) : output ? (
+        <div className="space-y-3">
+          {/* Output */}
+          <div className="relative">
+            <button
+              onClick={() => setExpanded(!expanded)}
+              className="flex items-center gap-1 text-xs text-muted-foreground hover:text-foreground mb-2"
+            >
+              {expanded ? (
+                <ChevronUp className="h-3 w-3" />
+              ) : (
+                <ChevronDown className="h-3 w-3" />
+              )}
+              Output
+            </button>
+            {expanded && (
+              <div className="relative group">
+                <div className="bg-muted rounded-lg p-3 text-sm leading-relaxed max-h-64 overflow-y-auto">
+                  <pre className="whitespace-pre-wrap font-sans">{output}</pre>
+                </div>
+                <button
+                  aria-label="Copy output"
+                  onClick={handleCopy}
+                  className="absolute top-2 right-2 p-1.5 rounded-md bg-background/80 opacity-0 group-hover:opacity-100 transition-opacity hover:bg-background"
+                >
+                  {copied ? (
+                    <Check className="h-3.5 w-3.5 text-green-500" />
+                  ) : (
+                    <Copy className="h-3.5 w-3.5 text-muted-foreground" />
+                  )}
+                </button>
+              </div>
+            )}
+          </div>
+          {/* Evaluation */}
+          {evaluation && (
+            <div className="pt-3 border-t border-border">
+              <EvalScoreDisplay
+                evaluation={evaluation}
+                isWinner={isWinner}
+                showBreakdown={true}
+              />
+            </div>
+          )}
+        </div>
+      ) : (
+        <div className="py-8 text-center text-muted-foreground text-sm">
+          Run evaluation to see output
+        </div>
+      )}
+    </Card>
+  );
+}
diff --git a/components/ai-eval/EvalScoreDisplay.tsx b/components/ai-eval/EvalScoreDisplay.tsx
new file mode 100644
index 0000000..2aee27d
--- /dev/null
+++ b/components/ai-eval/EvalScoreDisplay.tsx
@@ -0,0 +1,151 @@
+import {
+  JudgeEvaluation,
+  ScoreBreakdown,
+  getScoreColorClass,
+  getScoreBgClass,
+} from "@/components/utils/ai-eval-schemas";
+import { Trophy } from "lucide-react";
+
+interface ScoreBadgeProps {
+  score: number; // rendered with one decimal place, followed by "/ 10"
+  isWinner?: boolean; // adds a trophy icon when truthy
+  size?: "sm" | "md" | "lg"; // selects the text/padding classes; defaults to "md"
+}
+// Pill-shaped score label; its colors come from getScoreBgClass and getScoreColorClass.
+export function ScoreBadge({ score, isWinner, size = "md" }: ScoreBadgeProps) {
+  const sizeClasses = {
+    sm: "text-sm px-2 py-0.5",
+    md: "text-base px-3 py-1",
+    lg: "text-lg px-4 py-1.5 font-semibold",
+  };
+
+  return (
+    <div
+      className={`
+        inline-flex items-center gap-1.5 rounded-full font-medium
+        ${sizeClasses[size]}
+        ${getScoreBgClass(score)}
+        ${getScoreColorClass(score)}
+      `}
+    >
+      {isWinner && <Trophy className="h-3.5 w-3.5" />}
+      <span>{score.toFixed(1)}</span>
+      <span className="opacity-60">/ 10</span>
+    </div>
+  );
+}
+
+interface ScoreBarProps {
+  label: string; // criterion name shown to the left of the bar
+  score: number; // raw score, also printed to the right of the bar
+  maxScore?: number; // scale maximum used for the fill width; defaults to 10
+}
+// Horizontal bar for one criterion: fill width is score/maxScore, fill color is banded at 8 and 5.
+export function ScoreBar({ label, score, maxScore = 10 }: ScoreBarProps) {
+  const ratio = maxScore > 0 && Number.isFinite(score) ? score / maxScore : 0; // avoid NaN/Infinity widths
+  const percentage = Math.min(100, Math.max(0, ratio * 100)); // clamp to a valid CSS percentage
+  return (
+    <div className="flex items-center gap-3">
+      <span className="text-xs text-muted-foreground w-24 shrink-0">
+        {label}
+      </span>
+      <div className="flex-1 h-2 bg-muted rounded-full overflow-hidden">
+        <div
+          className={`h-full rounded-full transition-all ${
+            score >= 8
+              ? "bg-green-500"
+              : score >= 5
+                ? "bg-yellow-500"
+                : "bg-red-500"
+          }`}
+          style={{ width: `${percentage}%` }}
+        />
+      </div>
+      <span className={`text-xs font-medium w-6 text-right ${getScoreColorClass(score)}`}>
+        {score}
+      </span>
+    </div>
+  );
+}
+
+interface ScoreBreakdownDisplayProps {
+  scores: ScoreBreakdown; // per-criterion scores, read by the keys listed in `criteria`
+  compact?: boolean; // truthy renders the 5-column grid instead of bars
+}
+// Shows the five criterion scores, either as ScoreBar rows or as a compact grid.
+export function ScoreBreakdownDisplay({
+  scores,
+  compact,
+}: ScoreBreakdownDisplayProps) {
+  const criteria: { key: keyof ScoreBreakdown; label: string }[] = [
+    { key: "accuracy", label: "Accuracy" },
+    { key: "relevance", label: "Relevance" },
+    { key: "clarity", label: "Clarity" },
+    { key: "completeness", label: "Completeness" },
+    { key: "conciseness", label: "Conciseness" },
+  ];
+  // Compact mode: one column per criterion, labelled with the first three letters of its name.
+  if (compact) {
+    return (
+      <div className="grid grid-cols-5 gap-2 text-center">
+        {criteria.map(({ key, label }) => (
+          <div key={key} className="space-y-1">
+            <div className="text-[10px] text-muted-foreground uppercase tracking-wider">
+              {label.slice(0, 3)}
+            </div>
+            <div className={`text-sm font-medium ${getScoreColorClass(scores[key])}`}>
+              {scores[key]}
+            </div>
+          </div>
+        ))}
+      </div>
+    );
+  }
+  // Default mode: one labelled bar per criterion.
+  return (
+    <div className="space-y-2">
+      {criteria.map(({ key, label }) => (
+        <ScoreBar key={key} label={label} score={scores[key]} />
+      ))}
+    </div>
+  );
+}
+
+interface EvalScoreDisplayProps {
+  evaluation: JudgeEvaluation; // supplies overallScore, scores and reasoning
+  isWinner?: boolean; // forwarded to ScoreBadge
+  showBreakdown?: boolean; // defaults to true; false hides the per-criterion bars
+}
+// Full evaluation summary: overall badge, optional per-criterion breakdown, optional reasoning text.
+export function EvalScoreDisplay({
+  evaluation,
+  isWinner,
+  showBreakdown = true,
+}: EvalScoreDisplayProps) {
+  return (
+    <div className="space-y-4">
+      <div className="flex items-center justify-between">
+        <span className="text-sm font-medium text-muted-foreground">Score</span>
+        <ScoreBadge
+          score={evaluation.overallScore}
+          isWinner={isWinner}
+          size="md"
+        />
+      </div>
+
+      {showBreakdown && (
+        <div className="pt-2 border-t border-border">
+          <ScoreBreakdownDisplay scores={evaluation.scores} />
+        </div>
+      )}
+
+      {evaluation.reasoning && (
+        <div className="pt-2 border-t border-border">
+          <p className="text-xs text-muted-foreground leading-relaxed">
+            {evaluation.reasoning}
+          </p>
+        </div>
+      )}
+    </div>
+  );
+}
diff --git a/components/hooks/useApiKeys.ts b/components/hooks/useApiKeys.ts
new file mode 100644
index 0000000..3f5cb0e
--- /dev/null
+++ b/components/hooks/useApiKeys.ts
@@ -0,0 +1,99 @@
+import { useState, useEffect, useCallback } from "react";
+import {
+  StoredApiKeys,
+  ProviderId,
+  StoredApiKeysSchema,
+} from "@/components/utils/ai-eval-schemas";
+import { validateApiKey } from "@/components/utils/ai-eval-providers";
+
+const STORAGE_KEY = "jam-ai-eval-keys";
+
+export interface UseApiKeysReturn {
+  keys: StoredApiKeys; // current in-memory key map
+  setKey: (providerId: ProviderId, key: string) => void; // store or replace a provider's key
+  removeKey: (providerId: ProviderId) => void; // delete a provider's entry
+  hasKey: (providerId: ProviderId) => boolean; // true when a truthy key is stored for the provider
+  getKey: (providerId: ProviderId) => string | undefined; // stored key, if any
+  testKey: (providerId: ProviderId) => Promise<boolean>; // false without a key, else validateApiKey's result
+  isLoaded: boolean; // true once the initial sessionStorage read has finished
+}
+// React hook that keeps provider API keys in state and mirrors them to sessionStorage under STORAGE_KEY.
+export function useApiKeys(): UseApiKeysReturn {
+  const [keys, setKeys] = useState<StoredApiKeys>({});
+  const [isLoaded, setIsLoaded] = useState(false);
+
+  // Load from sessionStorage on mount; only data that passes StoredApiKeysSchema is adopted.
+  useEffect(() => {
+    try {
+      const stored = sessionStorage.getItem(STORAGE_KEY);
+      if (stored) {
+        const parsed = JSON.parse(stored);
+        const validated = StoredApiKeysSchema.safeParse(parsed);
+        if (validated.success) {
+          setKeys(validated.data);
+        }
+      }
+    } catch (error) {
+      console.error("Failed to load API keys from sessionStorage:", error);
+    }
+    setIsLoaded(true); // set even after a failed read so the save effect below can start
+  }, []);
+  // NOTE(review): if the stored value fails validation, keys stays {} and the effect below writes {} back.
+  // Save to sessionStorage whenever keys change
+  useEffect(() => {
+    if (!isLoaded) return; // skip until the initial read is done, so the empty initial state is not written first
+    try {
+      sessionStorage.setItem(STORAGE_KEY, JSON.stringify(keys));
+    } catch (error) {
+      console.error("Failed to save API keys to sessionStorage:", error);
+    }
+  }, [keys, isLoaded]);
+  // Functional updates keep setKey/removeKey stable (empty dependency arrays).
+  const setKey = useCallback((providerId: ProviderId, key: string) => {
+    setKeys((prev) => ({
+      ...prev,
+      [providerId]: key,
+    }));
+  }, []);
+
+  const removeKey = useCallback((providerId: ProviderId) => {
+    setKeys((prev) => {
+      const next = { ...prev }; // copy first so the previous state object is not mutated
+      delete next[providerId];
+      return next;
+    });
+  }, []);
+  // The readers below depend on `keys`, so they get a new identity whenever keys change.
+  const hasKey = useCallback(
+    (providerId: ProviderId) => {
+      return Boolean(keys[providerId]);
+    },
+    [keys]
+  );
+
+  const getKey = useCallback(
+    (providerId: ProviderId) => {
+      return keys[providerId];
+    },
+    [keys]
+  );
+  // Delegates to validateApiKey; resolves false immediately when no key is stored.
+  const testKey = useCallback(
+    async (providerId: ProviderId): Promise<boolean> => {
+      const key = keys[providerId];
+      if (!key) return false;
+      return validateApiKey(providerId, key);
+    },
+    [keys]
+  );
+  // NOTE(review): this object literal is new on every render, so consumers memoizing on it will recompute.
+  return {
+    keys,
+    setKey,
+    removeKey,
+    hasKey,
+    getKey,
+    testKey,
+    isLoaded,
+  };
+}
diff --git a/components/utils/ai-eval-judge.ts b/components/utils/ai-eval-judge.ts
new file mode 100644
index 0000000..9063097
--- /dev/null
+++ b/components/utils/ai-eval-judge.ts
@@ -0,0 +1,362 @@
+import {
+  JudgeEvaluation,
+  JudgeEvaluationSchema,
+  CriteriaWeights,
+  calculateWeightedScore,
+  ChatMessage,
+} from "./ai-eval-schemas";
+import { chat } from "./ai-eval-providers";
+
+// ============================================================================
+// Judge System Prompts
+// ============================================================================
+
+const SINGLE_RESPONSE_JUDGE_PROMPT = `You are an expert AI output evaluator. Your task is to analyze an AI response and score it objectively.
+
+Evaluate the response on these criteria (1-10 scale):
+
+1. ACCURACY (1-10): Is the information factually correct and reliable?
+   - 1-3: Contains significant errors or misinformation
+   - 4-6: Mostly accurate with minor issues
+   - 7-10: Highly accurate and reliable
+
+2. RELEVANCE (1-10): Does it directly address what was asked?
+   - 1-3: Off-topic or misses the point
+   - 4-6: Partially addresses the question
+   - 7-10: Directly and fully addresses the query
+
+3. CLARITY (1-10): Is it well-organized and easy to understand?
+   - 1-3: Confusing or poorly structured
+   - 4-6: Understandable but could be clearer
+   - 7-10: Crystal clear and well-organized
+
+4. COMPLETENESS (1-10): Does it cover all important aspects?
+   - 1-3: Missing critical information
+   - 4-6: Covers basics but lacks depth
+   - 7-10: Comprehensive and thorough
+
+5. CONCISENESS (1-10): Is it appropriately detailed without being verbose?
+   - 1-3: Extremely verbose or too brief
+   - 4-6: Could be more concise or needs more detail
+   - 7-10: Perfectly balanced length
+
+You MUST respond with valid JSON matching this exact schema:
+{
+  "scores": {
+    "accuracy": <number 1-10>,
+    "relevance": <number 1-10>,
+    "clarity": <number 1-10>,
+    "completeness": <number 1-10>,
+    "conciseness": <number 1-10>
+  },
+  "overallScore": <number 1-10>,
+  "reasoning": "<brief explanation of your evaluation, max 500 chars>"
+}
+
+Be fair, objective, and consistent in your scoring.`;
+
+const COMPARISON_JUDGE_PROMPT = `You are an expert AI output evaluator. Your task is to compare two AI responses (A and B) to the same prompt and determine which is better.
+
+Evaluate BOTH responses on these criteria (1-10 scale):
+
+1. ACCURACY: Is the information factually correct?
+2. RELEVANCE: Does it directly address what was asked?
+3. CLARITY: Is it well-organized and easy to understand?
+4. COMPLETENESS: Does it cover all important aspects?
+5. CONCISENESS: Is it appropriately detailed without being verbose?
+
+You MUST respond with valid JSON matching this exact schema:
+{
+  "responseA": {
+    "scores": {
+      "accuracy": <number 1-10>,
+      "relevance": <number 1-10>,
+      "clarity": <number 1-10>,
+      "completeness": <number 1-10>,
+      "conciseness": <number 1-10>
+    },
+    "overallScore": <number 1-10>,
+    "reasoning": "<brief explanation, max 300 chars>"
+  },
+  "responseB": {
+    "scores": {
+      "accuracy": <number 1-10>,
+      "relevance": <number 1-10>,
+      "clarity": <number 1-10>,
+      "completeness": <number 1-10>,
+      "conciseness": <number 1-10>
+    },
+    "overallScore": <number 1-10>,
+    "reasoning": "<brief explanation, max 300 chars>"
+  },
+  "winner": "<'A', 'B', or 'tie'>",
+  "comparisonReasoning": "<why one is better or why they're equal, max 300 chars>"
+}
+
+Be fair, objective, and explain your reasoning clearly.`;
+
+// ============================================================================
+// Judge Evaluation Functions
+// ============================================================================
+
+interface JudgeSingleParams {
+  apiKey: string; // passed straight to chat()
+  judgeModel: string; // sent as the `model` of the judge request
+  originalPrompt: string; // prompt the response was generated for
+  response: string; // text to be scored
+  weights: CriteriaWeights; // used to recompute overallScore from the criterion scores
+}
+// Same as above, but with two candidate responses labelled A and B.
+interface JudgeCompareParams {
+  apiKey: string;
+  judgeModel: string;
+  originalPrompt: string;
+  responseA: string;
+  responseB: string;
+  weights: CriteriaWeights;
+}
+// Result of a pairwise comparison.
+interface ComparisonResult {
+  evaluationA: JudgeEvaluation;
+  evaluationB: JudgeEvaluation;
+  winner: "A" | "B" | "tie";
+  comparisonReasoning: string; // judge's explanation; "" when the judge omitted it
+}
+
+/**
+ * Score one response with an LLM judge; the overall score is recomputed from the caller's weights.
+ */
+export async function judgeSingleResponse(
+  params: JudgeSingleParams
+): Promise<JudgeEvaluation> {
+  // User turn for the judge: the prompt, then the answer under review.
+  const judgeInput = `## Original Prompt
+${params.originalPrompt}
+
+## AI Response to Evaluate
+${params.response}
+
+Evaluate this response now.`;
+
+  const conversation: ChatMessage[] = [
+    { role: "system", content: SINGLE_RESPONSE_JUDGE_PROMPT },
+    { role: "user", content: judgeInput },
+  ];
+
+  // A low temperature keeps repeated scoring runs more consistent.
+  const completion = await chat(params.apiKey, {
+    model: params.judgeModel,
+    messages: conversation,
+    jsonMode: true,
+    temperature: 0.3,
+  });
+
+  // Parse and validate the judge's JSON reply.
+  const evaluation = parseJudgeResponse(completion.content);
+  // The judge's own overallScore is replaced by the weighted one.
+  return {
+    ...evaluation,
+    overallScore: calculateWeightedScore(evaluation.scores, params.weights),
+  };
+}
+
+/**
+ * Pairwise LLM-as-judge comparison of two responses to the same prompt.
+ *
+ * Both responses go to the judge in one request. The judge's own overall
+ * scores and winner are then replaced: each overall score is recomputed from
+ * the per-criterion scores using the caller's weights, and the winner is the
+ * side that leads by more than 0.5 (anything closer is a "tie").
+ *
+ * @param params - API key, judge model id, original prompt, both responses and weights.
+ * @returns Both evaluations plus the recomputed winner and the judge's comparison text.
+ */
+export async function judgeCompareResponses(
+  params: JudgeCompareParams
+): Promise<ComparisonResult> {
+  // User turn for the judge: the prompt, then both candidate answers.
+  const judgeInput = `## Original Prompt
+${params.originalPrompt}
+
+## Response A
+${params.responseA}
+
+## Response B
+${params.responseB}
+
+Compare these responses and provide your evaluation.`;
+
+  const conversation: ChatMessage[] = [
+    { role: "system", content: COMPARISON_JUDGE_PROMPT },
+    { role: "user", content: judgeInput },
+  ];
+
+  const completion = await chat(params.apiKey, {
+    model: params.judgeModel,
+    messages: conversation,
+    jsonMode: true,
+    temperature: 0.3,
+  });
+
+  // Parse the judge's reply into two evaluations plus its verdict.
+  const comparison = parseComparisonResponse(completion.content);
+  const { evaluationA, evaluationB } = comparison;
+
+  // Replace the judge's overall scores with ones weighted by the user's criteria.
+  evaluationA.overallScore = calculateWeightedScore(
+    evaluationA.scores,
+    params.weights
+  );
+  evaluationB.overallScore = calculateWeightedScore(
+    evaluationB.scores,
+    params.weights
+  );
+
+  // Recompute the winner from the weighted scores.
+  const aLeads = evaluationA.overallScore > evaluationB.overallScore + 0.5;
+  const bLeads = evaluationB.overallScore > evaluationA.overallScore + 0.5;
+  comparison.winner = aLeads ? "A" : bLeads ? "B" : "tie";
+
+  return comparison;
+}
+
+// ============================================================================
+// Response Parsing
+// ============================================================================
+
+// Extracts and validates a JudgeEvaluation from raw judge output.
+// Never throws: any extraction, JSON or schema failure is logged and a fallback is returned.
+function parseJudgeResponse(content: string): JudgeEvaluation {
+  // Mid-scale evaluation handed back when the judge output is unusable.
+  const fallback: JudgeEvaluation = {
+    scores: {
+      accuracy: 5,
+      relevance: 5,
+      clarity: 5,
+      completeness: 5,
+      conciseness: 5,
+    },
+    overallScore: 5,
+    reasoning: "Failed to parse evaluation. Using default scores.",
+  };
+
+  try {
+    // Take everything from the first "{" to the last "}" so text around the JSON is ignored.
+    const jsonText = content.match(/\{[\s\S]*\}/)?.[0];
+    if (!jsonText) {
+      throw new Error("No JSON found in response");
+    }
+    return JudgeEvaluationSchema.parse(JSON.parse(jsonText));
+  } catch (error) {
+    console.error("Failed to parse judge response:", content, error);
+    return fallback;
+  }
+}
+
+// Parses a pairwise judge reply; never throws, falling back to default evaluations and a "tie".
+function parseComparisonResponse(content: string): ComparisonResult {
+  try {
+    const jsonMatch = content.match(/\{[\s\S]*\}/);
+    if (!jsonMatch) {
+      throw new Error("No JSON found in response");
+    }
+
+    const parsed = JSON.parse(jsonMatch[0]);
+
+    // Validate both evaluations
+    const evalA = JudgeEvaluationSchema.parse({
+      scores: parsed.responseA.scores,
+      overallScore: parsed.responseA.overallScore,
+      reasoning: parsed.responseA.reasoning,
+    });
+
+    const evalB = JudgeEvaluationSchema.parse({
+      scores: parsed.responseB.scores,
+      overallScore: parsed.responseB.overallScore,
+      reasoning: parsed.responseB.reasoning,
+    });
+
+    // These fields are unvalidated model output: accept only known winners and string reasoning.
+    const winner: ComparisonResult["winner"] =
+      parsed.winner === "A" || parsed.winner === "B" ? parsed.winner : "tie";
+    const comparisonReasoning =
+      typeof parsed.comparisonReasoning === "string" ? parsed.comparisonReasoning : "";
+    return { evaluationA: evalA, evaluationB: evalB, winner, comparisonReasoning };
+  } catch (error) {
+    console.error("Failed to parse comparison response:", content, error);
+    // Return default evaluations on parse failure
+    const defaultEval: JudgeEvaluation = {
+      scores: {
+        accuracy: 5,
+        relevance: 5,
+        clarity: 5,
+        completeness: 5,
+        conciseness: 5,
+      },
+      overallScore: 5,
+      reasoning: "Failed to parse evaluation.",
+    };
+
+    return {
+      evaluationA: { ...defaultEval },
+      evaluationB: { ...defaultEval },
+      winner: "tie",
+      comparisonReasoning: "Failed to parse comparison. Using default scores.",
+    };
+  }
+}
+
+// ============================================================================
+// Batch Evaluation
+// ============================================================================
+
+interface BatchEvalParams {
+  apiKey: string; // forwarded to every judgeSingleResponse call
+  judgeModel: string; // judge model used for every item
+  evaluations: Array<{
+    id: string; // key under which the item's result is stored in the returned Map
+    originalPrompt: string;
+    response: string;
+  }>;
+  weights: CriteriaWeights; // shared by all items
+}
+
+/**
+ * Judge several responses one at a time (sequential to respect rate limits); failures become zero-score entries.
+ */
+export async function judgeBatchResponses(
+  params: BatchEvalParams
+): Promise<Map<string, JudgeEvaluation>> {
+  const byId = new Map<string, JudgeEvaluation>();
+
+  for (const { id, originalPrompt, response } of params.evaluations) {
+    let outcome: JudgeEvaluation;
+    try {
+      outcome = await judgeSingleResponse({
+        apiKey: params.apiKey,
+        judgeModel: params.judgeModel,
+        originalPrompt,
+        response,
+        weights: params.weights,
+      });
+    } catch (error) {
+      console.error(`Failed to evaluate ${id}:`, error);
+      const reasoning =
+        error instanceof Error ? error.message : "Evaluation failed";
+      outcome = {
+        scores: {
+          accuracy: 0,
+          relevance: 0,
+          clarity: 0,
+          completeness: 0,
+          conciseness: 0,
+        },
+        overallScore: 0,
+        reasoning,
+      };
+    }
+    byId.set(id, outcome);
+  }
+
+  return byId;
+}
diff --git a/components/utils/ai-eval-providers.ts b/components/utils/ai-eval-providers.ts
new file mode 100644
index 0000000..9568cc6
--- /dev/null
+++ b/components/utils/ai-eval-providers.ts
@@ -0,0 +1,309 @@
+import {
+  ChatParams,
+  ChatResponse,
+  ProviderId,
+  getProviderById,
+  getModelById,
+} from "./ai-eval-schemas";
+
+// ============================================================================
+// Provider Adapter Interface
+// ============================================================================
+
+export interface ProviderAdapter {
+  id: ProviderId; // which provider this adapter serves
+  name: string; // display name, e.g. "OpenAI"
+  chat(apiKey: string, params: ChatParams): Promise<ChatResponse>; // one completion request
+  validateKey(apiKey: string): Promise<boolean>; // adapters in this file resolve false rather than reject
+}
+
+// ============================================================================
+// OpenAI Adapter
+// ============================================================================
+
+const openaiAdapter: ProviderAdapter = {
+  id: "openai",
+  name: "OpenAI",
+
+  // POST the conversation to the configured chat-completions endpoint and
+  // normalise the first returned choice into a ChatResponse.
+  async chat(apiKey: string, params: ChatParams): Promise<ChatResponse> {
+    const config = getProviderById("openai");
+    if (!config) throw new Error("OpenAI provider not found");
+
+    const payload: Record<string, unknown> = {
+      model: params.model,
+      messages: params.messages,
+      max_tokens: params.maxTokens ?? 4096,
+      temperature: params.temperature ?? 0.7,
+      ...(params.jsonMode ? { response_format: { type: "json_object" } } : {}),
+    };
+
+    const res = await fetch(config.apiEndpoint, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        Authorization: `Bearer ${apiKey}`,
+      },
+      body: JSON.stringify(payload),
+    });
+
+    // Prefer the API's own error text; fall back to the HTTP status.
+    if (!res.ok) {
+      const failure = await res.json().catch(() => ({}));
+      const detail = failure.error?.message || `OpenAI API error: ${res.status}`;
+      throw new Error(detail);
+    }
+
+    const parsed = await res.json();
+    const first = parsed.choices?.[0];
+
+    let usage: ChatResponse["usage"];
+    if (parsed.usage) {
+      usage = {
+        promptTokens: parsed.usage.prompt_tokens,
+        completionTokens: parsed.usage.completion_tokens,
+        totalTokens: parsed.usage.total_tokens,
+      };
+    }
+
+    return {
+      content: first?.message?.content || "",
+      model: parsed.model,
+      usage,
+      finishReason: first?.finish_reason,
+    };
+  },
+
+  // A key counts as valid when the models listing answers with an ok status;
+  // a failed request is reported as invalid too.
+  async validateKey(apiKey: string): Promise<boolean> {
+    const headers = { Authorization: `Bearer ${apiKey}` };
+    try {
+      return (await fetch("https://api.openai.com/v1/models", { headers })).ok;
+    } catch {
+      return false;
+    }
+  },
+};
+
+// ============================================================================
+// Anthropic Adapter
+// ============================================================================
+
+const anthropicAdapter: ProviderAdapter = {
+  id: "anthropic",
+  name: "Anthropic",
+
+  // Send the conversation to the Messages API and normalise the reply into a
+  // ChatResponse. The system message is lifted out of the message list, and a
+  // non-2xx reply throws with the API's error text when one is provided.
+  async chat(apiKey: string, params: ChatParams): Promise<ChatResponse> {
+    const provider = getProviderById("anthropic");
+    if (!provider) throw new Error("Anthropic provider not found");
+
+    // Anthropic takes the system prompt as a top-level field, not a message.
+    const systemMessage = params.messages.find((m) => m.role === "system");
+    const otherMessages = params.messages.filter((m) => m.role !== "system");
+
+    const body: Record<string, unknown> = {
+      model: params.model,
+      max_tokens: params.maxTokens ?? 4096,
+      // Same 0.7 default as the other adapters; Anthropic only accepts 0..1.
+      temperature: Math.min(Math.max(params.temperature ?? 0.7, 0), 1),
+      messages: otherMessages.map((m) => ({
+        role: m.role === "assistant" ? "assistant" : "user",
+        content: m.content,
+      })),
+    };
+
+    if (systemMessage) {
+      body.system = systemMessage.content;
+    }
+
+    const response = await fetch(provider.apiEndpoint, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        "x-api-key": apiKey,
+        "anthropic-version": "2023-06-01",
+        "anthropic-dangerous-direct-browser-access": "true",
+      },
+      body: JSON.stringify(body),
+    });
+
+    if (!response.ok) {
+      const error = await response.json().catch(() => ({}));
+      throw new Error(
+        error.error?.message || `Anthropic API error: ${response.status}`
+      );
+    }
+
+    const data = await response.json();
+
+    return {
+      // Only the first content block is read; a missing one yields "".
+      content: data.content?.[0]?.text || "",
+      model: data.model,
+      usage: data.usage
+        ? {
+            promptTokens: data.usage.input_tokens,
+            completionTokens: data.usage.output_tokens,
+            totalTokens: data.usage.input_tokens + data.usage.output_tokens,
+          }
+        : undefined,
+      finishReason: data.stop_reason,
+    };
+  },
+
+  // Resolve true when the key is accepted. Uses the read-only model listing so
+  // the check generates no completion and does not depend on one particular
+  // model id still being served; a failed request is reported as invalid.
+  async validateKey(apiKey: string): Promise<boolean> {
+    try {
+      const response = await fetch("https://api.anthropic.com/v1/models", {
+        headers: {
+          "x-api-key": apiKey,
+          "anthropic-version": "2023-06-01",
+          "anthropic-dangerous-direct-browser-access": "true",
+        },
+      });
+      return response.ok;
+    } catch {
+      return false;
+    }
+  },
+};
+
+// ============================================================================
+// Google AI Adapter
+// ============================================================================
+
+const googleAdapter: ProviderAdapter = {
+  id: "google",
+  name: "Google AI",
+
+  // Call generateContent for a known model and normalise the first candidate
+  // into a ChatResponse. Throws with the API's error text on a non-2xx reply.
+  async chat(apiKey: string, params: ChatParams): Promise<ChatResponse> {
+    const model = getModelById(params.model);
+    if (!model) throw new Error(`Model not found: ${params.model}`);
+
+    // The key travels in a header, not the query string, so it stays out of URLs.
+    const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(params.model)}:generateContent`;
+
+    // Convert messages to Google format
+    const systemMessage = params.messages.find((m) => m.role === "system");
+    const otherMessages = params.messages.filter((m) => m.role !== "system");
+
+    const contents = otherMessages.map((m) => ({
+      role: m.role === "assistant" ? "model" : "user",
+      parts: [{ text: m.content }],
+    }));
+
+    const generationConfig: Record<string, unknown> = {
+      maxOutputTokens: params.maxTokens ?? 4096,
+      temperature: params.temperature ?? 0.7,
+    };
+    if (params.jsonMode) generationConfig.responseMimeType = "application/json";
+
+    const body: Record<string, unknown> = { contents, generationConfig };
+    if (systemMessage) {
+      body.systemInstruction = { parts: [{ text: systemMessage.content }] };
+    }
+
+    const response = await fetch(endpoint, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        "x-goog-api-key": apiKey,
+      },
+      body: JSON.stringify(body),
+    });
+
+    if (!response.ok) {
+      const error = await response.json().catch(() => ({}));
+      throw new Error(
+        error.error?.message || `Google AI API error: ${response.status}`
+      );
+    }
+
+    const data = await response.json();
+    const candidate = data.candidates?.[0];
+    // A candidate may carry several parts; join their text instead of keeping only the first.
+    const parts: Array<{ text?: string }> = candidate?.content?.parts ?? [];
+    const content = parts.map((part) => part.text ?? "").join("");
+
+    return {
+      content,
+      model: params.model,
+      usage: data.usageMetadata
+        ? {
+            promptTokens: data.usageMetadata.promptTokenCount || 0,
+            completionTokens: data.usageMetadata.candidatesTokenCount || 0,
+            totalTokens: data.usageMetadata.totalTokenCount || 0,
+          }
+        : undefined,
+      finishReason: candidate?.finishReason,
+    };
+  },
+
+  // Valid when the model listing answers ok; a failed request counts as invalid.
+  async validateKey(apiKey: string): Promise<boolean> {
+    try {
+      const response = await fetch(
+        "https://generativelanguage.googleapis.com/v1beta/models",
+        { headers: { "x-goog-api-key": apiKey } }
+      );
+      return response.ok;
+    } catch {
+      return false;
+    }
+  },
+};
+
+// ============================================================================
+// Adapter Registry
+// ============================================================================
+
+const adapters: Record<ProviderId, ProviderAdapter> = {
+  openai: openaiAdapter,
+  anthropic: anthropicAdapter,
+  google: googleAdapter,
+};
+
+// Return the registered adapter for a provider id.
+export function getAdapter(providerId: ProviderId): ProviderAdapter {
+  const found = adapters[providerId];
+  if (found) return found;
+  // Reached only when the registry has no entry for the given id.
+  throw new Error(`No adapter found for provider: ${providerId}`);
+}
+
+// Resolve a model id to the adapter of the provider that serves it.
+// Throws when the model id is not in the model catalogue.
+export function getAdapterForModel(modelId: string): ProviderAdapter {
+  const match = getModelById(modelId);
+  if (match) return getAdapter(match.providerId);
+  throw new Error(`Model not found: ${modelId}`);
+}
+
+// ============================================================================
+// Unified Chat Function
+// ============================================================================
+
+// Route a chat request to whichever adapter serves params.model.
+export async function chat(
+  apiKey: string,
+  params: ChatParams
+): Promise<ChatResponse> {
+  return getAdapterForModel(params.model).chat(apiKey, params);
+}
+
+// Ask the given provider's adapter whether the key is accepted.
+export async function validateApiKey(
+  providerId: ProviderId,
+  apiKey: string
+): Promise<boolean> {
+  return getAdapter(providerId).validateKey(apiKey);
+}
diff --git a/components/utils/ai-eval-schemas.test.ts b/components/utils/ai-eval-schemas.test.ts
new file mode 100644
index 0000000..746ab69
--- /dev/null
+++ b/components/utils/ai-eval-schemas.test.ts
@@ -0,0 +1,354 @@
+import {
+  extractVariables,
+  resolveTemplate,
+  calculateWeightedScore,
+  getScoreColorClass,
+  getScoreBgClass,
+  getModelById,
+  getProviderById,
+  getProviderForModel,
+  JudgeEvaluationSchema,
+  ScoreBreakdownSchema,
+  DEFAULT_CRITERIA_WEIGHTS,
+} from "./ai-eval-schemas";
+
+describe("extractVariables", () => {
+  it("extracts single variable", () => {
+    const result = extractVariables("Hello {{name}}!");
+    expect(result).toEqual(["name"]);
+  });
+
+  it("extracts multiple variables", () => {
+    const result = extractVariables("{{greeting}} {{name}}, how are you?");
+    expect(result).toEqual(["greeting", "name"]);
+  });
+
+  it("extracts duplicate variables only once", () => {
+    const result = extractVariables("{{name}} and {{name}} again");
+    expect(result).toEqual(["name"]);
+  });
+
+  it("returns empty array when no variables", () => {
+    const result = extractVariables("Hello world!");
+    expect(result).toEqual([]);
+  });
+
+  it("handles variables with underscores", () => {
+    const result = extractVariables("{{first_name}} {{last_name}}");
+    expect(result).toEqual(["first_name", "last_name"]);
+  });
+
+  it("handles variables with numbers", () => {
+    const result = extractVariables("{{var1}} {{var2}}");
+    expect(result).toEqual(["var1", "var2"]);
+  });
+});
+
+describe("resolveTemplate", () => {
+  it("resolves single variable", () => {
+    const result = resolveTemplate("Hello {{name}}!", { name: "World" });
+    expect(result).toBe("Hello World!");
+  });
+
+  it("resolves multiple variables", () => {
+    const result = resolveTemplate("{{greeting}} {{name}}!", {
+      greeting: "Hi",
+      name: "Alice",
+    });
+    expect(result).toBe("Hi Alice!");
+  });
+
+  it("keeps unreplaced variables as-is", () => {
+    const result = resolveTemplate("Hello {{name}} and {{other}}!", {
+      name: "World",
+    });
+    expect(result).toBe("Hello World and {{other}}!");
+  });
+
+  it("handles empty variables object", () => {
+    const result = resolveTemplate("Hello {{name}}!", {});
+    expect(result).toBe("Hello {{name}}!");
+  });
+
+  it("handles template without variables", () => {
+    const result = resolveTemplate("Hello World!", { name: "Test" });
+    expect(result).toBe("Hello World!");
+  });
+});
+
+describe("calculateWeightedScore", () => {
+  it("calculates weighted score correctly", () => {
+    const scores = {
+      accuracy: 8,
+      relevance: 9,
+      clarity: 7,
+      completeness: 8,
+      conciseness: 6,
+    };
+
+    const weights = {
+      accuracy: 0.2,
+      relevance: 0.2,
+      clarity: 0.2,
+      completeness: 0.2,
+      conciseness: 0.2,
+    };
+
+    const result = calculateWeightedScore(scores, weights);
+    // (8*0.2 + 9*0.2 + 7*0.2 + 8*0.2 + 6*0.2) = 7.6
+    expect(result).toBe(7.6);
+  });
+
+  it("handles uneven weights", () => {
+    const scores = {
+      accuracy: 10,
+      relevance: 5,
+      clarity: 5,
+      completeness: 5,
+      conciseness: 5,
+    };
+
+    const weights = {
+      accuracy: 0.5,
+      relevance: 0.125,
+      clarity: 0.125,
+      completeness: 0.125,
+      conciseness: 0.125,
+    };
+
+    const result = calculateWeightedScore(scores, weights);
+    // (10*0.5 + 5*0.125 + 5*0.125 + 5*0.125 + 5*0.125) = 7.5
+    expect(result).toBe(7.5);
+  });
+
+  it("uses default weights", () => {
+    const scores = {
+      accuracy: 8,
+      relevance: 8,
+      clarity: 8,
+      completeness: 8,
+      conciseness: 8,
+    };
+
+    const result = calculateWeightedScore(scores, DEFAULT_CRITERIA_WEIGHTS);
+    expect(result).toBe(8);
+  });
+});
+
+describe("getScoreColorClass", () => {
+  it("returns green for high scores (8-10)", () => {
+    expect(getScoreColorClass(8)).toContain("green");
+    expect(getScoreColorClass(9)).toContain("green");
+    expect(getScoreColorClass(10)).toContain("green");
+  });
+
+  it("returns yellow for medium scores (5-7)", () => {
+    expect(getScoreColorClass(5)).toContain("yellow");
+    expect(getScoreColorClass(6)).toContain("yellow");
+    expect(getScoreColorClass(7)).toContain("yellow");
+  });
+
+  it("returns red for low scores (1-4)", () => {
+    expect(getScoreColorClass(1)).toContain("red");
+    expect(getScoreColorClass(4)).toContain("red");
+  });
+});
+
+describe("getScoreBgClass", () => {
+  it("returns green background for high scores", () => {
+    expect(getScoreBgClass(8)).toContain("green");
+  });
+
+  it("returns yellow background for medium scores", () => {
+    expect(getScoreBgClass(6)).toContain("yellow");
+  });
+
+  it("returns red background for low scores", () => {
+    expect(getScoreBgClass(3)).toContain("red");
+  });
+});
+
+describe("getModelById", () => {
+  it("finds OpenAI model", () => {
+    const model = getModelById("gpt-4o");
+    expect(model).toBeDefined();
+    expect(model?.name).toBe("GPT-4o");
+    expect(model?.providerId).toBe("openai");
+  });
+
+  it("finds Anthropic model", () => {
+    const model = getModelById("claude-3-5-sonnet-20241022");
+    expect(model).toBeDefined();
+    expect(model?.name).toBe("Claude 3.5 Sonnet");
+    expect(model?.providerId).toBe("anthropic");
+  });
+
+  it("finds Google model", () => {
+    const model = getModelById("gemini-2.0-flash-exp");
+    expect(model).toBeDefined();
+    expect(model?.name).toBe("Gemini 2.0 Flash");
+    expect(model?.providerId).toBe("google");
+  });
+
+  it("returns undefined for unknown model", () => {
+    const model = getModelById("unknown-model");
+    expect(model).toBeUndefined();
+  });
+});
+
+describe("getProviderById", () => {
+  it("finds OpenAI provider", () => {
+    const provider = getProviderById("openai");
+    expect(provider).toBeDefined();
+    expect(provider?.name).toBe("OpenAI");
+  });
+
+  it("finds Anthropic provider", () => {
+    const provider = getProviderById("anthropic");
+    expect(provider).toBeDefined();
+    expect(provider?.name).toBe("Anthropic");
+  });
+
+  it("finds Google provider", () => {
+    const provider = getProviderById("google");
+    expect(provider).toBeDefined();
+    expect(provider?.name).toBe("Google AI");
+  });
+});
+
+describe("getProviderForModel", () => {
+  it("returns correct provider for model", () => {
+    const provider = getProviderForModel("gpt-4o");
+    expect(provider?.id).toBe("openai");
+  });
+
+  it("returns undefined for unknown model", () => {
+    const provider = getProviderForModel("unknown");
+    expect(provider).toBeUndefined();
+  });
+});
+
+describe("Zod Schemas", () => {
+  describe("ScoreBreakdownSchema", () => {
+    it("validates valid scores", () => {
+      const valid = {
+        accuracy: 8,
+        relevance: 7,
+        clarity: 9,
+        completeness: 6,
+        conciseness: 8,
+      };
+
+      const result = ScoreBreakdownSchema.safeParse(valid);
+      expect(result.success).toBe(true);
+    });
+
+    it("rejects scores below 1", () => {
+      const invalid = {
+        accuracy: 0,
+        relevance: 7,
+        clarity: 9,
+        completeness: 6,
+        conciseness: 8,
+      };
+
+      const result = ScoreBreakdownSchema.safeParse(invalid);
+      expect(result.success).toBe(false);
+    });
+
+    it("rejects scores above 10", () => {
+      const invalid = {
+        accuracy: 11,
+        relevance: 7,
+        clarity: 9,
+        completeness: 6,
+        conciseness: 8,
+      };
+
+      const result = ScoreBreakdownSchema.safeParse(invalid);
+      expect(result.success).toBe(false);
+    });
+
+    it("rejects missing fields", () => {
+      const invalid = {
+        accuracy: 8,
+        relevance: 7,
+      };
+
+      const result = ScoreBreakdownSchema.safeParse(invalid);
+      expect(result.success).toBe(false);
+    });
+  });
+
+  describe("JudgeEvaluationSchema", () => {
+    it("validates valid evaluation", () => {
+      const valid = {
+        scores: {
+          accuracy: 8,
+          relevance: 7,
+          clarity: 9,
+          completeness: 6,
+          conciseness: 8,
+        },
+        overallScore: 7.6,
+        reasoning: "Good response overall.",
+      };
+
+      const result = JudgeEvaluationSchema.safeParse(valid);
+      expect(result.success).toBe(true);
+    });
+
+    it("validates evaluation with winner", () => {
+      const valid = {
+        scores: {
+          accuracy: 8,
+          relevance: 7,
+          clarity: 9,
+          completeness: 6,
+          conciseness: 8,
+        },
+        overallScore: 7.6,
+        reasoning: "Good response overall.",
+        winner: "A",
+      };
+
+      const result = JudgeEvaluationSchema.safeParse(valid);
+      expect(result.success).toBe(true);
+    });
+
+    it("rejects invalid winner value", () => {
+      const invalid = {
+        scores: {
+          accuracy: 8,
+          relevance: 7,
+          clarity: 9,
+          completeness: 6,
+          conciseness: 8,
+        },
+        overallScore: 7.6,
+        reasoning: "Good response overall.",
+        winner: "C",
+      };
+
+      const result = JudgeEvaluationSchema.safeParse(invalid);
+      expect(result.success).toBe(false);
+    });
+
+    it("rejects reasoning over 1000 chars", () => {
+      const invalid = {
+        scores: {
+          accuracy: 8,
+          relevance: 7,
+          clarity: 9,
+          completeness: 6,
+          conciseness: 8,
+        },
+        overallScore: 7.6,
+        reasoning: "a".repeat(1001),
+      };
+
+      const result = JudgeEvaluationSchema.safeParse(invalid);
+      expect(result.success).toBe(false);
+    });
+  });
+});
diff --git a/components/utils/ai-eval-schemas.ts b/components/utils/ai-eval-schemas.ts
new file mode 100644
index 0000000..a34385f
--- /dev/null
+++ b/components/utils/ai-eval-schemas.ts
@@ -0,0 +1,354 @@
+import { z } from "zod";
+
+// ============================================================================
+// Provider & Model Types
+// ============================================================================
+
+export type ProviderId = "openai" | "anthropic" | "google";
+
+export interface ModelConfig {
+  id: string;
+  name: string;
+  providerId: ProviderId;
+  maxTokens: number;
+  supportsJsonMode: boolean;
+}
+
+export interface ProviderConfig {
+  id: ProviderId;
+  name: string;
+  models: ModelConfig[];
+  apiEndpoint: string;
+}
+
+// ============================================================================
+// API Key Management
+// ============================================================================
+
+export interface StoredApiKeys {
+  openai?: string;
+  anthropic?: string;
+  google?: string;
+}
+
+export const StoredApiKeysSchema = z.object({
+  openai: z.string().optional(),
+  anthropic: z.string().optional(),
+  google: z.string().optional(),
+});
+
+// ============================================================================
+// Chat Message Types
+// ============================================================================
+
+export type MessageRole = "system" | "user" | "assistant";
+
+export interface ChatMessage {
+  role: MessageRole;
+  content: string;
+}
+
+export interface ChatParams {
+  model: string; // model id; the providers module uses it to pick the adapter
+  messages: ChatMessage[];
+  maxTokens?: number; // the provider adapters send 4096 when this is omitted
+  temperature?: number; // adapters that forward it send 0.7 when this is omitted
+  jsonMode?: boolean; // read by the OpenAI and Google adapters to request JSON output
+}
+
+export interface ChatResponse {
+  content: string;
+  model: string;
+  usage?: {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+  };
+  finishReason?: string;
+}
+
+// ============================================================================
+// Evaluation Types
+// ============================================================================
+
+export type ComparisonMode = "model-vs-model" | "prompt-vs-prompt";
+
+export interface PromptConfig {
+  id: string;
+  systemPrompt: string;
+  userPrompt: string;
+  variables: Record<string, string>;
+}
+
+export interface EvaluationInput {
+  mode: ComparisonMode;
+  prompts: PromptConfig[];
+  models: ModelConfig[];
+  judgeModel: ModelConfig;
+  criteriaWeights: CriteriaWeights;
+}
+
+export interface EvaluationResult {
+  id: string;
+  promptId: string;
+  modelId: string;
+  input: {
+    systemPrompt: string;
+    userPrompt: string;
+    resolvedPrompt: string;
+  };
+  output: string;
+  evaluation?: JudgeEvaluation;
+  error?: string;
+  latencyMs: number;
+  timestamp: number;
+}
+
+// ============================================================================
+// LLM-as-Judge Types & Schemas
+// ============================================================================
+
+export interface ScoreBreakdown {
+  accuracy: number;
+  relevance: number;
+  clarity: number;
+  completeness: number;
+  conciseness: number;
+}
+
+export interface JudgeEvaluation {
+  scores: ScoreBreakdown;
+  overallScore: number;
+  reasoning: string;
+  winner?: "A" | "B" | "tie";
+}
+
+export interface CriteriaWeights {
+  accuracy: number;
+  relevance: number;
+  clarity: number;
+  completeness: number;
+  conciseness: number;
+}
+
+// Zod schema for validating judge responses
+export const ScoreBreakdownSchema = z.object({
+  accuracy: z.number().min(1).max(10),
+  relevance: z.number().min(1).max(10),
+  clarity: z.number().min(1).max(10),
+  completeness: z.number().min(1).max(10),
+  conciseness: z.number().min(1).max(10),
+});
+
+export const JudgeEvaluationSchema = z.object({
+  scores: ScoreBreakdownSchema,
+  overallScore: z.number().min(1).max(10),
+  reasoning: z.string().max(1000),
+  winner: z.enum(["A", "B", "tie"]).optional(),
+});
+
+// ============================================================================
+// Default Values
+// ============================================================================
+
+export const DEFAULT_CRITERIA_WEIGHTS: CriteriaWeights = {
+  accuracy: 0.25,
+  relevance: 0.25,
+  clarity: 0.2,
+  completeness: 0.15,
+  conciseness: 0.15,
+};
+
+export const DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant.";
+
+export const DEFAULT_USER_PROMPT = "{{question}}";
+
+// ============================================================================
+// Provider Configurations
+// ============================================================================
+
+export const PROVIDERS: ProviderConfig[] = [
+  {
+    id: "openai",
+    name: "OpenAI",
+    apiEndpoint: "https://api.openai.com/v1/chat/completions",
+    models: [
+      {
+        id: "gpt-4o",
+        name: "GPT-4o",
+        providerId: "openai",
+        maxTokens: 4096,
+        supportsJsonMode: true,
+      },
+      {
+        id: "gpt-4o-mini",
+        name: "GPT-4o Mini",
+        providerId: "openai",
+        maxTokens: 4096,
+        supportsJsonMode: true,
+      },
+      {
+        id: "gpt-4-turbo",
+        name: "GPT-4 Turbo",
+        providerId: "openai",
+        maxTokens: 4096,
+        supportsJsonMode: true,
+      },
+      {
+        id: "gpt-3.5-turbo",
+        name: "GPT-3.5 Turbo",
+        providerId: "openai",
+        maxTokens: 4096,
+        supportsJsonMode: true,
+      },
+    ],
+  },
+  {
+    id: "anthropic",
+    name: "Anthropic",
+    apiEndpoint: "https://api.anthropic.com/v1/messages",
+    models: [
+      {
+        id: "claude-3-5-sonnet-20241022",
+        name: "Claude 3.5 Sonnet",
+        providerId: "anthropic",
+        maxTokens: 4096,
+        supportsJsonMode: false,
+      },
+      {
+        id: "claude-3-5-haiku-20241022",
+        name: "Claude 3.5 Haiku",
+        providerId: "anthropic",
+        maxTokens: 4096,
+        supportsJsonMode: false,
+      },
+      {
+        id: "claude-3-opus-20240229",
+        name: "Claude 3 Opus",
+        providerId: "anthropic",
+        maxTokens: 4096,
+        supportsJsonMode: false,
+      },
+    ],
+  },
+  {
+    id: "google",
+    name: "Google AI",
+    apiEndpoint: "https://generativelanguage.googleapis.com/v1beta/models",
+    models: [
+      {
+        id: "gemini-2.0-flash-exp",
+        name: "Gemini 2.0 Flash",
+        providerId: "google",
+        maxTokens: 8192,
+        supportsJsonMode: true,
+      },
+      {
+        id: "gemini-1.5-pro",
+        name: "Gemini 1.5 Pro",
+        providerId: "google",
+        maxTokens: 8192,
+        supportsJsonMode: true,
+      },
+      {
+        id: "gemini-1.5-flash",
+        name: "Gemini 1.5 Flash",
+        providerId: "google",
+        maxTokens: 8192,
+        supportsJsonMode: true,
+      },
+    ],
+  },
+];
+
+// Helper to get all models flat
+export const ALL_MODELS: ModelConfig[] = PROVIDERS.flatMap((p) => p.models);
+
+// Find a model's configuration by id across all providers (undefined if absent)
+export function getModelById(modelId: string): ModelConfig | undefined {
+  return ALL_MODELS.find(({ id }) => id === modelId);
+}
+
+// Find a provider's configuration by id (undefined if absent)
+export function getProviderById(providerId: ProviderId): ProviderConfig | undefined {
+  return PROVIDERS.find(({ id }) => id === providerId);
+}
+
+// Resolve a model id to the configuration of the provider that offers it.
+// Returns undefined when the model id is unknown.
+export function getProviderForModel(modelId: string): ProviderConfig | undefined {
+  const model = getModelById(modelId);
+  return model ? getProviderById(model.providerId) : undefined;
+}
+
+// ============================================================================
+// Variable Extraction
+// ============================================================================
+
+/**
+ * List the distinct `{{name}}` placeholders used in a prompt template.
+ *
+ * Names consist of word characters (letters, digits, underscore) and are
+ * returned in order of first appearance, each only once.
+ * @returns Variable names without the surrounding braces.
+ */
+export function extractVariables(template: string): string[] {
+  const placeholders = template.match(/\{\{\w+\}\}/g) ?? [];
+  // Strip the surrounding braces, then keep only first occurrences.
+  const names = placeholders.map((placeholder) => placeholder.slice(2, -2));
+  return names.filter(
+    (name, index) => names.indexOf(name) === index
+  );
+}
+
+/**
+ * Fill `{{name}}` placeholders from own keys of `variables` (never inherited ones such as `toString`); others stay as-is.
+ */
+export function resolveTemplate(
+  template: string,
+  variables: Record<string, string>
+): string {
+  const owns = (key: string) => Object.prototype.hasOwnProperty.call(variables, key);
+  return template.replace(/\{\{(\w+)\}\}/g, (placeholder, varName: string) =>
+    owns(varName) ? (variables[varName] ?? placeholder) : placeholder);
+}
+
+// ============================================================================
+// Score Utilities
+// ============================================================================
+
+/**
+ * Combine the five criterion scores into one number: the sum of each score
+ * times its weight (weights used as given), rounded to one decimal place.
+ */
+export function calculateWeightedScore(
+  scores: ScoreBreakdown,
+  weights: CriteriaWeights
+): number {
+  const { accuracy, relevance, clarity, completeness, conciseness } = scores;
+  let weighted = accuracy * weights.accuracy;
+  weighted += relevance * weights.relevance;
+  weighted += clarity * weights.clarity;
+  weighted += completeness * weights.completeness;
+  weighted += conciseness * weights.conciseness;
+
+  return Math.round(weighted * 10) / 10;
+}
+
+/** Text colour classes for a score: green from 8 up, yellow from 5 up, red otherwise. */
+export function getScoreColorClass(score: number): string {
+  return score >= 8
+    ? "text-green-600 dark:text-green-400"
+    : score >= 5
+      ? "text-yellow-600 dark:text-yellow-400"
+      : "text-red-600 dark:text-red-400";
+}
+
+/** Background classes for a score: green from 8 up, yellow from 5 up, red otherwise. */
+export function getScoreBgClass(score: number): string {
+  return score >= 8
+    ? "bg-green-100 dark:bg-green-900/30"
+    : score >= 5
+      ? "bg-yellow-100 dark:bg-yellow-900/30"
+      : "bg-red-100 dark:bg-red-900/30";
+}
diff --git a/components/utils/tools-list.ts b/components/utils/tools-list.ts
index aa8ef64..f0055d5 100644
--- a/components/utils/tools-list.ts
+++ b/components/utils/tools-list.ts
@@ -1,4 +1,10 @@
 export const tools = [
+  {
+    title: "AI Eval Playground",
+    description:
+      "Compare AI models and prompts side-by-side with LLM-as-judge scoring. Evaluate GPT-4, Claude, Gemini responses. BYOK - keys stay in your browser.",
+    link: "/utilities/ai-eval",
+  },
   {
     title: "CSV to JSON",
     description:
diff --git a/package-lock.json b/package-lock.json
index 9879cb3..2777321 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -34,7 +34,8 @@
         "react-dom": "^18",
         "react-syntax-highlighter": "^15.5.0",
         "tailwind-merge": "^2.4.0",
-        "tailwindcss-animate": "^1.0.7"
+        "tailwindcss-animate": "^1.0.7",
+        "zod": "^4.3.6"
       },
       "devDependencies": {
         "@testing-library/jest-dom": "^6.4.8",
@@ -13697,6 +13698,15 @@
       "funding": {
         "url": "https://github.com/sponsors/sindresorhus"
       }
+    },
+    "node_modules/zod": {
+      "version": "4.3.6",
+      "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz",
+      "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==",
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/colinhacks"
+      }
     }
   }
 }
diff --git a/package.json b/package.json
index a6b44bd..408f439 100644
--- a/package.json
+++ b/package.json
@@ -38,7 +38,8 @@
     "react-dom": "^18",
     "react-syntax-highlighter": "^15.5.0",
     "tailwind-merge": "^2.4.0",
-    "tailwindcss-animate": "^1.0.7"
+    "tailwindcss-animate": "^1.0.7",
+    "zod": "^4.3.6"
   },
   "devDependencies": {
     "@testing-library/jest-dom": "^6.4.8",
diff --git a/pages/utilities/ai-eval.tsx b/pages/utilities/ai-eval.tsx
new file mode 100644
index 0000000..8126214
--- /dev/null
+++ b/pages/utilities/ai-eval.tsx
@@ -0,0 +1,460 @@
+import { useState, useCallback, useMemo } from "react";
+import PageHeader from "@/components/PageHeader";
+import { Card } from "@/components/ds/CardComponent";
+import { Button } from "@/components/ds/ButtonComponent";
+import Header from "@/components/Header";
+import { CMDK } from "@/components/CMDK";
+import Meta from "@/components/Meta";
+import CallToActionGrid from "@/components/CallToActionGrid";
+import {
+  Tabs,
+  TabsList,
+  TabsTrigger,
+} from "@/components/ds/TabsComponent";
+import {
+  ComparisonMode,
+  ModelConfig,
+  CriteriaWeights,
+  DEFAULT_CRITERIA_WEIGHTS,
+  JudgeEvaluation,
+  resolveTemplate,
+} from "@/components/utils/ai-eval-schemas";
+import { chat } from "@/components/utils/ai-eval-providers";
+import { judgeSingleResponse, judgeCompareResponses } from "@/components/utils/ai-eval-judge";
+import { useApiKeys } from "@/components/hooks/useApiKeys";
+import { ApiKeyDialog } from "@/components/ai-eval/ApiKeyDialog";
+import { EvalConfigPanel } from "@/components/ai-eval/EvalConfigPanel";
+import { EvalMultiModelSelector } from "@/components/ai-eval/EvalModelSelector";
+import { EvalJudgePanel } from "@/components/ai-eval/EvalJudgePanel";
+import { EvalComparisonGrid, ComparisonSummary } from "@/components/ai-eval/EvalComparisonGrid";
+import { Play, Loader2 } from "lucide-react";
+
+interface PromptVariant {
+  id: string;
+  systemPrompt: string;
+  userPrompt: string;
+}
+
+interface ResultData {
+  id: string;
+  modelId: string;
+  promptId: string;
+  output: string | null;
+  evaluation: JudgeEvaluation | null;
+  isLoading: boolean;
+  error: string | null;
+  latencyMs?: number;
+}
+
+export default function AIEval() {
+  const apiKeys = useApiKeys();
+
+  // Mode toggle
+  const [mode, setMode] = useState<ComparisonMode>("model-vs-model");
+
+  // Prompt configuration
+  const [prompts, setPrompts] = useState<PromptVariant[]>([]);
+  const [variables, setVariables] = useState<Record<string, string>>({});
+
+  // Model selection
+  const [selectedModels, setSelectedModels] = useState<ModelConfig[]>([]);
+
+  // Judge settings
+  const [judgeModel, setJudgeModel] = useState<ModelConfig | null>(null);
+  const [criteriaWeights, setCriteriaWeights] = useState<CriteriaWeights>(
+    DEFAULT_CRITERIA_WEIGHTS
+  );
+  const [autoEvaluate, setAutoEvaluate] = useState(true);
+
+  // Results state
+  const [results, setResults] = useState<ResultData[]>([]);
+  const [isRunning, setIsRunning] = useState(false);
+  const [comparisonReasoning, setComparisonReasoning] = useState<string>("");
+
+  // Determine winners based on evaluation scores
+  const winnerIds = useMemo(() => {
+    const completedResults = results.filter(
+      (r) => r.evaluation && !r.isLoading && !r.error
+    );
+
+    if (completedResults.length < 2) return [];
+
+    const maxScore = Math.max(
+      ...completedResults.map((r) => r.evaluation?.overallScore ?? 0)
+    );
+
+    // Consider it a winner if within 0.5 of max score (accounting for ties)
+    return completedResults
+      .filter((r) => (r.evaluation?.overallScore ?? 0) >= maxScore - 0.5)
+      .map((r) => r.id);
+  }, [results]);
+
+  // Validate if we can run evaluation
+  const canRun = useMemo(() => {
+    if (isRunning) return false;
+    if (prompts.length === 0) return false;
+
+    if (mode === "model-vs-model") {
+      if (selectedModels.length < 2) return false;
+      // Check if we have API keys for all selected models
+      for (const model of selectedModels) {
+        if (!apiKeys.hasKey(model.providerId)) return false;
+      }
+    } else {
+      if (selectedModels.length < 1) return false;
+      if (prompts.length < 2) return false;
+      if (!apiKeys.hasKey(selectedModels[0].providerId)) return false;
+    }
+
+    return true;
+  }, [isRunning, prompts, mode, selectedModels, apiKeys]);
+
+  // Run evaluation
+  const runEvaluation = useCallback(async () => {
+    if (!canRun) return;
+
+    setIsRunning(true);
+    setComparisonReasoning("");
+
+    // Initialize results
+    const initialResults: ResultData[] = [];
+
+    if (mode === "model-vs-model") {
+      // One result per model
+      for (const model of selectedModels) {
+        initialResults.push({
+          id: `${model.id}-${prompts[0].id}`,
+          modelId: model.id,
+          promptId: prompts[0].id,
+          output: null,
+          evaluation: null,
+          isLoading: true,
+          error: null,
+        });
+      }
+    } else {
+      // One result per prompt
+      for (const prompt of prompts) {
+        initialResults.push({
+          id: `${selectedModels[0].id}-${prompt.id}`,
+          modelId: selectedModels[0].id,
+          promptId: prompt.id,
+          output: null,
+          evaluation: null,
+          isLoading: true,
+          error: null,
+        });
+      }
+    }
+
+    setResults(initialResults);
+
+    // Run generations
+    const generationResults: ResultData[] = [];
+
+    for (const result of initialResults) {
+      const model = selectedModels.find((m) => m.id === result.modelId);
+      const prompt = prompts.find((p) => p.id === result.promptId);
+
+      if (!model || !prompt) continue;
+
+      const apiKey = apiKeys.getKey(model.providerId);
+      if (!apiKey) {
+        generationResults.push({
+          ...result,
+          isLoading: false,
+          error: `No API key for ${model.providerId}`,
+        });
+        continue;
+      }
+
+      const resolvedUserPrompt = resolveTemplate(prompt.userPrompt, variables);
+      const startTime = Date.now();
+
+      try {
+        const response = await chat(apiKey, {
+          model: model.id,
+          messages: [
+            { role: "system", content: prompt.systemPrompt },
+            { role: "user", content: resolvedUserPrompt },
+          ],
+        });
+
+        const latencyMs = Date.now() - startTime;
+
+        generationResults.push({
+          ...result,
+          output: response.content,
+          isLoading: false,
+          latencyMs,
+        });
+
+        // Update UI with generation result
+        setResults((prev) =>
+          prev.map((r) =>
+            r.id === result.id
+              ? { ...r, output: response.content, isLoading: false, latencyMs }
+              : r
+          )
+        );
+      } catch (error) {
+        generationResults.push({
+          ...result,
+          isLoading: false,
+          error: error instanceof Error ? error.message : "Generation failed",
+        });
+
+        setResults((prev) =>
+          prev.map((r) =>
+            r.id === result.id
+              ? {
+                  ...r,
+                  isLoading: false,
+                  error:
+                    error instanceof Error
+                      ? error.message
+                      : "Generation failed",
+                }
+              : r
+          )
+        );
+      }
+    }
+
+    // Run evaluation if auto-evaluate is on and we have a judge model
+    if (autoEvaluate && judgeModel) {
+      const judgeApiKey = apiKeys.getKey(judgeModel.providerId);
+
+      if (judgeApiKey) {
+        const successfulResults = generationResults.filter(
+          (r) => r.output && !r.error
+        );
+
+        if (successfulResults.length >= 2) {
+          // Use pairwise comparison for 2 results
+          try {
+            const prompt = prompts[0];
+            const resolvedPrompt = resolveTemplate(prompt.userPrompt, variables);
+            const fullPrompt = `${prompt.systemPrompt}\n\n${resolvedPrompt}`;
+
+            const comparison = await judgeCompareResponses({
+              apiKey: judgeApiKey,
+              judgeModel: judgeModel.id,
+              originalPrompt: fullPrompt,
+              responseA: successfulResults[0].output!,
+              responseB: successfulResults[1].output!,
+              weights: criteriaWeights,
+            });
+
+            setComparisonReasoning(comparison.comparisonReasoning);
+
+            // Update results with evaluations
+            setResults((prev) =>
+              prev.map((r) => {
+                if (r.id === successfulResults[0].id) {
+                  return { ...r, evaluation: comparison.evaluationA };
+                }
+                if (r.id === successfulResults[1].id) {
+                  return { ...r, evaluation: comparison.evaluationB };
+                }
+                return r;
+              })
+            );
+
+            // Evaluate remaining results individually
+            for (let i = 2; i < successfulResults.length; i++) {
+              const result = successfulResults[i];
+              try {
+                const evaluation = await judgeSingleResponse({
+                  apiKey: judgeApiKey,
+                  judgeModel: judgeModel.id,
+                  originalPrompt: fullPrompt,
+                  response: result.output!,
+                  weights: criteriaWeights,
+                });
+
+                setResults((prev) =>
+                  prev.map((r) =>
+                    r.id === result.id ? { ...r, evaluation } : r
+                  )
+                );
+              } catch (error) {
+                console.error("Judge evaluation failed:", error);
+              }
+            }
+          } catch (error) {
+            console.error("Comparison failed:", error);
+          }
+        } else if (successfulResults.length === 1) {
+          // Single result, evaluate individually
+          const result = successfulResults[0];
+          const prompt = prompts.find((p) => p.id === result.promptId);
+          if (prompt) {
+            try {
+              const resolvedPrompt = resolveTemplate(prompt.userPrompt, variables);
+              const fullPrompt = `${prompt.systemPrompt}\n\n${resolvedPrompt}`;
+
+              const evaluation = await judgeSingleResponse({
+                apiKey: judgeApiKey,
+                judgeModel: judgeModel.id,
+                originalPrompt: fullPrompt,
+                response: result.output!,
+                weights: criteriaWeights,
+              });
+
+              setResults((prev) =>
+                prev.map((r) =>
+                  r.id === result.id ? { ...r, evaluation } : r
+                )
+              );
+            } catch (error) {
+              console.error("Judge evaluation failed:", error);
+            }
+          }
+        }
+      }
+    }
+
+    setIsRunning(false);
+  }, [
+    canRun,
+    mode,
+    selectedModels,
+    prompts,
+    variables,
+    apiKeys,
+    autoEvaluate,
+    judgeModel,
+    criteriaWeights,
+  ]);
+
+  return (
+    <main>
+      <Meta
+        title="AI Eval Playground | Compare Models & Prompts | Free & Open Source"
+        description="Compare AI models and prompts side-by-side with LLM-as-judge scoring. Evaluate GPT-4, Claude, Gemini responses with automated quality metrics. BYOK - your keys stay in your browser."
+      />
+      <Header />
+      <CMDK />
+
+      <section className="container max-w-6xl mb-12">
+        <PageHeader
+          title="AI Eval Playground"
+          description="Compare prompts and models side-by-side with LLM-as-judge scoring"
+        />
+      </section>
+
+      {/* Mode Toggle & API Keys */}
+      <section className="container max-w-6xl mb-6">
+        <Card className="p-4 hover:shadow-none shadow-none rounded-xl">
+          <div className="flex items-center justify-between">
+            <Tabs
+              value={mode}
+              onValueChange={(v) => setMode(v as ComparisonMode)}
+            >
+              <TabsList>
+                <TabsTrigger value="model-vs-model">Model vs Model</TabsTrigger>
+                <TabsTrigger value="prompt-vs-prompt">
+                  Prompt vs Prompt
+                </TabsTrigger>
+              </TabsList>
+            </Tabs>
+
+            <ApiKeyDialog apiKeys={apiKeys} />
+          </div>
+        </Card>
+      </section>
+
+      {/* Configuration Grid */}
+      <section className="container max-w-6xl mb-6">
+        <div className="grid grid-cols-1 lg:grid-cols-3 gap-6">
+          {/* Prompt Configuration - Takes 2 columns */}
+          <div className="lg:col-span-2">
+            <EvalConfigPanel
+              mode={mode}
+              onPromptsChange={setPrompts}
+              onVariablesChange={setVariables}
+            />
+          </div>
+
+          {/* Judge Settings - Takes 1 column */}
+          <div>
+            <EvalJudgePanel
+              judgeModel={judgeModel}
+              onJudgeModelChange={setJudgeModel}
+              comparedModelIds={selectedModels.map((m) => m.id)}
+              weights={criteriaWeights}
+              onWeightsChange={setCriteriaWeights}
+              autoEvaluate={autoEvaluate}
+              onAutoEvaluateChange={setAutoEvaluate}
+              apiKeys={apiKeys}
+            />
+          </div>
+        </div>
+      </section>
+
+      {/* Model Selection */}
+      <section className="container max-w-6xl mb-6">
+        <Card className="p-6 hover:shadow-none shadow-none rounded-xl">
+          <EvalMultiModelSelector
+            values={selectedModels}
+            onChange={setSelectedModels}
+            apiKeys={apiKeys}
+            label={
+              mode === "model-vs-model"
+                ? "Select Models to Compare"
+                : "Select Model"
+            }
+            maxSelections={mode === "model-vs-model" ? 4 : 1}
+          />
+        </Card>
+      </section>
+
+      {/* Run Button */}
+      <section className="container max-w-6xl mb-6">
+        <Button
+          onClick={runEvaluation}
+          disabled={!canRun}
+          size="lg"
+          className="w-full gap-2"
+        >
+          {isRunning ? (
+            <>
+              <Loader2 className="h-5 w-5 animate-spin" />
+              Running Evaluation...
+            </>
+          ) : (
+            <>
+              <Play className="h-5 w-5" />
+              Run Evaluation
+            </>
+          )}
+        </Button>
+      </section>
+
+      {/* Results Grid */}
+      {results.length > 0 && (
+        <section className="container max-w-6xl mb-6">
+          <EvalComparisonGrid
+            mode={mode}
+            models={selectedModels}
+            results={results}
+            winnerIds={winnerIds}
+          />
+
+          {comparisonReasoning && (
+            <div className="mt-4">
+              <ComparisonSummary
+                results={results}
+                winnerIds={winnerIds}
+                comparisonReasoning={comparisonReasoning}
+              />
+            </div>
+          )}
+        </section>
+      )}
+
+      <CallToActionGrid />
+    </main>
+  );
+}

From 10604bf4de1fe37c4150c62d553e81515add11a4 Mon Sep 17 00:00:00 2001
From: Berk Durmus <berkdurmus@yahoo.com>
Date: Sun, 25 Jan 2026 22:44:45 +0300
Subject: [PATCH 2/2] feat(evals): add AI Eval Playground for comparing models
 and prompts #2

---
 PR_DESCRIPTION.md                         |  67 +++
 components/ai-eval/ApiKeyDialog.tsx       | 123 ++---
 components/ai-eval/EvalComparisonGrid.tsx | 154 ------
 components/ai-eval/EvalConfigPanel.tsx    | 265 -----------
 components/ai-eval/EvalJudgeConfig.tsx    | 181 ++++++++
 components/ai-eval/EvalJudgePanel.tsx     | 186 --------
 components/ai-eval/EvalModelSelector.tsx  | 190 ++------
 components/ai-eval/EvalResultCard.tsx     | 169 +++++++
 components/ai-eval/EvalResultCell.tsx     | 149 ------
 components/ai-eval/EvalScoreDisplay.tsx   | 151 ------
 pages/utilities/ai-eval.tsx               | 543 +++++++++++++---------
 11 files changed, 818 insertions(+), 1360 deletions(-)
 create mode 100644 PR_DESCRIPTION.md
 delete mode 100644 components/ai-eval/EvalComparisonGrid.tsx
 delete mode 100644 components/ai-eval/EvalConfigPanel.tsx
 create mode 100644 components/ai-eval/EvalJudgeConfig.tsx
 delete mode 100644 components/ai-eval/EvalJudgePanel.tsx
 create mode 100644 components/ai-eval/EvalResultCard.tsx
 delete mode 100644 components/ai-eval/EvalResultCell.tsx
 delete mode 100644 components/ai-eval/EvalScoreDisplay.tsx

diff --git a/PR_DESCRIPTION.md b/PR_DESCRIPTION.md
new file mode 100644
index 0000000..7f0e881
--- /dev/null
+++ b/PR_DESCRIPTION.md
@@ -0,0 +1,67 @@
+## Summary
+
+- Add new AI Eval Playground utility for comparing AI model outputs and prompts
+- Implement BYOK (Bring Your Own Key) support for OpenAI, Anthropic, and Google AI
+- Build LLM-as-judge scoring system with configurable criteria weights
+- Create clean, table-based comparison UI following Linear.app design patterns
+
+## Features
+
+### Comparison Modes
+- **Model vs Model**: Compare 2-4 models with the same prompt
+- **Prompt vs Prompt**: Compare 2-4 prompt variations with the same model
+
+### Supported Providers
+| Provider | Models |
+|----------|--------|
+| OpenAI | GPT-4o, GPT-4o Mini, GPT-4 Turbo, GPT-3.5 Turbo |
+| Anthropic | Claude 3.5 Sonnet, Claude 3.5 Haiku, Claude 3 Opus |
+| Google AI | Gemini 2.0 Flash, Gemini 1.5 Pro, Gemini 1.5 Flash |
+
+### LLM-as-Judge Scoring
+- 5 evaluation criteria: Accuracy, Relevance, Clarity, Completeness, Conciseness
+- Adjustable weight sliders for custom scoring emphasis
+- Pairwise comparison with winner detection
+- Visual score badges and breakdown bars
+
+### Security
+- API keys are stored in `sessionStorage` only, so they are cleared when the tab or browser window is closed
+- All processing happens client-side
+- Keys never leave the browser
+
+## Files Added
+
+```
+components/
+├── ai-eval/
+│   ├── ApiKeyDialog.tsx
+│   ├── EvalJudgeConfig.tsx
+│   ├── EvalModelSelector.tsx
+│   └── EvalResultCard.tsx
+├── hooks/
+│   └── useApiKeys.ts
+└── utils/
+    ├── ai-eval-judge.ts
+    ├── ai-eval-providers.ts
+    ├── ai-eval-schemas.ts
+    └── ai-eval-schemas.test.ts
+
+pages/utilities/
+└── ai-eval.tsx
+```
+
+Existing files modified: `components/utils/tools-list.ts` (registers the tool),
+`package.json` and `package-lock.json` (add the `zod` dependency).
+
+## Screenshots
+
+<!-- Add screenshots here -->
+
+## Test Plan
+
+- [x] Unit tests for schema validation and utility functions (37 tests passing)
+- [x] Build passes with no TypeScript errors
+- [ ] Manual testing with real API keys for each provider
+- [ ] Verify API key dialog saves/clears correctly
+- [ ] Test both comparison modes with multiple models
+- [ ] Verify judge scoring produces valid results
diff --git a/components/ai-eval/ApiKeyDialog.tsx b/components/ai-eval/ApiKeyDialog.tsx
index d68c2ed..2bffe9a 100644
--- a/components/ai-eval/ApiKeyDialog.tsx
+++ b/components/ai-eval/ApiKeyDialog.tsx
@@ -8,28 +8,23 @@ import {
   DialogTrigger,
 } from "@/components/ds/DialogComponent";
 import { Button } from "@/components/ds/ButtonComponent";
-import { Input } from "@/components/ds/InputComponent";
-import { Label } from "@/components/ds/LabelComponent";
 import { PROVIDERS, ProviderId } from "@/components/utils/ai-eval-schemas";
 import { UseApiKeysReturn } from "@/components/hooks/useApiKeys";
-import { Key, Check, X, Loader2, Eye, EyeOff } from "lucide-react";
+import { Key, Check, X, Loader2, Eye, EyeOff, ExternalLink } from "lucide-react";
 
 interface ApiKeyDialogProps {
   apiKeys: UseApiKeysReturn;
   children?: React.ReactNode;
 }
 
-interface ProviderKeyInputProps {
+interface ProviderKeyRowProps {
   providerId: ProviderId;
   providerName: string;
   apiKeys: UseApiKeysReturn;
+  keyUrl: string;
 }
 
-function ProviderKeyInput({
-  providerId,
-  providerName,
-  apiKeys,
-}: ProviderKeyInputProps) {
+function ProviderKeyRow({ providerId, providerName, apiKeys, keyUrl }: ProviderKeyRowProps) {
   const [value, setValue] = useState(apiKeys.getKey(providerId) || "");
   const [showKey, setShowKey] = useState(false);
   const [testing, setTesting] = useState(false);
@@ -52,13 +47,9 @@ function ProviderKeyInput({
 
   const handleTest = useCallback(async () => {
     if (!value.trim()) return;
-
-    // First save the key
     apiKeys.setKey(providerId, value.trim());
-
     setTesting(true);
     setTestResult(null);
-
     try {
       const result = await apiKeys.testKey(providerId);
       setTestResult(result);
@@ -70,34 +61,43 @@ function ProviderKeyInput({
   }, [apiKeys, providerId, value]);
 
   return (
-    <div className="space-y-2">
-      <div className="flex items-center justify-between">
-        <Label className="text-sm font-medium">{providerName}</Label>
-        <div className="flex items-center gap-1">
+    <div className="py-4 border-b border-border last:border-b-0">
+      <div className="flex items-center justify-between mb-3">
+        <div className="flex items-center gap-3">
+          <span className="text-sm font-medium">{providerName}</span>
           {hasKey && testResult === null && (
-            <span className="text-xs text-muted-foreground flex items-center gap-1">
-              <Check className="h-3 w-3 text-green-500" />
+            <span className="flex items-center gap-1 text-xs text-green-600 dark:text-green-400">
+              <Check className="h-3 w-3" />
               Configured
             </span>
           )}
           {testResult === true && (
-            <span className="text-xs text-green-600 dark:text-green-400 flex items-center gap-1">
+            <span className="flex items-center gap-1 text-xs text-green-600 dark:text-green-400">
               <Check className="h-3 w-3" />
               Valid
             </span>
           )}
           {testResult === false && (
-            <span className="text-xs text-red-600 dark:text-red-400 flex items-center gap-1">
+            <span className="flex items-center gap-1 text-xs text-red-600 dark:text-red-400">
               <X className="h-3 w-3" />
               Invalid
             </span>
           )}
         </div>
+        <a
+          href={keyUrl}
+          target="_blank"
+          rel="noopener noreferrer"
+          className="flex items-center gap-1 text-xs text-muted-foreground hover:text-foreground transition-colors"
+        >
+          Get key
+          <ExternalLink className="h-3 w-3" />
+        </a>
       </div>
 
       <div className="flex gap-2">
         <div className="relative flex-1">
-          <Input
+          <input
             type={showKey ? "text" : "password"}
             placeholder={`Enter ${providerName} API key`}
             value={value}
@@ -106,18 +106,14 @@ function ProviderKeyInput({
               setTestResult(null);
             }}
             onBlur={handleSave}
-            className="pr-10 font-mono text-xs"
+            className="w-full h-10 pl-3 pr-10 rounded-lg border border-input bg-background text-sm font-mono focus:outline-none focus:ring-2 focus:ring-ring"
           />
           <button
             type="button"
             onClick={() => setShowKey(!showKey)}
-            className="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground transition-colors"
+            className="absolute right-3 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground transition-colors"
           >
-            {showKey ? (
-              <EyeOff className="h-4 w-4" />
-            ) : (
-              <Eye className="h-4 w-4" />
-            )}
+            {showKey ? <EyeOff className="h-4 w-4" /> : <Eye className="h-4 w-4" />}
           </button>
         </div>
 
@@ -126,13 +122,9 @@ function ProviderKeyInput({
           size="sm"
           onClick={handleTest}
           disabled={!value.trim() || testing}
-          className="shrink-0"
+          className="h-10 px-4"
         >
-          {testing ? (
-            <Loader2 className="h-4 w-4 animate-spin" />
-          ) : (
-            "Test"
-          )}
+          {testing ? <Loader2 className="h-4 w-4 animate-spin" /> : "Test"}
         </Button>
 
         {hasKey && (
@@ -140,7 +132,7 @@ function ProviderKeyInput({
             variant="outline"
             size="sm"
             onClick={handleRemove}
-            className="shrink-0 text-red-600 hover:text-red-700 hover:bg-red-50 dark:hover:bg-red-950"
+            className="h-10 px-4 text-red-600 hover:text-red-700 hover:bg-red-50 dark:hover:bg-red-950"
           >
             Remove
           </Button>
@@ -150,20 +142,24 @@ function ProviderKeyInput({
   );
 }
 
+const PROVIDER_KEY_URLS: Record<ProviderId, string> = {
+  openai: "https://platform.openai.com/api-keys",
+  anthropic: "https://console.anthropic.com/settings/keys",
+  google: "https://aistudio.google.com/app/apikey",
+};
+
 export function ApiKeyDialog({ apiKeys, children }: ApiKeyDialogProps) {
-  const configuredCount = PROVIDERS.filter((p) =>
-    apiKeys.hasKey(p.id)
-  ).length;
+  const configuredCount = PROVIDERS.filter((p) => apiKeys.hasKey(p.id)).length;
 
   return (
     <Dialog>
       <DialogTrigger asChild>
         {children || (
-          <Button variant="outline" size="sm" className="gap-2">
+          <Button variant="outline" className="gap-2 h-10">
             <Key className="h-4 w-4" />
-            API Keys
+            <span>API Keys</span>
             {configuredCount > 0 && (
-              <span className="ml-1 bg-green-100 dark:bg-green-900/50 text-green-700 dark:text-green-300 text-xs px-1.5 py-0.5 rounded-full">
+              <span className="ml-1 flex items-center justify-center w-5 h-5 text-xs font-medium rounded-full bg-green-100 dark:bg-green-900/50 text-green-700 dark:text-green-300">
                 {configuredCount}
               </span>
             )}
@@ -171,60 +167,29 @@ export function ApiKeyDialog({ apiKeys, children }: ApiKeyDialogProps) {
         )}
       </DialogTrigger>
 
-      <DialogContent className="sm:max-w-md">
+      <DialogContent className="sm:max-w-lg">
         <DialogHeader>
           <DialogTitle className="flex items-center gap-2">
             <Key className="h-5 w-5" />
             API Keys
           </DialogTitle>
           <DialogDescription>
-            Your API keys are stored in session storage and cleared when you
-            close the browser. Keys never leave your browser.
+            Keys are stored in session storage and cleared when you close the browser.
+            Your keys never leave your browser.
           </DialogDescription>
         </DialogHeader>
 
-        <div className="space-y-6 mt-4">
+        <div className="mt-2">
           {PROVIDERS.map((provider) => (
-            <ProviderKeyInput
+            <ProviderKeyRow
               key={provider.id}
               providerId={provider.id}
               providerName={provider.name}
               apiKeys={apiKeys}
+              keyUrl={PROVIDER_KEY_URLS[provider.id]}
             />
           ))}
         </div>
-
-        <div className="mt-6 pt-4 border-t border-border">
-          <p className="text-xs text-muted-foreground">
-            Need API keys?{" "}
-            <a
-              href="https://platform.openai.com/api-keys"
-              target="_blank"
-              rel="noopener noreferrer"
-              className="text-primary hover:underline"
-            >
-              OpenAI
-            </a>{" "}
-            ·{" "}
-            <a
-              href="https://console.anthropic.com/settings/keys"
-              target="_blank"
-              rel="noopener noreferrer"
-              className="text-primary hover:underline"
-            >
-              Anthropic
-            </a>{" "}
-            ·{" "}
-            <a
-              href="https://aistudio.google.com/app/apikey"
-              target="_blank"
-              rel="noopener noreferrer"
-              className="text-primary hover:underline"
-            >
-              Google AI
-            </a>
-          </p>
-        </div>
       </DialogContent>
     </Dialog>
   );
diff --git a/components/ai-eval/EvalComparisonGrid.tsx b/components/ai-eval/EvalComparisonGrid.tsx
deleted file mode 100644
index ca4ef7b..0000000
--- a/components/ai-eval/EvalComparisonGrid.tsx
+++ /dev/null
@@ -1,154 +0,0 @@
-import {
-  ModelConfig,
-  JudgeEvaluation,
-  ComparisonMode,
-} from "@/components/utils/ai-eval-schemas";
-import { EvalResultCell } from "./EvalResultCell";
-
-interface ResultData {
-  id: string;
-  modelId: string;
-  promptId: string;
-  output: string | null;
-  evaluation: JudgeEvaluation | null;
-  isLoading: boolean;
-  error: string | null;
-  latencyMs?: number;
-}
-
-interface EvalComparisonGridProps {
-  mode: ComparisonMode;
-  models: ModelConfig[];
-  results: ResultData[];
-  winnerIds: string[];
-}
-
-export function EvalComparisonGrid({
-  mode,
-  models,
-  results,
-  winnerIds,
-}: EvalComparisonGridProps) {
-  // Determine grid columns based on number of items
-  const gridCols =
-    models.length === 2
-      ? "grid-cols-2"
-      : models.length === 3
-        ? "grid-cols-3"
-        : "grid-cols-2 lg:grid-cols-4";
-
-  if (mode === "model-vs-model") {
-    // In model-vs-model mode, we show one column per model
-    return (
-      <div className={`grid ${gridCols} gap-4`}>
-        {models.map((model, index) => {
-          const result = results.find((r) => r.modelId === model.id);
-          const label = `Model ${String.fromCharCode(65 + index)}`;
-
-          return (
-            <EvalResultCell
-              key={model.id}
-              label={label}
-              model={model}
-              output={result?.output ?? null}
-              evaluation={result?.evaluation ?? null}
-              isLoading={result?.isLoading ?? false}
-              error={result?.error ?? null}
-              isWinner={result ? winnerIds.includes(result.id) : false}
-              latencyMs={result?.latencyMs}
-            />
-          );
-        })}
-      </div>
-    );
-  }
-
-  // In prompt-vs-prompt mode, we show one column per prompt variant
-  // Group results by promptId
-  const promptIds = Array.from(new Set(results.map((r) => r.promptId)));
-  const model = models[0]; // Single model in prompt-vs-prompt mode
-
-  return (
-    <div className={`grid ${promptIds.length === 2 ? "grid-cols-2" : promptIds.length === 3 ? "grid-cols-3" : "grid-cols-2 lg:grid-cols-4"} gap-4`}>
-      {promptIds.map((promptId, index) => {
-        const result = results.find((r) => r.promptId === promptId);
-        const label = `Prompt ${String.fromCharCode(65 + index)}`;
-
-        return (
-          <EvalResultCell
-            key={promptId}
-            label={label}
-            model={model}
-            output={result?.output ?? null}
-            evaluation={result?.evaluation ?? null}
-            isLoading={result?.isLoading ?? false}
-            error={result?.error ?? null}
-            isWinner={result ? winnerIds.includes(result.id) : false}
-            latencyMs={result?.latencyMs}
-          />
-        );
-      })}
-    </div>
-  );
-}
-
-// Summary component for showing overall comparison results
-interface ComparisonSummaryProps {
-  results: ResultData[];
-  winnerIds: string[];
-  comparisonReasoning?: string;
-}
-
-export function ComparisonSummary({
-  results,
-  winnerIds,
-  comparisonReasoning,
-}: ComparisonSummaryProps) {
-  const completedResults = results.filter(
-    (r) => r.output && r.evaluation && !r.isLoading
-  );
-
-  if (completedResults.length < 2) {
-    return null;
-  }
-
-  // Sort by overall score
-  const sorted = [...completedResults].sort(
-    (a, b) =>
-      (b.evaluation?.overallScore ?? 0) - (a.evaluation?.overallScore ?? 0)
-  );
-
-  return (
-    <div className="bg-muted/50 rounded-xl p-4 space-y-3">
-      <h3 className="text-sm font-semibold">Comparison Summary</h3>
-
-      <div className="flex items-center gap-4 text-sm">
-        {sorted.map((result, index) => {
-          const isWinner = winnerIds.includes(result.id);
-          return (
-            <div
-              key={result.id}
-              className={`flex items-center gap-2 ${
-                isWinner
-                  ? "text-green-600 dark:text-green-400 font-medium"
-                  : "text-muted-foreground"
-              }`}
-            >
-              <span className="font-mono">#{index + 1}</span>
-              <span>{result.modelId}</span>
-              <span className="font-semibold">
-                {result.evaluation?.overallScore.toFixed(1)}
-              </span>
-            </div>
-          );
-        })}
-      </div>
-
-      {comparisonReasoning && (
-        <p className="text-xs text-muted-foreground border-t border-border pt-3">
-          {comparisonReasoning}
-        </p>
-      )}
-    </div>
-  );
-}
diff --git a/components/ai-eval/EvalConfigPanel.tsx b/components/ai-eval/EvalConfigPanel.tsx
deleted file mode 100644
index d36ee88..0000000
--- a/components/ai-eval/EvalConfigPanel.tsx
+++ /dev/null
@@ -1,265 +0,0 @@
-import { useState, useEffect, useCallback, useMemo } from "react";
-import { Card } from "@/components/ds/CardComponent";
-import { Label } from "@/components/ds/LabelComponent";
-import { Textarea } from "@/components/ds/TextareaComponent";
-import { Input } from "@/components/ds/InputComponent";
-import {
-  extractVariables,
-  DEFAULT_SYSTEM_PROMPT,
-  DEFAULT_USER_PROMPT,
-  ComparisonMode,
-} from "@/components/utils/ai-eval-schemas";
-import { Plus, Trash2 } from "lucide-react";
-import { Button } from "@/components/ds/ButtonComponent";
-
-interface PromptVariant {
-  id: string;
-  systemPrompt: string;
-  userPrompt: string;
-}
-
-interface EvalConfigPanelProps {
-  mode: ComparisonMode;
-  onPromptsChange: (prompts: PromptVariant[]) => void;
-  onVariablesChange: (variables: Record<string, string>) => void;
-}
-
-export function EvalConfigPanel({
-  mode,
-  onPromptsChange,
-  onVariablesChange,
-}: EvalConfigPanelProps) {
-  // For model-vs-model mode, we have a single prompt
-  // For prompt-vs-prompt mode, we have multiple prompt variants
-  const [prompts, setPrompts] = useState<PromptVariant[]>([
-    {
-      id: "prompt-1",
-      systemPrompt: DEFAULT_SYSTEM_PROMPT,
-      userPrompt: DEFAULT_USER_PROMPT,
-    },
-  ]);
-
-  const [variables, setVariables] = useState<Record<string, string>>({
-    question: "",
-  });
-
-  // Extract all variables from all prompts
-  const allVariables = useMemo(() => {
-    const vars = new Set<string>();
-    prompts.forEach((p) => {
-      extractVariables(p.systemPrompt).forEach((v) => vars.add(v));
-      extractVariables(p.userPrompt).forEach((v) => vars.add(v));
-    });
-    return Array.from(vars);
-  }, [prompts]);
-
-  // Update variables state when new variables are detected
-  useEffect(() => {
-    setVariables((prev) => {
-      const next: Record<string, string> = {};
-      allVariables.forEach((v) => {
-        next[v] = prev[v] ?? "";
-      });
-      return next;
-    });
-  }, [allVariables]);
-
-  // Notify parent of changes
-  useEffect(() => {
-    onPromptsChange(prompts);
-  }, [prompts, onPromptsChange]);
-
-  useEffect(() => {
-    onVariablesChange(variables);
-  }, [variables, onVariablesChange]);
-
-  const updatePrompt = useCallback(
-    (id: string, field: "systemPrompt" | "userPrompt", value: string) => {
-      setPrompts((prev) =>
-        prev.map((p) => (p.id === id ? { ...p, [field]: value } : p))
-      );
-    },
-    []
-  );
-
-  const addPromptVariant = useCallback(() => {
-    const newId = `prompt-${Date.now()}`;
-    setPrompts((prev) => [
-      ...prev,
-      {
-        id: newId,
-        systemPrompt: prev[0]?.systemPrompt || DEFAULT_SYSTEM_PROMPT,
-        userPrompt: prev[0]?.userPrompt || DEFAULT_USER_PROMPT,
-      },
-    ]);
-  }, []);
-
-  const removePromptVariant = useCallback((id: string) => {
-    setPrompts((prev) => {
-      if (prev.length <= 1) return prev;
-      return prev.filter((p) => p.id !== id);
-    });
-  }, []);
-
-  const updateVariable = useCallback((name: string, value: string) => {
-    setVariables((prev) => ({ ...prev, [name]: value }));
-  }, []);
-
-  // In model-vs-model mode, show single prompt config
-  if (mode === "model-vs-model") {
-    const prompt = prompts[0];
-    return (
-      <Card className="p-6 hover:shadow-none shadow-none rounded-xl">
-        <div className="space-y-6">
-          <div>
-            <Label className="mb-2 block text-sm font-medium">
-              System Prompt
-            </Label>
-            <Textarea
-              rows={3}
-              placeholder="You are a helpful assistant..."
-              value={prompt.systemPrompt}
-              onChange={(e) =>
-                updatePrompt(prompt.id, "systemPrompt", e.target.value)
-              }
-              className="font-mono text-sm"
-            />
-          </div>
-
-          <div>
-            <Label className="mb-2 block text-sm font-medium">
-              User Prompt
-            </Label>
-            <Textarea
-              rows={4}
-              placeholder="Enter your prompt here. Use {{variable}} for dynamic values."
-              value={prompt.userPrompt}
-              onChange={(e) =>
-                updatePrompt(prompt.id, "userPrompt", e.target.value)
-              }
-              className="font-mono text-sm"
-            />
-            <p className="mt-1 text-xs text-muted-foreground">
-              Use {"{{variableName}}"} to create dynamic inputs
-            </p>
-          </div>
-
-          {allVariables.length > 0 && (
-            <div>
-              <Label className="mb-3 block text-sm font-medium">Variables</Label>
-              <div className="space-y-3">
-                {allVariables.map((varName) => (
-                  <div key={varName} className="flex items-center gap-3">
-                    <code className="text-xs bg-muted px-2 py-1 rounded font-mono min-w-[100px]">
-                      {"{{"}{varName}{"}}"}
-                    </code>
-                    <Input
-                      placeholder={`Enter value for ${varName}`}
-                      value={variables[varName] || ""}
-                      onChange={(e) => updateVariable(varName, e.target.value)}
-                      className="flex-1"
-                    />
-                  </div>
-                ))}
-              </div>
-            </div>
-          )}
-        </div>
-      </Card>
-    );
-  }
-
-  // In prompt-vs-prompt mode, show multiple prompt editors
-  return (
-    <div className="space-y-4">
-      {prompts.map((prompt, index) => (
-        <Card
-          key={prompt.id}
-          className="p-6 hover:shadow-none shadow-none rounded-xl"
-        >
-          <div className="flex items-center justify-between mb-4">
-            <h3 className="text-sm font-semibold">
-              Prompt Variant {String.fromCharCode(65 + index)}
-            </h3>
-            {prompts.length > 1 && (
-              <Button
-                variant="ghost"
-                size="sm"
-                onClick={() => removePromptVariant(prompt.id)}
-                className="text-muted-foreground hover:text-destructive"
-              >
-                <Trash2 className="h-4 w-4" />
-              </Button>
-            )}
-          </div>
-
-          <div className="space-y-4">
-            <div>
-              <Label className="mb-2 block text-xs font-medium text-muted-foreground">
-                System Prompt
-              </Label>
-              <Textarea
-                rows={2}
-                placeholder="You are a helpful assistant..."
-                value={prompt.systemPrompt}
-                onChange={(e) =>
-                  updatePrompt(prompt.id, "systemPrompt", e.target.value)
-                }
-                className="font-mono text-sm"
-              />
-            </div>
-
-            <div>
-              <Label className="mb-2 block text-xs font-medium text-muted-foreground">
-                User Prompt
-              </Label>
-              <Textarea
-                rows={3}
-                placeholder="Enter your prompt variant here..."
-                value={prompt.userPrompt}
-                onChange={(e) =>
-                  updatePrompt(prompt.id, "userPrompt", e.target.value)
-                }
-                className="font-mono text-sm"
-              />
-            </div>
-          </div>
-        </Card>
-      ))}
-
-      {prompts.length < 4 && (
-        <Button
-          variant="outline"
-          onClick={addPromptVariant}
-          className="w-full gap-2"
-        >
-          <Plus className="h-4 w-4" />
-          Add Prompt Variant
-        </Button>
-      )}
-
-      {allVariables.length > 0 && (
-        <Card className="p-6 hover:shadow-none shadow-none rounded-xl">
-          <Label className="mb-3 block text-sm font-medium">
-            Shared Variables
-          </Label>
-          <div className="space-y-3">
-            {allVariables.map((varName) => (
-              <div key={varName} className="flex items-center gap-3">
-                <code className="text-xs bg-muted px-2 py-1 rounded font-mono min-w-[100px]">
-                  {"{{"}{varName}{"}}"}
-                </code>
-                <Input
-                  placeholder={`Enter value for ${varName}`}
-                  value={variables[varName] || ""}
-                  onChange={(e) => updateVariable(varName, e.target.value)}
-                  className="flex-1"
-                />
-              </div>
-            ))}
-          </div>
-        </Card>
-      )}
-    </div>
-  );
-}
diff --git a/components/ai-eval/EvalJudgeConfig.tsx b/components/ai-eval/EvalJudgeConfig.tsx
new file mode 100644
index 0000000..f1ef87d
--- /dev/null
+++ b/components/ai-eval/EvalJudgeConfig.tsx
@@ -0,0 +1,213 @@
+import { useCallback } from "react";
+import { Slider } from "@/components/ds/SliderComponent";
+import {
+  ModelConfig,
+  CriteriaWeights,
+  DEFAULT_CRITERIA_WEIGHTS,
+  PROVIDERS,
+} from "@/components/utils/ai-eval-schemas";
+import { UseApiKeysReturn } from "@/components/hooks/useApiKeys";
+import { ChevronDown } from "lucide-react";
+
+/** Props for EvalJudgeConfig; every value is owned by the parent (controlled component). */
+interface EvalJudgeConfigProps {
+  /** Selected judge model, or null when nothing is selected. */
+  judgeModel: ModelConfig | null;
+  /** Receives the newly picked model, or null when the placeholder option is chosen. */
+  onJudgeModelChange: (model: ModelConfig | null) => void;
+  /** Per-criterion weights as fractions; shown to the user as percentages. */
+  weights: CriteriaWeights;
+  /** Receives re-normalized weights after a slider moves, or the defaults on reset. */
+  onWeightsChange: (weights: CriteriaWeights) => void;
+  /** Current state of the auto-evaluate switch. */
+  autoEvaluate: boolean;
+  /** Receives the flipped value when the switch is activated. */
+  onAutoEvaluateChange: (value: boolean) => void;
+  /** API-key store; only hasKey(providerId) is consulted here. */
+  apiKeys: UseApiKeysReturn;
+}
+
+/** DOM id linking the visible "Judge Model" label to its select element. */
+const JUDGE_MODEL_SELECT_ID = "eval-judge-model";
+
+/** One entry per slider: the weight key, its heading, and the hint shown below it. */
+const CRITERIA = [
+  { key: "accuracy" as const, label: "Accuracy", desc: "Factual correctness" },
+  { key: "relevance" as const, label: "Relevance", desc: "Addresses the prompt" },
+  { key: "clarity" as const, label: "Clarity", desc: "Clear and organized" },
+  { key: "completeness" as const, label: "Completeness", desc: "Comprehensive" },
+  { key: "conciseness" as const, label: "Conciseness", desc: "Appropriate length" },
+];
+
+/**
+ * Judge settings panel: judge-model picker, auto-evaluate switch, and one
+ * weight slider per scoring criterion.
+ *
+ * The component keeps no state of its own; every interaction is reported
+ * through the `on*Change` callbacks.
+ */
+export function EvalJudgeConfig({
+  judgeModel,
+  onJudgeModelChange,
+  weights,
+  onWeightsChange,
+  autoEvaluate,
+  onAutoEvaluateChange,
+  apiKeys,
+}: EvalJudgeConfigProps) {
+  // Map the selected <option> value back to its ModelConfig; "" is the placeholder.
+  const handleModelChange = (e: React.ChangeEvent<HTMLSelectElement>) => {
+    const modelId = e.target.value;
+    if (!modelId) {
+      onJudgeModelChange(null);
+      return;
+    }
+    for (const provider of PROVIDERS) {
+      const model = provider.models.find((m) => m.id === modelId);
+      if (model) {
+        onJudgeModelChange(model);
+        return;
+      }
+    }
+  };
+
+  // Apply one slider's percentage (0-100), then rescale every weight so the set sums to 1.
+  const updateWeight = useCallback(
+    (key: keyof CriteriaWeights, value: number | undefined) => {
+      // The slider reports an array; skip a missing or non-finite first entry instead of emitting NaN.
+      if (typeof value !== "number" || !Number.isFinite(value)) return;
+      const newWeights = { ...weights, [key]: value / 100 };
+      const total = Object.values(newWeights).reduce((sum, w) => sum + w, 0);
+      // An all-zero set cannot be normalized; leave the parent's weights untouched.
+      if (total <= 0) return;
+      Object.keys(newWeights).forEach((k) => {
+        newWeights[k as keyof CriteriaWeights] /= total;
+      });
+      onWeightsChange(newWeights);
+    },
+    [weights, onWeightsChange]
+  );
+
+  const resetWeights = () => onWeightsChange(DEFAULT_CRITERIA_WEIGHTS);
+
+  return (
+    <div className="rounded-xl border border-border bg-card overflow-hidden">
+      <div className="px-4 py-3 border-b border-border bg-muted/30">
+        <span className="text-xs font-medium text-muted-foreground uppercase tracking-wider">
+          Judge Configuration
+        </span>
+      </div>
+
+      <div className="p-6">
+        <div className="grid grid-cols-1 md:grid-cols-3 gap-8">
+          {/* Judge Model Selection */}
+          <div className="space-y-3">
+            <label htmlFor={JUDGE_MODEL_SELECT_ID} className="text-sm font-medium">
+              Judge Model
+            </label>
+            <div className="relative">
+              <select
+                id={JUDGE_MODEL_SELECT_ID}
+                value={judgeModel?.id || ""}
+                onChange={handleModelChange}
+                className="w-full h-11 pl-4 pr-10 rounded-lg border border-input bg-background text-sm appearance-none cursor-pointer focus:outline-none focus:ring-2 focus:ring-ring"
+              >
+                <option value="">Select judge model...</option>
+                {PROVIDERS.map((provider) => (
+                  <optgroup
+                    key={provider.id}
+                    label={`${provider.name}${!apiKeys.hasKey(provider.id) ? " (no key)" : ""}`}
+                  >
+                    {provider.models.map((model) => (
+                      <option
+                        key={model.id}
+                        value={model.id}
+                        disabled={!apiKeys.hasKey(provider.id)}
+                      >
+                        {model.name}
+                      </option>
+                    ))}
+                  </optgroup>
+                ))}
+              </select>
+              <ChevronDown className="absolute right-3 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground pointer-events-none" />
+            </div>
+            <p className="text-xs text-muted-foreground">
+              Recommended: Use a different model than compared models
+            </p>
+          </div>
+
+          {/* Auto-evaluate Toggle */}
+          <div className="space-y-3">
+            <label className="text-sm font-medium">Auto-evaluate</label>
+            <label className="flex items-center gap-3 cursor-pointer">
+              {/* A real button gives the switch keyboard focus and Space/Enter activation. */}
+              <button
+                type="button"
+                role="switch"
+                aria-checked={autoEvaluate}
+                aria-label="Auto-evaluate"
+                onClick={() => onAutoEvaluateChange(!autoEvaluate)}
+                className={`
+                  relative w-11 h-6 rounded-full transition-colors cursor-pointer
+                  focus:outline-none focus-visible:ring-2 focus-visible:ring-ring
+                  ${autoEvaluate ? "bg-primary" : "bg-muted"}
+                `}
+              >
+                <span
+                  className={`
+                    absolute left-0 top-1 w-4 h-4 rounded-full bg-white shadow-sm transition-transform
+                    ${autoEvaluate ? "translate-x-6" : "translate-x-1"}
+                  `}
+                />
+              </button>
+              <span className="text-sm text-muted-foreground">
+                Score responses after generation
+              </span>
+            </label>
+          </div>
+
+          {/* Reset Weights */}
+          <div className="space-y-3">
+            <div className="flex items-center justify-between">
+              <label className="text-sm font-medium">Criteria Weights</label>
+              <button
+                type="button"
+                onClick={resetWeights}
+                className="text-xs text-primary hover:underline"
+              >
+                Reset to default
+              </button>
+            </div>
+            <p className="text-xs text-muted-foreground">
+              Adjust how each criterion affects the overall score
+            </p>
+          </div>
+        </div>
+
+        {/* Criteria Sliders */}
+        <div className="mt-8 grid grid-cols-1 md:grid-cols-5 gap-6">
+          {CRITERIA.map(({ key, label, desc }) => (
+            <div key={key} className="space-y-2">
+              <div className="flex items-center justify-between">
+                <span className="text-sm font-medium">{label}</span>
+                <span className="text-xs font-mono text-muted-foreground">
+                  {Math.round(weights[key] * 100)}%
+                </span>
+              </div>
+              <Slider
+                value={[Math.round(weights[key] * 100)]}
+                onValueChange={(values) => updateWeight(key, values[0])}
+                min={0}
+                max={100}
+                step={5}
+                className="w-full"
+              />
+              <p className="text-xs text-muted-foreground">{desc}</p>
+            </div>
+          ))}
+        </div>
+      </div>
+    </div>
+  );
+}
diff --git a/components/ai-eval/EvalJudgePanel.tsx b/components/ai-eval/EvalJudgePanel.tsx
deleted file mode 100644
index 953e494..0000000
--- a/components/ai-eval/EvalJudgePanel.tsx
+++ /dev/null
@@ -1,186 +0,0 @@
-import { useCallback, useMemo } from "react";
-import { Card } from "@/components/ds/CardComponent";
-import { Label } from "@/components/ds/LabelComponent";
-import { Checkbox } from "@/components/ds/CheckboxComponent";
-import { Slider } from "@/components/ds/SliderComponent";
-import {
-  ModelConfig,
-  CriteriaWeights,
-  DEFAULT_CRITERIA_WEIGHTS,
-} from "@/components/utils/ai-eval-schemas";
-import { EvalModelSelector } from "./EvalModelSelector";
-import { UseApiKeysReturn } from "@/components/hooks/useApiKeys";
-import { AlertCircle, Scale } from "lucide-react";
-
-interface EvalJudgePanelProps {
-  judgeModel: ModelConfig | null;
-  onJudgeModelChange: (model: ModelConfig | null) => void;
-  comparedModelIds: string[];
-  weights: CriteriaWeights;
-  onWeightsChange: (weights: CriteriaWeights) => void;
-  autoEvaluate: boolean;
-  onAutoEvaluateChange: (value: boolean) => void;
-  apiKeys: UseApiKeysReturn;
-}
-
-const CRITERIA_INFO: {
-  key: keyof CriteriaWeights;
-  label: string;
-  description: string;
-}[] = [
-  {
-    key: "accuracy",
-    label: "Accuracy",
-    description: "Factual correctness",
-  },
-  {
-    key: "relevance",
-    label: "Relevance",
-    description: "Addresses the prompt",
-  },
-  {
-    key: "clarity",
-    label: "Clarity",
-    description: "Well-organized and clear",
-  },
-  {
-    key: "completeness",
-    label: "Completeness",
-    description: "Comprehensive coverage",
-  },
-  {
-    key: "conciseness",
-    label: "Conciseness",
-    description: "Appropriately detailed",
-  },
-];
-
-export function EvalJudgePanel({
-  judgeModel,
-  onJudgeModelChange,
-  comparedModelIds,
-  weights,
-  onWeightsChange,
-  autoEvaluate,
-  onAutoEvaluateChange,
-  apiKeys,
-}: EvalJudgePanelProps) {
-  // Check if judge model is same as compared models
-  const judgeMatchesCompared = useMemo(() => {
-    if (!judgeModel) return false;
-    return comparedModelIds.includes(judgeModel.id);
-  }, [judgeModel, comparedModelIds]);
-
-  // Calculate total weight percentage
-  const totalWeight = useMemo(() => {
-    return Object.values(weights).reduce((sum, w) => sum + w, 0);
-  }, [weights]);
-
-  const updateWeight = useCallback(
-    (key: keyof CriteriaWeights, value: number) => {
-      // Normalize value to percentage (0-1)
-      const newValue = value / 100;
-      const newWeights = { ...weights, [key]: newValue };
-
-      // Normalize all weights to sum to 1
-      const total = Object.values(newWeights).reduce((sum, w) => sum + w, 0);
-      if (total > 0) {
-        Object.keys(newWeights).forEach((k) => {
-          newWeights[k as keyof CriteriaWeights] /= total;
-        });
-      }
-
-      onWeightsChange(newWeights);
-    },
-    [weights, onWeightsChange]
-  );
-
-  const resetWeights = useCallback(() => {
-    onWeightsChange(DEFAULT_CRITERIA_WEIGHTS);
-  }, [onWeightsChange]);
-
-  return (
-    <Card className="p-6 hover:shadow-none shadow-none rounded-xl">
-      <div className="flex items-center gap-2 mb-4">
-        <Scale className="h-4 w-4 text-muted-foreground" />
-        <h3 className="text-sm font-semibold">Judge Settings</h3>
-      </div>
-
-      <div className="space-y-6">
-        {/* Judge Model Selection */}
-        <div>
-          <EvalModelSelector
-            value={judgeModel}
-            onChange={onJudgeModelChange}
-            apiKeys={apiKeys}
-            label="Judge Model"
-            showWarning={judgeMatchesCompared}
-            warningText="Using same model as compared - consider using a different judge"
-          />
-        </div>
-
-        {/* Auto-evaluate toggle */}
-        <div className="flex items-center justify-between">
-          <div>
-            <Label className="text-sm">Auto-evaluate</Label>
-            <p className="text-xs text-muted-foreground">
-              Automatically score responses after generation
-            </p>
-          </div>
-          <Checkbox
-            checked={autoEvaluate}
-            onCheckedChange={(checked) =>
-              onAutoEvaluateChange(checked === true)
-            }
-          />
-        </div>
-
-        {/* Criteria Weights */}
-        <div>
-          <div className="flex items-center justify-between mb-3">
-            <Label className="text-sm">Criteria Weights</Label>
-            <button
-              onClick={resetWeights}
-              className="text-xs text-muted-foreground hover:text-foreground transition-colors"
-            >
-              Reset
-            </button>
-          </div>
-
-          <div className="space-y-4">
-            {CRITERIA_INFO.map(({ key, label, description }) => (
-              <div key={key} className="space-y-2">
-                <div className="flex items-center justify-between">
-                  <div>
-                    <span className="text-sm font-medium">{label}</span>
-                    <span className="text-xs text-muted-foreground ml-2">
-                      {description}
-                    </span>
-                  </div>
-                  <span className="text-sm font-mono text-muted-foreground">
-                    {Math.round(weights[key] * 100)}%
-                  </span>
-                </div>
-                <Slider
-                  value={[Math.round(weights[key] * 100)]}
-                  onValueChange={(values) => updateWeight(key, values[0])}
-                  min={0}
-                  max={100}
-                  step={5}
-                  className="w-full"
-                />
-              </div>
-            ))}
-          </div>
-
-          {Math.abs(totalWeight - 1) > 0.01 && (
-            <p className="mt-3 text-xs text-amber-600 dark:text-amber-400 flex items-center gap-1">
-              <AlertCircle className="h-3 w-3" />
-              Weights will be normalized to sum to 100%
-            </p>
-          )}
-        </div>
-      </div>
-    </Card>
-  );
-}
diff --git a/components/ai-eval/EvalModelSelector.tsx b/components/ai-eval/EvalModelSelector.tsx
index 6668e7c..684feb4 100644
--- a/components/ai-eval/EvalModelSelector.tsx
+++ b/components/ai-eval/EvalModelSelector.tsx
@@ -1,179 +1,60 @@
 import { useCallback, useMemo } from "react";
-import {
-  PROVIDERS,
-  ModelConfig,
-} from "@/components/utils/ai-eval-schemas";
+import { PROVIDERS, ModelConfig } from "@/components/utils/ai-eval-schemas";
 import { UseApiKeysReturn } from "@/components/hooks/useApiKeys";
-import { ChevronDown, AlertCircle } from "lucide-react";
+import { Check } from "lucide-react";
 
 interface EvalModelSelectorProps {
-  value: ModelConfig | null;
-  onChange: (model: ModelConfig | null) => void;
+  selectedModels: ModelConfig[];
+  onModelsChange: (models: ModelConfig[]) => void;
   apiKeys: UseApiKeysReturn;
-  label?: string;
-  excludeModels?: string[];
-  showWarning?: boolean;
-  warningText?: string;
+  maxSelections: number;
 }
 
 export function EvalModelSelector({
-  value,
-  onChange,
+  selectedModels,
+  onModelsChange,
   apiKeys,
-  label,
-  excludeModels = [],
-  showWarning = false,
-  warningText,
+  maxSelections,
 }: EvalModelSelectorProps) {
-  // Group models by provider with availability status
-  const providerGroups = useMemo(() => {
-    return PROVIDERS.map((provider) => ({
-      ...provider,
-      hasKey: apiKeys.hasKey(provider.id),
-      models: provider.models.filter((m) => !excludeModels.includes(m.id)),
-    }));
-  }, [apiKeys, excludeModels]);
-
-  const handleChange = useCallback(
-    (e: React.ChangeEvent<HTMLSelectElement>) => {
-      const modelId = e.target.value;
-      if (!modelId) {
-        onChange(null);
-        return;
-      }
-
-      for (const provider of PROVIDERS) {
-        const model = provider.models.find((m) => m.id === modelId);
-        if (model) {
-          onChange(model);
-          return;
-        }
-      }
-    },
-    [onChange]
+  const selectedIds = useMemo(
+    () => new Set(selectedModels.map((m) => m.id)),
+    [selectedModels]
   );
 
-  const selectedProviderId = value?.providerId;
-  const hasKeyForSelected = selectedProviderId
-    ? apiKeys.hasKey(selectedProviderId)
-    : true;
-
-  return (
-    <div className="space-y-2">
-      {label && (
-        <label className="block text-sm font-medium text-foreground">
-          {label}
-        </label>
-      )}
-
-      <div className="relative">
-        <select
-          value={value?.id || ""}
-          onChange={handleChange}
-          className="w-full h-10 pl-3 pr-10 rounded-lg border border-input bg-muted text-sm ring-offset-background focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2 appearance-none cursor-pointer"
-        >
-          <option value="">Select a model...</option>
-          {providerGroups.map((provider) => (
-            <optgroup
-              key={provider.id}
-              label={`${provider.name}${!provider.hasKey ? " (no API key)" : ""}`}
-            >
-              {provider.models.map((model) => (
-                <option
-                  key={model.id}
-                  value={model.id}
-                  disabled={!provider.hasKey}
-                >
-                  {model.name}
-                </option>
-              ))}
-            </optgroup>
-          ))}
-        </select>
-        <ChevronDown className="absolute right-3 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground pointer-events-none" />
-      </div>
-
-      {!hasKeyForSelected && value && (
-        <p className="text-xs text-amber-600 dark:text-amber-400 flex items-center gap-1">
-          <AlertCircle className="h-3 w-3" />
-          Add API key for {value.providerId} to use this model
-        </p>
-      )}
-
-      {showWarning && warningText && (
-        <p className="text-xs text-amber-600 dark:text-amber-400 flex items-center gap-1">
-          <AlertCircle className="h-3 w-3" />
-          {warningText}
-        </p>
-      )}
-    </div>
-  );
-}
-
-// Multi-select version for selecting multiple models
-interface EvalMultiModelSelectorProps {
-  values: ModelConfig[];
-  onChange: (models: ModelConfig[]) => void;
-  apiKeys: UseApiKeysReturn;
-  label?: string;
-  maxSelections?: number;
-}
-
-export function EvalMultiModelSelector({
-  values,
-  onChange,
-  apiKeys,
-  label,
-  maxSelections = 4,
-}: EvalMultiModelSelectorProps) {
-  const selectedIds = useMemo(() => new Set(values.map((m) => m.id)), [values]);
-
-  const providerGroups = useMemo(() => {
-    return PROVIDERS.map((provider) => ({
-      ...provider,
-      hasKey: apiKeys.hasKey(provider.id),
-    }));
-  }, [apiKeys]);
-
   const handleToggle = useCallback(
     (model: ModelConfig) => {
       if (selectedIds.has(model.id)) {
-        onChange(values.filter((m) => m.id !== model.id));
-      } else if (values.length < maxSelections) {
-        onChange([...values, model]);
+        onModelsChange(selectedModels.filter((m) => m.id !== model.id));
+      } else if (selectedModels.length < maxSelections) {
+        onModelsChange([...selectedModels, model]);
+      } else if (maxSelections === 1) {
+        onModelsChange([model]);
       }
     },
-    [values, onChange, selectedIds, maxSelections]
+    [selectedModels, onModelsChange, selectedIds, maxSelections]
   );
 
   return (
-    <div className="space-y-3">
-      {label && (
-        <label className="block text-sm font-medium text-foreground">
-          {label}
-          <span className="text-muted-foreground font-normal ml-2">
-            ({values.length}/{maxSelections})
-          </span>
-        </label>
-      )}
+    <div className="space-y-6">
+      {PROVIDERS.map((provider) => {
+        const hasKey = apiKeys.hasKey(provider.id);
 
-      <div className="space-y-4">
-        {providerGroups.map((provider) => (
+        return (
           <div key={provider.id}>
-            <div className="text-xs font-medium text-muted-foreground mb-2 flex items-center gap-2">
-              {provider.name}
-              {!provider.hasKey && (
-                <span className="text-amber-600 dark:text-amber-400">
-                  (no API key)
+            <div className="flex items-center gap-2 mb-3">
+              <span className="text-sm font-medium">{provider.name}</span>
+              {!hasKey && (
+                <span className="text-xs text-amber-600 dark:text-amber-400 bg-amber-50 dark:bg-amber-950/30 px-2 py-0.5 rounded-full">
+                  No API key
                 </span>
               )}
             </div>
+
             <div className="flex flex-wrap gap-2">
               {provider.models.map((model) => {
                 const isSelected = selectedIds.has(model.id);
                 const isDisabled =
-                  !provider.hasKey ||
-                  (!isSelected && values.length >= maxSelections);
+                  !hasKey || (!isSelected && selectedModels.length >= maxSelections && maxSelections > 1);
 
                 return (
                   <button
@@ -181,24 +62,27 @@ export function EvalMultiModelSelector({
                     onClick={() => !isDisabled && handleToggle(model)}
                     disabled={isDisabled}
                     className={`
-                      px-3 py-1.5 text-sm rounded-lg border transition-colors
+                      relative flex items-center gap-2 px-4 py-2.5 text-sm font-medium rounded-lg border-2 transition-all
                       ${
                         isSelected
-                          ? "bg-primary text-primary-foreground border-primary"
+                          ? "border-primary bg-primary/5 text-primary"
                           : isDisabled
-                            ? "bg-muted text-muted-foreground border-border opacity-50 cursor-not-allowed"
-                            : "bg-background text-foreground border-border hover:border-primary hover:bg-accent"
+                            ? "border-border bg-muted/50 text-muted-foreground/50 cursor-not-allowed"
+                            : "border-border bg-background text-foreground hover:border-primary/50 hover:bg-primary/5 cursor-pointer"
                       }
                     `}
                   >
+                    {isSelected && (
+                      <Check className="h-4 w-4" />
+                    )}
                     {model.name}
                   </button>
                 );
               })}
             </div>
           </div>
-        ))}
-      </div>
+        );
+      })}
     </div>
   );
 }
diff --git a/components/ai-eval/EvalResultCard.tsx b/components/ai-eval/EvalResultCard.tsx
new file mode 100644
index 0000000..16ee33d
--- /dev/null
+++ b/components/ai-eval/EvalResultCard.tsx
@@ -0,0 +1,216 @@
+import { useEffect, useRef, useState } from "react";
+import { JudgeEvaluation, getScoreColorClass } from "@/components/utils/ai-eval-schemas";
+import { Loader2, AlertCircle, Copy, Check, Trophy, ChevronDown, ChevronUp } from "lucide-react";
+
+/** How long the copy button shows its checkmark after a successful copy. */
+const COPIED_FEEDBACK_MS = 2000;
+
+/** Criteria shown in the score breakdown, in display order. */
+const SCORE_CRITERIA = [
+  { key: "accuracy", label: "Accuracy" },
+  { key: "relevance", label: "Relevance" },
+  { key: "clarity", label: "Clarity" },
+  { key: "completeness", label: "Completeness" },
+  { key: "conciseness", label: "Conciseness" },
+] as const;
+
+interface EvalResultCardProps {
+  /** Primary heading shown in the card header. */
+  label: string;
+  /** Optional secondary text rendered after the label. */
+  sublabel?: string;
+  /** Generated text; when null or empty the placeholder is shown. */
+  output: string | null;
+  /** Judge result; when null the score badge and breakdown are hidden. */
+  evaluation: JudgeEvaluation | null;
+  /** Shows a spinner in place of the body; takes precedence over `error`. */
+  isLoading: boolean;
+  /** Message shown in place of the output when set. */
+  error: string | null;
+  /** Adds the green border, shadow and trophy icon. */
+  isWinner?: boolean;
+  /** Generation latency in milliseconds; rendered in seconds. */
+  latencyMs?: number;
+}
+
+/**
+ * Card for a single evaluation result.
+ *
+ * The header shows the label, optional latency and the overall score. The body
+ * shows exactly one of: loading spinner, error message, output text, or an
+ * empty-state placeholder (checked in that order). When an evaluation exists
+ * alongside the output, a collapsible per-criterion breakdown follows it.
+ */
+export function EvalResultCard({
+  label,
+  sublabel,
+  output,
+  evaluation,
+  isLoading,
+  error,
+  isWinner,
+  latencyMs,
+}: EvalResultCardProps) {
+  const [copied, setCopied] = useState(false);
+  const [showScoreDetails, setShowScoreDetails] = useState(false);
+  // Handle of the pending "hide checkmark" timer, kept so it can be cancelled.
+  const copiedTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
+
+  // Cancel a pending timer on unmount so it never fires on a dead component.
+  useEffect(() => {
+    return () => {
+      if (copiedTimer.current !== null) clearTimeout(copiedTimer.current);
+    };
+  }, []);
+
+  const handleCopy = async () => {
+    if (!output) return;
+    try {
+      await navigator.clipboard.writeText(output);
+    } catch (e) {
+      // The Clipboard API can be missing or reject (permissions, insecure
+      // context); report it and leave the icon unchanged.
+      console.error("Copy failed:", e);
+      return;
+    }
+    setCopied(true);
+    // Restart the timer so a repeated click gets the full feedback window.
+    if (copiedTimer.current !== null) clearTimeout(copiedTimer.current);
+    copiedTimer.current = setTimeout(() => setCopied(false), COPIED_FEEDBACK_MS);
+  };
+
+  return (
+    <div
+      className={`
+        rounded-xl border-2 bg-card overflow-hidden transition-all
+        ${isWinner ? "border-green-500/50 shadow-lg shadow-green-500/10" : "border-border"}
+      `}
+    >
+      {/* Header */}
+      <div className="px-4 py-3 border-b border-border bg-muted/30 flex items-center justify-between">
+        <div className="flex items-center gap-2">
+          {isWinner && <Trophy className="h-4 w-4 text-green-500" />}
+          <div>
+            <span className="font-semibold text-sm">{label}</span>
+            {sublabel && (
+              <span className="text-xs text-muted-foreground ml-2">{sublabel}</span>
+            )}
+          </div>
+        </div>
+
+        <div className="flex items-center gap-3">
+          {latencyMs !== undefined && (
+            <span className="text-xs text-muted-foreground">
+              {(latencyMs / 1000).toFixed(2)}s
+            </span>
+          )}
+          {evaluation && (
+            <div
+              className={`
+                px-3 py-1 rounded-full text-sm font-bold
+                ${isWinner ? "bg-green-500/10 text-green-600 dark:text-green-400" : "bg-muted"}
+                ${getScoreColorClass(evaluation.overallScore)}
+              `}
+            >
+              {evaluation.overallScore.toFixed(1)}
+            </div>
+          )}
+        </div>
+      </div>
+
+      {/* Content */}
+      <div className="p-4">
+        {isLoading ? (
+          <div className="flex items-center justify-center py-12 text-muted-foreground">
+            <Loader2 className="h-6 w-6 animate-spin mr-3" />
+            <span className="text-sm">Generating response...</span>
+          </div>
+        ) : error ? (
+          <div className="py-8 px-4 bg-red-50 dark:bg-red-950/20 rounded-lg">
+            <div className="flex items-start gap-3 text-red-600 dark:text-red-400">
+              <AlertCircle className="h-5 w-5 mt-0.5 shrink-0" />
+              <p className="text-sm">{error}</p>
+            </div>
+          </div>
+        ) : output ? (
+          <div className="space-y-4">
+            {/* Output Text */}
+            <div className="relative group">
+              <div className="bg-muted/50 rounded-lg p-4 text-sm leading-relaxed max-h-80 overflow-y-auto">
+                <pre className="whitespace-pre-wrap font-sans">{output}</pre>
+              </div>
+              {/* Revealed on hover; focus:opacity-100 keeps it visible for keyboard users */}
+              <button
+                type="button"
+                onClick={handleCopy}
+                aria-label={copied ? "Copied" : "Copy output"}
+                className="absolute top-3 right-3 p-2 rounded-md bg-background border border-border opacity-0 group-hover:opacity-100 focus:opacity-100 transition-opacity hover:bg-muted"
+              >
+                {copied ? (
+                  <Check className="h-4 w-4 text-green-500" />
+                ) : (
+                  <Copy className="h-4 w-4 text-muted-foreground" />
+                )}
+              </button>
+            </div>
+
+            {/* Evaluation Scores */}
+            {evaluation && (
+              <div className="border-t border-border pt-4">
+                <button
+                  type="button"
+                  onClick={() => setShowScoreDetails((open) => !open)}
+                  aria-expanded={showScoreDetails}
+                  className="w-full flex items-center justify-between text-sm text-muted-foreground hover:text-foreground transition-colors"
+                >
+                  <span>Score Breakdown</span>
+                  {showScoreDetails ? (
+                    <ChevronUp className="h-4 w-4" />
+                  ) : (
+                    <ChevronDown className="h-4 w-4" />
+                  )}
+                </button>
+
+                {showScoreDetails && (
+                  <div className="mt-4 space-y-3">
+                    {SCORE_CRITERIA.map(({ key, label: criterionLabel }) => {
+                      const score = evaluation.scores[key];
+                      // Clamp to a valid CSS percentage even if the judge returns an out-of-range score.
+                      const barWidth = Math.min(100, Math.max(0, score * 10));
+                      return (
+                        <div key={key} className="flex items-center gap-3">
+                          <span className="text-xs text-muted-foreground w-24">{criterionLabel}</span>
+                          <div className="flex-1 h-2 bg-muted rounded-full overflow-hidden">
+                            <div
+                              className={`h-full rounded-full transition-all ${
+                                score >= 8 ? "bg-green-500" : score >= 5 ? "bg-yellow-500" : "bg-red-500"
+                              }`}
+                              style={{ width: `${barWidth}%` }}
+                            />
+                          </div>
+                          <span className={`text-xs font-medium w-6 text-right ${getScoreColorClass(score)}`}>
+                            {score}
+                          </span>
+                        </div>
+                      );
+                    })}
+
+                    {evaluation.reasoning && (
+                      <p className="text-xs text-muted-foreground pt-2 border-t border-border mt-3">
+                        {evaluation.reasoning}
+                      </p>
+                    )}
+                  </div>
+                )}
+              </div>
+            )}
+          </div>
+        ) : (
+          <div className="py-12 text-center text-muted-foreground text-sm">
+            Run evaluation to see output
+          </div>
+        )}
+      </div>
+    </div>
+  );
+}
diff --git a/components/ai-eval/EvalResultCell.tsx b/components/ai-eval/EvalResultCell.tsx
deleted file mode 100644
index 51abd1c..0000000
--- a/components/ai-eval/EvalResultCell.tsx
+++ /dev/null
@@ -1,149 +0,0 @@
-import { useState } from "react";
-import { Card } from "@/components/ds/CardComponent";
-import {
-  ModelConfig,
-  JudgeEvaluation,
-  getProviderById,
-} from "@/components/utils/ai-eval-schemas";
-import { EvalScoreDisplay, ScoreBadge } from "./EvalScoreDisplay";
-import { Loader2, AlertCircle, ChevronDown, ChevronUp, Copy, Check } from "lucide-react";
-
-interface EvalResultCellProps {
-  label: string;
-  model: ModelConfig;
-  output: string | null;
-  evaluation: JudgeEvaluation | null;
-  isLoading: boolean;
-  error: string | null;
-  isWinner?: boolean;
-  latencyMs?: number;
-}
-
-export function EvalResultCell({
-  label,
-  model,
-  output,
-  evaluation,
-  isLoading,
-  error,
-  isWinner,
-  latencyMs,
-}: EvalResultCellProps) {
-  const [expanded, setExpanded] = useState(true);
-  const [copied, setCopied] = useState(false);
-
-  const provider = getProviderById(model.providerId);
-
-  const handleCopy = async () => {
-    if (!output) return;
-    await navigator.clipboard.writeText(output);
-    setCopied(true);
-    setTimeout(() => setCopied(false), 2000);
-  };
-
-  return (
-    <Card
-      className={`
-        p-4 hover:shadow-none shadow-none rounded-xl transition-all
-        ${isWinner ? "ring-2 ring-green-500/50" : ""}
-      `}
-    >
-      {/* Header */}
-      <div className="flex items-center justify-between mb-3">
-        <div className="flex items-center gap-2">
-          <span className="text-sm font-semibold">{label}</span>
-          {isWinner && (
-            <span className="text-xs bg-green-100 dark:bg-green-900/50 text-green-700 dark:text-green-300 px-2 py-0.5 rounded-full">
-              Winner
-            </span>
-          )}
-        </div>
-        {evaluation && (
-          <ScoreBadge
-            score={evaluation.overallScore}
-            isWinner={isWinner}
-            size="sm"
-          />
-        )}
-      </div>
-
-      {/* Model Info */}
-      <div className="flex items-center gap-2 mb-3 text-xs text-muted-foreground">
-        <span className="font-medium">{provider?.name}</span>
-        <span>·</span>
-        <span>{model.name}</span>
-        {latencyMs !== undefined && (
-          <>
-            <span>·</span>
-            <span>{(latencyMs / 1000).toFixed(2)}s</span>
-          </>
-        )}
-      </div>
-
-      {/* Content */}
-      {isLoading ? (
-        <div className="flex items-center justify-center py-8 text-muted-foreground">
-          <Loader2 className="h-5 w-5 animate-spin mr-2" />
-          <span className="text-sm">Generating...</span>
-        </div>
-      ) : error ? (
-        <div className="py-4 px-3 bg-red-50 dark:bg-red-950/30 rounded-lg">
-          <div className="flex items-start gap-2 text-red-600 dark:text-red-400">
-            <AlertCircle className="h-4 w-4 mt-0.5 shrink-0" />
-            <p className="text-sm">{error}</p>
-          </div>
-        </div>
-      ) : output ? (
-        <div className="space-y-3">
-          {/* Output */}
-          <div className="relative">
-            <button
-              onClick={() => setExpanded(!expanded)}
-              className="flex items-center gap-1 text-xs text-muted-foreground hover:text-foreground mb-2"
-            >
-              {expanded ? (
-                <ChevronUp className="h-3 w-3" />
-              ) : (
-                <ChevronDown className="h-3 w-3" />
-              )}
-              Output
-            </button>
-            
-            {expanded && (
-              <div className="relative group">
-                <div className="bg-muted rounded-lg p-3 text-sm leading-relaxed max-h-64 overflow-y-auto">
-                  <pre className="whitespace-pre-wrap font-sans">{output}</pre>
-                </div>
-                <button
-                  onClick={handleCopy}
-                  className="absolute top-2 right-2 p-1.5 rounded-md bg-background/80 opacity-0 group-hover:opacity-100 transition-opacity hover:bg-background"
-                >
-                  {copied ? (
-                    <Check className="h-3.5 w-3.5 text-green-500" />
-                  ) : (
-                    <Copy className="h-3.5 w-3.5 text-muted-foreground" />
-                  )}
-                </button>
-              </div>
-            )}
-          </div>
-
-          {/* Evaluation */}
-          {evaluation && (
-            <div className="pt-3 border-t border-border">
-              <EvalScoreDisplay
-                evaluation={evaluation}
-                isWinner={isWinner}
-                showBreakdown={true}
-              />
-            </div>
-          )}
-        </div>
-      ) : (
-        <div className="py-8 text-center text-muted-foreground text-sm">
-          Run evaluation to see output
-        </div>
-      )}
-    </Card>
-  );
-}
diff --git a/components/ai-eval/EvalScoreDisplay.tsx b/components/ai-eval/EvalScoreDisplay.tsx
deleted file mode 100644
index 2aee27d..0000000
--- a/components/ai-eval/EvalScoreDisplay.tsx
+++ /dev/null
@@ -1,151 +0,0 @@
-import {
-  JudgeEvaluation,
-  ScoreBreakdown,
-  getScoreColorClass,
-  getScoreBgClass,
-} from "@/components/utils/ai-eval-schemas";
-import { Trophy } from "lucide-react";
-
-interface ScoreBadgeProps {
-  score: number;
-  isWinner?: boolean;
-  size?: "sm" | "md" | "lg";
-}
-
-export function ScoreBadge({ score, isWinner, size = "md" }: ScoreBadgeProps) {
-  const sizeClasses = {
-    sm: "text-sm px-2 py-0.5",
-    md: "text-base px-3 py-1",
-    lg: "text-lg px-4 py-1.5 font-semibold",
-  };
-
-  return (
-    <div
-      className={`
-        inline-flex items-center gap-1.5 rounded-full font-medium
-        ${sizeClasses[size]}
-        ${getScoreBgClass(score)}
-        ${getScoreColorClass(score)}
-      `}
-    >
-      {isWinner && <Trophy className="h-3.5 w-3.5" />}
-      <span>{score.toFixed(1)}</span>
-      <span className="opacity-60">/ 10</span>
-    </div>
-  );
-}
-
-interface ScoreBarProps {
-  label: string;
-  score: number;
-  maxScore?: number;
-}
-
-export function ScoreBar({ label, score, maxScore = 10 }: ScoreBarProps) {
-  const percentage = (score / maxScore) * 100;
-
-  return (
-    <div className="flex items-center gap-3">
-      <span className="text-xs text-muted-foreground w-24 shrink-0">
-        {label}
-      </span>
-      <div className="flex-1 h-2 bg-muted rounded-full overflow-hidden">
-        <div
-          className={`h-full rounded-full transition-all ${
-            score >= 8
-              ? "bg-green-500"
-              : score >= 5
-                ? "bg-yellow-500"
-                : "bg-red-500"
-          }`}
-          style={{ width: `${percentage}%` }}
-        />
-      </div>
-      <span className={`text-xs font-medium w-6 text-right ${getScoreColorClass(score)}`}>
-        {score}
-      </span>
-    </div>
-  );
-}
-
-interface ScoreBreakdownDisplayProps {
-  scores: ScoreBreakdown;
-  compact?: boolean;
-}
-
-export function ScoreBreakdownDisplay({
-  scores,
-  compact,
-}: ScoreBreakdownDisplayProps) {
-  const criteria: { key: keyof ScoreBreakdown; label: string }[] = [
-    { key: "accuracy", label: "Accuracy" },
-    { key: "relevance", label: "Relevance" },
-    { key: "clarity", label: "Clarity" },
-    { key: "completeness", label: "Completeness" },
-    { key: "conciseness", label: "Conciseness" },
-  ];
-
-  if (compact) {
-    return (
-      <div className="grid grid-cols-5 gap-2 text-center">
-        {criteria.map(({ key, label }) => (
-          <div key={key} className="space-y-1">
-            <div className="text-[10px] text-muted-foreground uppercase tracking-wider">
-              {label.slice(0, 3)}
-            </div>
-            <div className={`text-sm font-medium ${getScoreColorClass(scores[key])}`}>
-              {scores[key]}
-            </div>
-          </div>
-        ))}
-      </div>
-    );
-  }
-
-  return (
-    <div className="space-y-2">
-      {criteria.map(({ key, label }) => (
-        <ScoreBar key={key} label={label} score={scores[key]} />
-      ))}
-    </div>
-  );
-}
-
-interface EvalScoreDisplayProps {
-  evaluation: JudgeEvaluation;
-  isWinner?: boolean;
-  showBreakdown?: boolean;
-}
-
-export function EvalScoreDisplay({
-  evaluation,
-  isWinner,
-  showBreakdown = true,
-}: EvalScoreDisplayProps) {
-  return (
-    <div className="space-y-4">
-      <div className="flex items-center justify-between">
-        <span className="text-sm font-medium text-muted-foreground">Score</span>
-        <ScoreBadge
-          score={evaluation.overallScore}
-          isWinner={isWinner}
-          size="md"
-        />
-      </div>
-
-      {showBreakdown && (
-        <div className="pt-2 border-t border-border">
-          <ScoreBreakdownDisplay scores={evaluation.scores} />
-        </div>
-      )}
-
-      {evaluation.reasoning && (
-        <div className="pt-2 border-t border-border">
-          <p className="text-xs text-muted-foreground leading-relaxed">
-            {evaluation.reasoning}
-          </p>
-        </div>
-      )}
-    </div>
-  );
-}
diff --git a/pages/utilities/ai-eval.tsx b/pages/utilities/ai-eval.tsx
index 8126214..144363f 100644
--- a/pages/utilities/ai-eval.tsx
+++ b/pages/utilities/ai-eval.tsx
@@ -1,39 +1,28 @@
 import { useState, useCallback, useMemo } from "react";
 import PageHeader from "@/components/PageHeader";
-import { Card } from "@/components/ds/CardComponent";
 import { Button } from "@/components/ds/ButtonComponent";
 import Header from "@/components/Header";
 import { CMDK } from "@/components/CMDK";
 import Meta from "@/components/Meta";
 import CallToActionGrid from "@/components/CallToActionGrid";
-import {
-  Tabs,
-  TabsList,
-  TabsTrigger,
-} from "@/components/ds/TabsComponent";
 import {
   ComparisonMode,
   ModelConfig,
   CriteriaWeights,
   DEFAULT_CRITERIA_WEIGHTS,
   JudgeEvaluation,
-  resolveTemplate,
 } from "@/components/utils/ai-eval-schemas";
 import { chat } from "@/components/utils/ai-eval-providers";
-import { judgeSingleResponse, judgeCompareResponses } from "@/components/utils/ai-eval-judge";
+import {
+  judgeSingleResponse,
+  judgeCompareResponses,
+} from "@/components/utils/ai-eval-judge";
 import { useApiKeys } from "@/components/hooks/useApiKeys";
 import { ApiKeyDialog } from "@/components/ai-eval/ApiKeyDialog";
-import { EvalConfigPanel } from "@/components/ai-eval/EvalConfigPanel";
-import { EvalMultiModelSelector } from "@/components/ai-eval/EvalModelSelector";
-import { EvalJudgePanel } from "@/components/ai-eval/EvalJudgePanel";
-import { EvalComparisonGrid, ComparisonSummary } from "@/components/ai-eval/EvalComparisonGrid";
-import { Play, Loader2 } from "lucide-react";
-
-interface PromptVariant {
-  id: string;
-  systemPrompt: string;
-  userPrompt: string;
-}
+import { EvalModelSelector } from "@/components/ai-eval/EvalModelSelector";
+import { EvalResultCard } from "@/components/ai-eval/EvalResultCard";
+import { EvalJudgeConfig } from "@/components/ai-eval/EvalJudgeConfig";
+import { Play, Loader2, Settings2 } from "lucide-react";
 
 interface ResultData {
   id: string;
@@ -53,11 +42,15 @@ export default function AIEval() {
   const [mode, setMode] = useState<ComparisonMode>("model-vs-model");
 
   // Prompt configuration
-  const [prompts, setPrompts] = useState<PromptVariant[]>([]);
-  const [variables, setVariables] = useState<Record<string, string>>({});
+  const [systemPrompt, setSystemPrompt] = useState("You are a helpful assistant.");
+  const [userPrompt, setUserPrompt] = useState("");
+
+  // For prompt-vs-prompt mode
+  const [promptVariants, setPromptVariants] = useState<string[]>(["", ""]);
 
   // Model selection
   const [selectedModels, setSelectedModels] = useState<ModelConfig[]>([]);
+  const [singleModel, setSingleModel] = useState<ModelConfig | null>(null);
 
   // Judge settings
   const [judgeModel, setJudgeModel] = useState<ModelConfig | null>(null);
@@ -65,67 +58,54 @@ export default function AIEval() {
     DEFAULT_CRITERIA_WEIGHTS
   );
   const [autoEvaluate, setAutoEvaluate] = useState(true);
+  const [showJudgeSettings, setShowJudgeSettings] = useState(false);
 
   // Results state
   const [results, setResults] = useState<ResultData[]>([]);
   const [isRunning, setIsRunning] = useState(false);
   const [comparisonReasoning, setComparisonReasoning] = useState<string>("");
 
-  // Determine winners based on evaluation scores
+  // Determine winners
   const winnerIds = useMemo(() => {
-    const completedResults = results.filter(
-      (r) => r.evaluation && !r.isLoading && !r.error
-    );
-
-    if (completedResults.length < 2) return [];
-
-    const maxScore = Math.max(
-      ...completedResults.map((r) => r.evaluation?.overallScore ?? 0)
-    );
-
-    // Consider it a winner if within 0.5 of max score (accounting for ties)
-    return completedResults
+    const completed = results.filter((r) => r.evaluation && !r.isLoading && !r.error);
+    if (completed.length < 2) return [];
+    const maxScore = Math.max(...completed.map((r) => r.evaluation?.overallScore ?? 0));
+    return completed
       .filter((r) => (r.evaluation?.overallScore ?? 0) >= maxScore - 0.5)
       .map((r) => r.id);
   }, [results]);
 
-  // Validate if we can run evaluation
+  // Validate if we can run
   const canRun = useMemo(() => {
     if (isRunning) return false;
-    if (prompts.length === 0) return false;
-
     if (mode === "model-vs-model") {
+      if (!userPrompt.trim()) return false;
       if (selectedModels.length < 2) return false;
-      // Check if we have API keys for all selected models
       for (const model of selectedModels) {
         if (!apiKeys.hasKey(model.providerId)) return false;
       }
     } else {
-      if (selectedModels.length < 1) return false;
-      if (prompts.length < 2) return false;
-      if (!apiKeys.hasKey(selectedModels[0].providerId)) return false;
+      if (!singleModel) return false;
+      if (!apiKeys.hasKey(singleModel.providerId)) return false;
+      if (promptVariants.filter((p) => p.trim()).length < 2) return false;
     }
-
     return true;
-  }, [isRunning, prompts, mode, selectedModels, apiKeys]);
+  }, [isRunning, mode, userPrompt, selectedModels, singleModel, promptVariants, apiKeys]);
 
   // Run evaluation
   const runEvaluation = useCallback(async () => {
     if (!canRun) return;
-
     setIsRunning(true);
     setComparisonReasoning("");
 
-    // Initialize results
     const initialResults: ResultData[] = [];
 
     if (mode === "model-vs-model") {
-      // One result per model
       for (const model of selectedModels) {
         initialResults.push({
-          id: `${model.id}-${prompts[0].id}`,
+          id: model.id,
           modelId: model.id,
-          promptId: prompts[0].id,
+          promptId: "main",
           output: null,
           evaluation: null,
           isLoading: true,
@@ -133,18 +113,19 @@ export default function AIEval() {
         });
       }
     } else {
-      // One result per prompt
-      for (const prompt of prompts) {
-        initialResults.push({
-          id: `${selectedModels[0].id}-${prompt.id}`,
-          modelId: selectedModels[0].id,
-          promptId: prompt.id,
-          output: null,
-          evaluation: null,
-          isLoading: true,
-          error: null,
-        });
-      }
+      promptVariants.forEach((_, index) => {
+        if (promptVariants[index].trim()) {
+          initialResults.push({
+            id: `prompt-${index}`,
+            modelId: singleModel!.id,
+            promptId: `prompt-${index}`,
+            output: null,
+            evaluation: null,
+            isLoading: true,
+            error: null,
+          });
+        }
+      });
     }
 
     setResults(initialResults);
@@ -153,8 +134,15 @@ export default function AIEval() {
     const generationResults: ResultData[] = [];
 
     for (const result of initialResults) {
-      const model = selectedModels.find((m) => m.id === result.modelId);
-      const prompt = prompts.find((p) => p.id === result.promptId);
+      const model =
+        mode === "model-vs-model"
+          ? selectedModels.find((m) => m.id === result.modelId)
+          : singleModel;
+
+      const prompt =
+        mode === "model-vs-model"
+          ? userPrompt
+          : promptVariants[parseInt(result.promptId.split("-")[1])];
 
       if (!model || !prompt) continue;
 
@@ -168,15 +156,14 @@ export default function AIEval() {
         continue;
       }
 
-      const resolvedUserPrompt = resolveTemplate(prompt.userPrompt, variables);
       const startTime = Date.now();
 
       try {
         const response = await chat(apiKey, {
           model: model.id,
           messages: [
-            { role: "system", content: prompt.systemPrompt },
-            { role: "user", content: resolvedUserPrompt },
+            { role: "system", content: systemPrompt },
+            { role: "user", content: prompt },
           ],
         });
 
@@ -189,7 +176,6 @@ export default function AIEval() {
           latencyMs,
         });
 
-        // Update UI with generation result
         setResults((prev) =>
           prev.map((r) =>
             r.id === result.id
@@ -210,10 +196,7 @@ export default function AIEval() {
               ? {
                   ...r,
                   isLoading: false,
-                  error:
-                    error instanceof Error
-                      ? error.message
-                      : "Generation failed",
+                  error: error instanceof Error ? error.message : "Generation failed",
                 }
               : r
           )
@@ -221,49 +204,38 @@ export default function AIEval() {
       }
     }
 
-    // Run evaluation if auto-evaluate is on and we have a judge model
+    // Run judge evaluation
     if (autoEvaluate && judgeModel) {
       const judgeApiKey = apiKeys.getKey(judgeModel.providerId);
-
       if (judgeApiKey) {
-        const successfulResults = generationResults.filter(
-          (r) => r.output && !r.error
-        );
+        const successful = generationResults.filter((r) => r.output && !r.error);
 
-        if (successfulResults.length >= 2) {
-          // Use pairwise comparison for 2 results
+        if (successful.length >= 2) {
           try {
-            const prompt = prompts[0];
-            const resolvedPrompt = resolveTemplate(prompt.userPrompt, variables);
-            const fullPrompt = `${prompt.systemPrompt}\n\n${resolvedPrompt}`;
+            const fullPrompt = `${systemPrompt}\n\n${mode === "model-vs-model" ? userPrompt : promptVariants[0]}`;
 
             const comparison = await judgeCompareResponses({
               apiKey: judgeApiKey,
               judgeModel: judgeModel.id,
               originalPrompt: fullPrompt,
-              responseA: successfulResults[0].output!,
-              responseB: successfulResults[1].output!,
+              responseA: successful[0].output!,
+              responseB: successful[1].output!,
               weights: criteriaWeights,
             });
 
             setComparisonReasoning(comparison.comparisonReasoning);
 
-            // Update results with evaluations
             setResults((prev) =>
               prev.map((r) => {
-                if (r.id === successfulResults[0].id) {
-                  return { ...r, evaluation: comparison.evaluationA };
-                }
-                if (r.id === successfulResults[1].id) {
-                  return { ...r, evaluation: comparison.evaluationB };
-                }
+                if (r.id === successful[0].id) return { ...r, evaluation: comparison.evaluationA };
+                if (r.id === successful[1].id) return { ...r, evaluation: comparison.evaluationB };
                 return r;
               })
             );
 
-            // Evaluate remaining results individually
-            for (let i = 2; i < successfulResults.length; i++) {
-              const result = successfulResults[i];
+            // Evaluate remaining
+            for (let i = 2; i < successful.length; i++) {
+              const result = successful[i];
               try {
                 const evaluation = await judgeSingleResponse({
                   apiKey: judgeApiKey,
@@ -272,44 +244,15 @@ export default function AIEval() {
                   response: result.output!,
                   weights: criteriaWeights,
                 });
-
                 setResults((prev) =>
-                  prev.map((r) =>
-                    r.id === result.id ? { ...r, evaluation } : r
-                  )
+                  prev.map((r) => (r.id === result.id ? { ...r, evaluation } : r))
                 );
-              } catch (error) {
-                console.error("Judge evaluation failed:", error);
+              } catch (e) {
+                console.error("Judge failed:", e);
               }
             }
-          } catch (error) {
-            console.error("Comparison failed:", error);
-          }
-        } else if (successfulResults.length === 1) {
-          // Single result, evaluate individually
-          const result = successfulResults[0];
-          const prompt = prompts.find((p) => p.id === result.promptId);
-          if (prompt) {
-            try {
-              const resolvedPrompt = resolveTemplate(prompt.userPrompt, variables);
-              const fullPrompt = `${prompt.systemPrompt}\n\n${resolvedPrompt}`;
-
-              const evaluation = await judgeSingleResponse({
-                apiKey: judgeApiKey,
-                judgeModel: judgeModel.id,
-                originalPrompt: fullPrompt,
-                response: result.output!,
-                weights: criteriaWeights,
-              });
-
-              setResults((prev) =>
-                prev.map((r) =>
-                  r.id === result.id ? { ...r, evaluation } : r
-                )
-              );
-            } catch (error) {
-              console.error("Judge evaluation failed:", error);
-            }
+          } catch (e) {
+            console.error("Comparison failed:", e);
           }
         }
       }
@@ -320,69 +263,97 @@ export default function AIEval() {
     canRun,
     mode,
     selectedModels,
-    prompts,
-    variables,
+    singleModel,
+    systemPrompt,
+    userPrompt,
+    promptVariants,
     apiKeys,
     autoEvaluate,
     judgeModel,
     criteriaWeights,
   ]);
 
+  const addPromptVariant = () => {
+    if (promptVariants.length < 4) {
+      setPromptVariants([...promptVariants, ""]);
+    }
+  };
+
+  const removePromptVariant = (index: number) => {
+    if (promptVariants.length > 2) {
+      setPromptVariants(promptVariants.filter((_, i) => i !== index));
+    }
+  };
+
+  const updatePromptVariant = (index: number, value: string) => {
+    setPromptVariants(promptVariants.map((p, i) => (i === index ? value : p)));
+  };
+
   return (
-    <main>
+    <main className="min-h-screen bg-background">
       <Meta
         title="AI Eval Playground | Compare Models & Prompts | Free & Open Source"
-        description="Compare AI models and prompts side-by-side with LLM-as-judge scoring. Evaluate GPT-4, Claude, Gemini responses with automated quality metrics. BYOK - your keys stay in your browser."
+        description="Compare AI models and prompts side-by-side with LLM-as-judge scoring. Evaluate GPT-4, Claude, Gemini responses with automated quality metrics."
       />
       <Header />
       <CMDK />
 
-      <section className="container max-w-6xl mb-12">
-        <PageHeader
-          title="AI Eval Playground"
-          description="Compare prompts and models side-by-side with LLM-as-judge scoring"
-        />
-      </section>
-
-      {/* Mode Toggle & API Keys */}
-      <section className="container max-w-6xl mb-6">
-        <Card className="p-4 hover:shadow-none shadow-none rounded-xl">
-          <div className="flex items-center justify-between">
-            <Tabs
-              value={mode}
-              onValueChange={(v) => setMode(v as ComparisonMode)}
+      <div className="container max-w-7xl py-8">
+        {/* Header */}
+        <div className="mb-8">
+          <PageHeader
+            title="AI Eval Playground"
+            description="Compare models and prompts with automated LLM-as-judge scoring"
+          />
+        </div>
+
+        {/* Toolbar */}
+        <div className="flex items-center justify-between mb-6 pb-6 border-b border-border">
+          <div className="flex items-center gap-1 p-1 bg-muted rounded-lg">
+            <button
+              onClick={() => setMode("model-vs-model")}
+              className={`px-4 py-2 text-sm font-medium rounded-md transition-all ${
+                mode === "model-vs-model"
+                  ? "bg-background text-foreground shadow-sm"
+                  : "text-muted-foreground hover:text-foreground"
+              }`}
             >
-              <TabsList>
-                <TabsTrigger value="model-vs-model">Model vs Model</TabsTrigger>
-                <TabsTrigger value="prompt-vs-prompt">
-                  Prompt vs Prompt
-                </TabsTrigger>
-              </TabsList>
-            </Tabs>
+              Model vs Model
+            </button>
+            <button
+              onClick={() => setMode("prompt-vs-prompt")}
+              className={`px-4 py-2 text-sm font-medium rounded-md transition-all ${
+                mode === "prompt-vs-prompt"
+                  ? "bg-background text-foreground shadow-sm"
+                  : "text-muted-foreground hover:text-foreground"
+              }`}
+            >
+              Prompt vs Prompt
+            </button>
+          </div>
 
+          <div className="flex items-center gap-3">
+            <button
+              onClick={() => setShowJudgeSettings(!showJudgeSettings)}
+              className={`flex items-center gap-2 px-3 py-2 text-sm rounded-lg border transition-all ${
+                showJudgeSettings
+                  ? "border-primary bg-primary/5 text-primary"
+                  : "border-border text-muted-foreground hover:text-foreground hover:border-foreground/20"
+              }`}
+            >
+              <Settings2 className="h-4 w-4" />
+              Judge Settings
+            </button>
             <ApiKeyDialog apiKeys={apiKeys} />
           </div>
-        </Card>
-      </section>
-
-      {/* Configuration Grid */}
-      <section className="container max-w-6xl mb-6">
-        <div className="grid grid-cols-1 lg:grid-cols-3 gap-6">
-          {/* Prompt Configuration - Takes 2 columns */}
-          <div className="lg:col-span-2">
-            <EvalConfigPanel
-              mode={mode}
-              onPromptsChange={setPrompts}
-              onVariablesChange={setVariables}
-            />
-          </div>
+        </div>
 
-          {/* Judge Settings - Takes 1 column */}
-          <div>
-            <EvalJudgePanel
+        {/* Judge Settings Panel (collapsible) */}
+        {showJudgeSettings && (
+          <div className="mb-6">
+            <EvalJudgeConfig
               judgeModel={judgeModel}
               onJudgeModelChange={setJudgeModel}
-              comparedModelIds={selectedModels.map((m) => m.id)}
               weights={criteriaWeights}
               onWeightsChange={setCriteriaWeights}
               autoEvaluate={autoEvaluate}
@@ -390,69 +361,195 @@ export default function AIEval() {
               apiKeys={apiKeys}
             />
           </div>
-        </div>
-      </section>
-
-      {/* Model Selection */}
-      <section className="container max-w-6xl mb-6">
-        <Card className="p-6 hover:shadow-none shadow-none rounded-xl">
-          <EvalMultiModelSelector
-            values={selectedModels}
-            onChange={setSelectedModels}
-            apiKeys={apiKeys}
-            label={
-              mode === "model-vs-model"
-                ? "Select Models to Compare"
-                : "Select Model"
-            }
-            maxSelections={mode === "model-vs-model" ? 4 : 1}
-          />
-        </Card>
-      </section>
-
-      {/* Run Button */}
-      <section className="container max-w-6xl mb-6">
-        <Button
-          onClick={runEvaluation}
-          disabled={!canRun}
-          size="lg"
-          className="w-full gap-2"
-        >
-          {isRunning ? (
+        )}
+
+        {/* Main Content */}
+        <div className="space-y-6">
+          {/* System Prompt - Full Width */}
+          <div className="rounded-xl border border-border bg-card overflow-hidden">
+            <div className="px-4 py-3 border-b border-border bg-muted/30">
+              <span className="text-xs font-medium text-muted-foreground uppercase tracking-wider">
+                System Prompt
+              </span>
+            </div>
+            <textarea
+              value={systemPrompt}
+              onChange={(e) => setSystemPrompt(e.target.value)}
+              placeholder="You are a helpful assistant..."
+              className="w-full px-4 py-4 bg-transparent text-sm font-mono resize-none focus:outline-none min-h-[80px]"
+              rows={2}
+            />
+          </div>
+
+          {mode === "model-vs-model" ? (
             <>
-              <Loader2 className="h-5 w-5 animate-spin" />
-              Running Evaluation...
+              {/* User Prompt - Full Width */}
+              <div className="rounded-xl border border-border bg-card overflow-hidden">
+                <div className="px-4 py-3 border-b border-border bg-muted/30">
+                  <span className="text-xs font-medium text-muted-foreground uppercase tracking-wider">
+                    User Prompt
+                  </span>
+                </div>
+                <textarea
+                  value={userPrompt}
+                  onChange={(e) => setUserPrompt(e.target.value)}
+                  placeholder="Enter your prompt here..."
+                  className="w-full px-4 py-4 bg-transparent text-sm resize-none focus:outline-none min-h-[120px]"
+                  rows={4}
+                />
+              </div>
+
+              {/* Model Selection */}
+              <div className="rounded-xl border border-border bg-card overflow-hidden">
+                <div className="px-4 py-3 border-b border-border bg-muted/30 flex items-center justify-between">
+                  <span className="text-xs font-medium text-muted-foreground uppercase tracking-wider">
+                    Select Models to Compare
+                  </span>
+                  <span className="text-xs text-muted-foreground">
+                    {selectedModels.length}/4 selected
+                  </span>
+                </div>
+                <div className="p-4">
+                  <EvalModelSelector
+                    selectedModels={selectedModels}
+                    onModelsChange={setSelectedModels}
+                    apiKeys={apiKeys}
+                    maxSelections={4}
+                  />
+                </div>
+              </div>
             </>
           ) : (
             <>
-              <Play className="h-5 w-5" />
-              Run Evaluation
+              {/* Single Model Selection */}
+              <div className="rounded-xl border border-border bg-card overflow-hidden">
+                <div className="px-4 py-3 border-b border-border bg-muted/30">
+                  <span className="text-xs font-medium text-muted-foreground uppercase tracking-wider">
+                    Select Model
+                  </span>
+                </div>
+                <div className="p-4">
+                  <EvalModelSelector
+                    selectedModels={singleModel ? [singleModel] : []}
+                    onModelsChange={(models) => setSingleModel(models[0] || null)}
+                    apiKeys={apiKeys}
+                    maxSelections={1}
+                  />
+                </div>
+              </div>
+
+              {/* Prompt Variants Grid */}
+              <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
+                {promptVariants.map((prompt, index) => (
+                  <div
+                    key={index}
+                    className="rounded-xl border border-border bg-card overflow-hidden"
+                  >
+                    <div className="px-4 py-3 border-b border-border bg-muted/30 flex items-center justify-between">
+                      <span className="text-xs font-medium text-muted-foreground uppercase tracking-wider">
+                        Prompt {String.fromCharCode(65 + index)}
+                      </span>
+                      {promptVariants.length > 2 && (
+                        <button
+                          onClick={() => removePromptVariant(index)}
+                          className="text-xs text-muted-foreground hover:text-destructive transition-colors"
+                        >
+                          Remove
+                        </button>
+                      )}
+                    </div>
+                    <textarea
+                      value={prompt}
+                      onChange={(e) => updatePromptVariant(index, e.target.value)}
+                      placeholder={`Enter prompt variant ${String.fromCharCode(65 + index)}...`}
+                      className="w-full px-4 py-4 bg-transparent text-sm resize-none focus:outline-none min-h-[120px]"
+                      rows={4}
+                    />
+                  </div>
+                ))}
+              </div>
+
+              {promptVariants.length < 4 && (
+                <button
+                  onClick={addPromptVariant}
+                  className="w-full py-3 border border-dashed border-border rounded-xl text-sm text-muted-foreground hover:text-foreground hover:border-foreground/20 transition-all"
+                >
+                  + Add Prompt Variant
+                </button>
+              )}
             </>
           )}
-        </Button>
-      </section>
-
-      {/* Results Grid */}
-      {results.length > 0 && (
-        <section className="container max-w-6xl mb-6">
-          <EvalComparisonGrid
-            mode={mode}
-            models={selectedModels}
-            results={results}
-            winnerIds={winnerIds}
-          />
 
-          {comparisonReasoning && (
-            <div className="mt-4">
-              <ComparisonSummary
-                results={results}
-                winnerIds={winnerIds}
-                comparisonReasoning={comparisonReasoning}
-              />
+          {/* Run Button */}
+          <Button
+            onClick={runEvaluation}
+            disabled={!canRun}
+            size="lg"
+            className="w-full h-12 text-base font-medium"
+          >
+            {isRunning ? (
+              <>
+                <Loader2 className="h-5 w-5 animate-spin mr-2" />
+                Running Evaluation...
+              </>
+            ) : (
+              <>
+                <Play className="h-5 w-5 mr-2" />
+                Run Evaluation
+              </>
+            )}
+          </Button>
+
+          {/* Results */}
+          {results.length > 0 && (
+            <div className="space-y-4">
+              <div className="flex items-center justify-between">
+                <h2 className="text-lg font-semibold">Results</h2>
+                {comparisonReasoning && (
+                  <p className="text-sm text-muted-foreground max-w-xl">
+                    {comparisonReasoning}
+                  </p>
+                )}
+              </div>
+
+              <div
+                className={`grid gap-4 ${
+                  results.length === 2
+                    ? "grid-cols-1 md:grid-cols-2"
+                    : results.length === 3
+                      ? "grid-cols-1 md:grid-cols-3"
+                      : "grid-cols-1 md:grid-cols-2 lg:grid-cols-4"
+                }`}
+              >
+                {results.map((result, index) => {
+                  const model =
+                    mode === "model-vs-model"
+                      ? selectedModels.find((m) => m.id === result.modelId)
+                      : singleModel;
+
+                  return (
+                    <EvalResultCard
+                      key={result.id}
+                      label={
+                        mode === "model-vs-model"
+                          ? model?.name || "Unknown"
+                          : `Prompt ${String.fromCharCode(65 + index)}`
+                      }
+                      sublabel={mode === "model-vs-model" ? undefined : model?.name}
+                      output={result.output}
+                      evaluation={result.evaluation}
+                      isLoading={result.isLoading}
+                      error={result.error}
+                      isWinner={winnerIds.includes(result.id)}
+                      latencyMs={result.latencyMs}
+                    />
+                  );
+                })}
+              </div>
             </div>
           )}
-        </section>
-      )}
+        </div>
+      </div>
 
       <CallToActionGrid />
     </main>