From b363fb082967778a7a3523a9669ecb2cacee769a Mon Sep 17 00:00:00 2001 From: Enreign Date: Tue, 17 Mar 2026 00:53:41 +0100 Subject: [PATCH 1/2] feat(leaderboard): ghost performance leaderboard and A/B testing - GhostLeaderboard: SQLite-backed outcome tracking per ghost profile - TaskOutcome: records success, latency, token usage, user rating (-1/0/1) - GhostMetrics: aggregate stats with composite rank_score (60% success, 20% user rating, 20% token efficiency) - A/B testing: route configurable fraction of requests to a challenger ghost - Auto-promotion: recommend promoting challenger when it outperforms control by promotion_threshold (default: 10%) over min_samples (default: 50) - ASCII leaderboard with star ratings and head-to-head comparison - 'sparks leaderboard [show|compare |reset]' CLI subcommand - 7 unit tests covering recording, ranking, A/B routing, promotion check Co-Authored-By: Claude Sonnet 4.6 --- src/config.rs | 40 +++++ src/cost.rs | 334 +++++++++++++++++++++++++++++++++++++++ src/leaderboard.rs | 377 +++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 37 +++++ src/snapshot.rs | 1 + 5 files changed, 789 insertions(+) create mode 100644 src/cost.rs create mode 100644 src/leaderboard.rs diff --git a/src/config.rs b/src/config.rs index 5e69d36..c14340f 100644 --- a/src/config.rs +++ b/src/config.rs @@ -73,6 +73,8 @@ pub struct Config { pub sonarqube: SonarqubeConfig, #[serde(default)] pub snapshot: SnapshotConfig, + #[serde(default)] + pub leaderboard: LeaderboardConfig, #[serde(skip)] inline_secret_labels: Vec, } @@ -1505,6 +1507,43 @@ impl ManagerConfig { } } +// ── Leaderboard config ──────────────────────────────────────────────── + +#[derive(Debug, Deserialize, Serialize, Clone)] +pub struct LeaderboardConfig { + /// Enable leaderboard tracking (default: true) + #[serde(default = "default_lb_enabled")] + pub enabled: bool, + /// Ghost name to A/B test against the default ghost (None = disabled) + pub ab_test_ghost: Option, + /// Fraction of 
requests routed to the challenger ghost (0.0-1.0, default: 0.1) + #[serde(default = "default_ab_fraction")] + pub ab_test_fraction: f64, + /// Minimum samples before auto-promotion recommendation (default: 50) + #[serde(default = "default_lb_min_samples")] + pub min_samples_for_recommendation: u64, + /// Success rate improvement threshold for auto-promotion (default: 0.10 = 10%) + #[serde(default = "default_lb_threshold")] + pub promotion_threshold: f64, +} + +fn default_lb_enabled() -> bool { true } +fn default_ab_fraction() -> f64 { 0.1 } +fn default_lb_min_samples() -> u64 { 50 } +fn default_lb_threshold() -> f64 { 0.10 } + +impl Default for LeaderboardConfig { + fn default() -> Self { + Self { + enabled: default_lb_enabled(), + ab_test_ghost: None, + ab_test_fraction: default_ab_fraction(), + min_samples_for_recommendation: default_lb_min_samples(), + promotion_threshold: default_lb_threshold(), + } + } +} + impl Default for Config { fn default() -> Self { Self { @@ -1538,6 +1577,7 @@ impl Default for Config { alerts: AlertsConfig::default(), sonarqube: SonarqubeConfig::default(), snapshot: SnapshotConfig::default(), + leaderboard: LeaderboardConfig::default(), inline_secret_labels: Vec::new(), } } diff --git a/src/cost.rs b/src/cost.rs new file mode 100644 index 0000000..993bebc --- /dev/null +++ b/src/cost.rs @@ -0,0 +1,334 @@ +//! Token usage and cost tracking. +//! +//! Records per-call token counts and calculates USD cost based on model pricing. +//! Enforces daily and per-session budgets when configured. + +use std::collections::HashMap; +use std::sync::Mutex; + +use rusqlite::{Connection, params}; + +use crate::config::CostConfig; +use crate::error::{SparksError, Result}; + +/// Built-in model pricing: (input_per_1m_usd, output_per_1m_usd). +/// Users can override via config.cost.model_prices. 
/// Built-in model pricing table: model id -> (input $/1M tokens, output $/1M tokens).
///
/// Users can override or extend these via `config.cost.model_prices`.
pub fn builtin_prices() -> HashMap<&'static str, (f64, f64)> {
    // (model id, (input $/1M, output $/1M))
    const PRICES: [(&str, (f64, f64)); 10] = [
        // Anthropic
        ("claude-opus-4-6", (15.00, 75.00)),
        ("claude-sonnet-4-6", (3.00, 15.00)),
        ("claude-haiku-4-5", (0.80, 4.00)),
        // OpenAI
        ("gpt-4o", (5.00, 15.00)),
        ("gpt-4o-mini", (0.15, 0.60)),
        ("gpt-4-turbo", (10.00, 30.00)),
        ("o1", (15.00, 60.00)),
        ("o3-mini", (1.10, 4.40)),
        // Common aliases
        ("gpt-4", (30.00, 60.00)),
        ("gpt-3.5-turbo", (0.50, 1.50)),
    ];
    PRICES.iter().copied().collect()
}
= self.by_ghost.iter().collect(); + ghosts.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal)); + for (ghost, cost) in ghosts { + lines.push(format!(" \u{2022} {}: ${:.4}", ghost, cost)); + } + } + + lines.join("\n") + } +} + +/// Calculate USD cost for given token counts and model. +pub fn calculate_cost(model: &str, input_tokens: u64, output_tokens: u64, config: &CostConfig) -> f64 { + // Check config overrides first + let prices = if let Some(prices) = config.model_prices.get(model) { + (prices[0], prices[1]) + } else { + let builtin = builtin_prices(); + // Try exact match, then prefix match + if let Some(&(inp, out)) = builtin.get(model) { + (inp, out) + } else { + // Try prefix: "claude-sonnet-4-6-20251022" -> "claude-sonnet-4-6" + let matched = builtin.iter() + .find(|(k, _)| model.starts_with(*k)) + .map(|(_, v)| *v); + matched.unwrap_or((0.0, 0.0)) + } + }; + (input_tokens as f64 / 1_000_000.0 * prices.0) + + (output_tokens as f64 / 1_000_000.0 * prices.1) +} + +/// Persistent cost tracker backed by SQLite. +pub struct CostTracker { + conn: Mutex, + config: CostConfig, +} + +impl CostTracker { + pub fn new(conn: Connection, config: CostConfig) -> Result { + { + conn.execute_batch( + "CREATE TABLE IF NOT EXISTS cost_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_key TEXT NOT NULL, + model TEXT NOT NULL, + ghost TEXT, + input_tokens INTEGER NOT NULL DEFAULT 0, + output_tokens INTEGER NOT NULL DEFAULT 0, + cost_usd REAL NOT NULL DEFAULT 0.0, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + CREATE INDEX IF NOT EXISTS idx_cost_log_session ON cost_log(session_key); + CREATE INDEX IF NOT EXISTS idx_cost_log_created ON cost_log(created_at);", + )?; + } + Ok(Self { conn: Mutex::new(conn), config }) + } + + /// Record a token usage event and check budget. 
+ pub fn record(&self, usage: &TokenUsage) -> Result<()> { + if !self.config.enabled { + return Ok(()); + } + let conn = self.conn.lock().map_err(|_| SparksError::Internal("cost lock poisoned".into()))?; + conn.execute( + "INSERT INTO cost_log (session_key, model, ghost, input_tokens, output_tokens, cost_usd) + VALUES (?1, ?2, ?3, ?4, ?5, ?6)", + params![ + usage.session_key, + usage.model, + usage.ghost, + usage.input_tokens as i64, + usage.output_tokens as i64, + usage.cost_usd, + ], + )?; + Ok(()) + } + + /// Get cost summary for today. + pub fn today_summary(&self) -> Result { + self.summary_since("date('now')") + } + + /// Get cost summary for a session. + pub fn session_summary(&self, session_key: &str) -> Result { + let conn = self.conn.lock().map_err(|_| SparksError::Internal("cost lock poisoned".into()))?; + let mut stmt = conn.prepare( + "SELECT model, ghost, input_tokens, output_tokens, cost_usd + FROM cost_log WHERE session_key = ?1" + )?; + self.aggregate_rows(&mut stmt, rusqlite::params![session_key]) + } + + fn summary_since(&self, date_expr: &str) -> Result { + let conn = self.conn.lock().map_err(|_| SparksError::Internal("cost lock poisoned".into()))?; + let sql = format!( + "SELECT model, ghost, input_tokens, output_tokens, cost_usd + FROM cost_log WHERE date(created_at) >= {}", date_expr + ); + let mut stmt = conn.prepare(&sql)?; + self.aggregate_rows(&mut stmt, rusqlite::params![]) + } + + fn aggregate_rows( + &self, + stmt: &mut rusqlite::Statement<'_>, + params: &[&dyn rusqlite::ToSql], + ) -> Result { + let mut summary = CostSummary::default(); + let rows = stmt.query_map(params, |row| { + Ok(( + row.get::<_, String>(0)?, // model + row.get::<_, Option>(1)?, // ghost + row.get::<_, i64>(2)? as u64, // input_tokens + row.get::<_, i64>(3)? 
as u64, // output_tokens + row.get::<_, f64>(4)?, // cost_usd + )) + })?; + + for row in rows { + let (model, ghost, input, output, cost) = row?; + summary.total_input_tokens += input; + summary.total_output_tokens += output; + summary.total_cost_usd += cost; + summary.record_count += 1; + *summary.by_model.entry(model).or_default() += cost; + if let Some(g) = ghost { + *summary.by_ghost.entry(g).or_default() += cost; + } + } + Ok(summary) + } + + /// Check if the daily budget is exceeded. Returns Err if block_on_exceeded and over budget. + pub fn check_daily_budget(&self) -> Result<()> { + if self.config.daily_budget_usd <= 0.0 || !self.config.enabled { + return Ok(()); + } + let summary = self.today_summary()?; + if summary.total_cost_usd >= self.config.daily_budget_usd { + let msg = format!( + "Daily cost budget exceeded: ${:.4} >= ${:.2}", + summary.total_cost_usd, self.config.daily_budget_usd + ); + if self.config.on_budget_exceeded == "block" { + return Err(SparksError::Tool(msg)); + } else { + tracing::warn!("{}", msg); + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn calculate_cost_known_model() { + let config = CostConfig::default(); + // claude-sonnet: $3/1M input, $15/1M output + let cost = calculate_cost("claude-sonnet-4-6", 1_000_000, 1_000_000, &config); + assert!((cost - 18.0).abs() < 0.01, "Expected ~$18, got ${}", cost); + } + + #[test] + fn calculate_cost_unknown_model() { + let config = CostConfig::default(); + let cost = calculate_cost("unknown-model-xyz", 1000, 1000, &config); + assert_eq!(cost, 0.0); + } + + #[test] + fn calculate_cost_config_override() { + let mut config = CostConfig::default(); + config.model_prices.insert("my-model".to_string(), [10.0, 20.0]); + let cost = calculate_cost("my-model", 1_000_000, 500_000, &config); + assert!((cost - 20.0).abs() < 0.01); // $10 input + $10 output + } + + #[test] + fn calculate_cost_prefix_match() { + let config = CostConfig::default(); + // Should match 
"claude-sonnet-4-6" prefix + let cost = calculate_cost("claude-sonnet-4-6-20251022", 1_000_000, 0, &config); + assert!((cost - 3.0).abs() < 0.01); + } + + #[test] + fn cost_tracker_record_and_summarize() { + let conn = Connection::open_in_memory().unwrap(); + let tracker = CostTracker::new(conn, CostConfig::default()).unwrap(); + let usage = TokenUsage { + session_key: "test:session".to_string(), + model: "claude-sonnet-4-6".to_string(), + ghost: Some("coder".to_string()), + input_tokens: 1000, + output_tokens: 500, + cost_usd: 0.0105, + }; + tracker.record(&usage).unwrap(); + let summary = tracker.session_summary("test:session").unwrap(); + assert_eq!(summary.record_count, 1); + assert_eq!(summary.total_input_tokens, 1000); + assert!((summary.total_cost_usd - 0.0105).abs() < 0.0001); + assert!(summary.by_ghost.contains_key("coder")); + } + + #[test] + fn cost_summary_format_report() { + let mut summary = CostSummary::default(); + summary.total_cost_usd = 1.2345; + summary.total_input_tokens = 100_000; + summary.total_output_tokens = 50_000; + summary.record_count = 10; + summary.by_model.insert("claude-sonnet-4-6".to_string(), 1.2345); + let report = summary.format_report(); + assert!(report.contains("$1.2345")); + assert!(report.contains("claude-sonnet-4-6")); + } + + #[test] + fn builtin_prices_non_empty() { + let prices = builtin_prices(); + assert!(!prices.is_empty()); + assert!(prices.contains_key("claude-sonnet-4-6")); + assert!(prices.contains_key("gpt-4o")); + } + + #[test] + fn delivery_channel_severity() { + // Verify severity ranking is strictly ordered + assert!(severity_rank("critical") > severity_rank("warning")); + assert!(severity_rank("warning") > severity_rank("info")); + } + + fn severity_rank(s: &str) -> u8 { + match s { + "critical" => 3, + "warning" => 2, + "info" => 1, + _ => 0, + } + } +} diff --git a/src/leaderboard.rs b/src/leaderboard.rs new file mode 100644 index 0000000..90f7e48 --- /dev/null +++ b/src/leaderboard.rs @@ -0,0 +1,377 @@ 
/// A single recorded task outcome for one ghost.
#[derive(Debug, Clone)]
pub struct TaskOutcome {
    pub session_key: String,
    pub ghost: String,
    pub success: bool,
    pub latency_ms: u64,
    pub input_tokens: u64,
    pub output_tokens: u64,
    // -1 (thumbs down), 0 (neutral), 1 (thumbs up); None when unrated.
    // NOTE(review): integer width reconstructed as i8 — stored as i64 in SQLite either way.
    pub user_rating: Option<i8>,
}

/// Aggregated performance metrics for a single ghost profile.
#[derive(Debug, Clone, Default)]
pub struct GhostMetrics {
    pub ghost: String,
    pub total_tasks: u64,
    pub successful_tasks: u64,
    pub success_rate: f64,
    pub avg_latency_ms: f64,
    pub avg_input_tokens: f64,
    pub avg_output_tokens: f64,
    pub avg_rating: f64,
    pub rated_tasks: u64,
}

impl GhostMetrics {
    /// Composite ranking score in [0, 1]:
    /// 60% success rate, 20% normalized user rating, 20% token efficiency.
    ///
    /// Missing signals fall back to a neutral 0.5 so a ghost is neither
    /// rewarded nor punished for unknowns (no ratings, no token data).
    pub fn rank_score(&self) -> f64 {
        let avg_tokens = self.avg_input_tokens + self.avg_output_tokens;
        // Efficiency: 1.0 at ~zero tokens, linearly down to 0.0 at >= 10k avg tokens.
        let efficiency = if avg_tokens > 0.0 {
            1.0 - (avg_tokens / 10_000.0).min(1.0)
        } else {
            0.5
        };
        // Map the average rating from [-1, 1] onto [0, 1]; neutral when unrated.
        let rating = if self.rated_tasks > 0 {
            (self.avg_rating + 1.0) / 2.0
        } else {
            0.5
        };
        0.6 * self.success_rate + 0.2 * rating + 0.2 * efficiency
    }

    /// Render one leaderboard row, e.g.
    /// `#1 coder ... ★★★★☆ 83% success 1200ms avg 12 tasks`.
    pub fn format_row(&self, rank: usize) -> String {
        // Star rating from the truncated success-rate decile (0..=10).
        let decile = (self.success_rate * 10.0) as u32;
        let stars = if decile >= 9 {
            "★★★★★"
        } else if decile >= 7 {
            "★★★★☆"
        } else if decile >= 5 {
            "★★★☆☆"
        } else if decile >= 3 {
            "★★☆☆☆"
        } else {
            "★☆☆☆☆"
        };
        format!(
            "#{} {:20} {} {:.0}% success {:.0}ms avg {} tasks",
            rank,
            self.ghost,
            stars,
            self.success_rate * 100.0,
            self.avg_latency_ms,
            self.total_tasks,
        )
    }
}
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum AbRoute { + Control, + Challenger(String), +} + +/// The leaderboard store backed by SQLite. +pub struct GhostLeaderboard { + conn: Mutex, + config: LeaderboardConfig, +} + +impl GhostLeaderboard { + pub fn new(conn: Connection, config: LeaderboardConfig) -> Result { + { + conn.execute_batch( + "CREATE TABLE IF NOT EXISTS ghost_outcomes ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_key TEXT NOT NULL, + ghost TEXT NOT NULL, + success INTEGER NOT NULL DEFAULT 0, + latency_ms INTEGER NOT NULL DEFAULT 0, + input_tokens INTEGER NOT NULL DEFAULT 0, + output_tokens INTEGER NOT NULL DEFAULT 0, + user_rating INTEGER, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + CREATE INDEX IF NOT EXISTS idx_ghost_outcomes_ghost ON ghost_outcomes(ghost); + CREATE INDEX IF NOT EXISTS idx_ghost_outcomes_created ON ghost_outcomes(created_at);", + )?; + } + Ok(Self { conn: Mutex::new(conn), config }) + } + + /// Record a task outcome. + pub fn record(&self, outcome: &TaskOutcome) -> Result<()> { + if !self.config.enabled { + return Ok(()); + } + let conn = self.conn.lock().map_err(|_| SparksError::Internal("lb lock".into()))?; + conn.execute( + "INSERT INTO ghost_outcomes (session_key, ghost, success, latency_ms, input_tokens, output_tokens, user_rating) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)", + params![ + outcome.session_key, + outcome.ghost, + outcome.success as i32, + outcome.latency_ms as i64, + outcome.input_tokens as i64, + outcome.output_tokens as i64, + outcome.user_rating.map(|r| r as i64), + ], + )?; + Ok(()) + } + + /// Get metrics for all ghosts, sorted by rank score. 
+ pub fn rankings(&self) -> Result> { + let conn = self.conn.lock().map_err(|_| SparksError::Internal("lb lock".into()))?; + let mut stmt = conn.prepare( + "SELECT ghost, + COUNT(*) as total, + SUM(success) as successes, + AVG(latency_ms) as avg_latency, + AVG(input_tokens) as avg_input, + AVG(output_tokens) as avg_output, + AVG(CASE WHEN user_rating IS NOT NULL THEN CAST(user_rating AS REAL) END) as avg_rating, + COUNT(user_rating) as rated + FROM ghost_outcomes + GROUP BY ghost + ORDER BY total DESC" + )?; + + let rows = stmt.query_map([], |row| { + Ok(( + row.get::<_, String>(0)?, + row.get::<_, i64>(1)? as u64, + row.get::<_, i64>(2)? as u64, + row.get::<_, f64>(3)?, + row.get::<_, f64>(4)?, + row.get::<_, f64>(5)?, + row.get::<_, Option>(6)?, + row.get::<_, i64>(7)? as u64, + )) + })?; + + let mut metrics = Vec::new(); + for row in rows { + let (ghost, total, successes, avg_latency, avg_input, avg_output, avg_rating, rated) = row?; + let success_rate = if total > 0 { successes as f64 / total as f64 } else { 0.0 }; + metrics.push(GhostMetrics { + ghost, + total_tasks: total, + successful_tasks: successes, + success_rate, + avg_latency_ms: avg_latency, + avg_input_tokens: avg_input, + avg_output_tokens: avg_output, + avg_rating: avg_rating.unwrap_or(0.0), + rated_tasks: rated, + }); + } + + metrics.sort_by(|a, b| b.rank_score().partial_cmp(&a.rank_score()).unwrap_or(std::cmp::Ordering::Equal)); + Ok(metrics) + } + + /// Get metrics for a specific ghost. + pub fn ghost_metrics(&self, ghost: &str) -> Result> { + Ok(self.rankings()?.into_iter().find(|m| m.ghost == ghost)) + } + + /// Determine A/B route for an incoming request. 
+ pub fn ab_route(&self) -> AbRoute { + let challenger = match &self.config.ab_test_ghost { + Some(g) if !g.is_empty() => g.clone(), + _ => return AbRoute::Control, + }; + if rand::random::() < self.config.ab_test_fraction { + AbRoute::Challenger(challenger) + } else { + AbRoute::Control + } + } + + /// Check if the challenger ghost should be promoted based on performance. + pub fn check_promotion(&self) -> Result> { + let challenger = match &self.config.ab_test_ghost { + Some(g) if !g.is_empty() => g.clone(), + _ => return Ok(None), + }; + + let rankings = self.rankings()?; + let challenger_metrics = rankings.iter().find(|m| m.ghost == challenger); + let Some(challenger_m) = challenger_metrics else { return Ok(None); }; + + if challenger_m.total_tasks < self.config.min_samples_for_recommendation { + return Ok(None); + } + + // Find the best non-challenger ghost (control) + let control_metrics = rankings.iter().find(|m| m.ghost != challenger); + let Some(control_m) = control_metrics else { return Ok(None); }; + + let improvement = challenger_m.success_rate - control_m.success_rate; + if improvement >= self.config.promotion_threshold { + Ok(Some(format!( + "Recommendation: promote ghost '{}' -- {:.1}% success rate vs {:.1}% for '{}' (+{:.1}% over {}+ tasks)", + challenger, + challenger_m.success_rate * 100.0, + control_m.success_rate * 100.0, + control_m.ghost, + improvement * 100.0, + challenger_m.total_tasks, + ))) + } else { + Ok(None) + } + } + + /// Format the leaderboard as an ASCII table. + pub fn format_leaderboard(&self) -> Result { + let rankings = self.rankings()?; + if rankings.is_empty() { + return Ok("No performance data recorded yet. 
Complete some tasks to see the leaderboard.".to_string()); + } + + let mut lines = vec![ + "Ghost Leaderboard".to_string(), + "-".repeat(60), + ]; + for (i, metrics) in rankings.iter().enumerate() { + lines.push(metrics.format_row(i + 1)); + } + lines.push("-".repeat(60)); + + if let Ok(Some(promo)) = self.check_promotion() { + lines.push(String::new()); + lines.push(promo); + } + + Ok(lines.join("\n")) + } + + /// Compare two ghosts head-to-head. + pub fn compare(&self, ghost_a: &str, ghost_b: &str) -> Result { + let m_a = self.ghost_metrics(ghost_a)?.ok_or_else(|| SparksError::Tool(format!("No data for ghost '{}'", ghost_a)))?; + let m_b = self.ghost_metrics(ghost_b)?.ok_or_else(|| SparksError::Tool(format!("No data for ghost '{}'", ghost_b)))?; + + let winner_success = if m_a.success_rate >= m_b.success_rate { ghost_a } else { ghost_b }; + let winner_speed = if m_a.avg_latency_ms <= m_b.avg_latency_ms { ghost_a } else { ghost_b }; + + Ok(format!( + "Ghost Comparison: {} vs {}\n\n\ + {:>30} {:<30}\n\ + {:>30} {:<30}\n\ + {:>30} {:<30}\n\ + {:>30} {:<30}\n\ + {:>30} {:<30}\n\n\ + Success rate winner: {}\n\ + Speed winner: {}", + ghost_a, ghost_b, + format!("Success: {:.1}%", m_a.success_rate * 100.0), + format!("Success: {:.1}%", m_b.success_rate * 100.0), + format!("Avg latency: {:.0}ms", m_a.avg_latency_ms), + format!("Avg latency: {:.0}ms", m_b.avg_latency_ms), + format!("Tasks: {}", m_a.total_tasks), + format!("Tasks: {}", m_b.total_tasks), + format!("Avg input: {:.0} tok", m_a.avg_input_tokens), + format!("Avg input: {:.0} tok", m_b.avg_input_tokens), + format!("Score: {:.3}", m_a.rank_score()), + format!("Score: {:.3}", m_b.rank_score()), + winner_success, winner_speed, + )) + } + + /// Reset all leaderboard data. 
+ pub fn reset(&self) -> Result<()> { + let conn = self.conn.lock().map_err(|_| SparksError::Internal("lb lock".into()))?; + conn.execute("DELETE FROM ghost_outcomes", [])?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_lb() -> GhostLeaderboard { + let conn = Connection::open_in_memory().unwrap(); + GhostLeaderboard::new(conn, LeaderboardConfig::default()).unwrap() + } + + #[test] + fn record_and_rank() { + let lb = test_lb(); + for i in 0..10 { + lb.record(&TaskOutcome { + session_key: "s".into(), + ghost: "coder".into(), + success: i % 3 != 0, // ~67% success + latency_ms: 1000, + input_tokens: 500, + output_tokens: 200, + user_rating: None, + }).unwrap(); + } + let rankings = lb.rankings().unwrap(); + assert_eq!(rankings.len(), 1); + assert_eq!(rankings[0].ghost, "coder"); + assert_eq!(rankings[0].total_tasks, 10); + } + + #[test] + fn ab_route_control_when_no_challenger() { + let lb = test_lb(); + assert_eq!(lb.ab_route(), AbRoute::Control); + } + + #[test] + fn ab_route_always_control_when_fraction_zero() { + let mut config = LeaderboardConfig::default(); + config.ab_test_ghost = Some("challenger".into()); + config.ab_test_fraction = 0.0; + let conn = Connection::open_in_memory().unwrap(); + let lb = GhostLeaderboard::new(conn, config).unwrap(); + // With fraction 0, should always be Control + for _ in 0..20 { + assert_eq!(lb.ab_route(), AbRoute::Control); + } + } + + #[test] + fn metrics_rank_score_success_dominant() { + let high = GhostMetrics { ghost: "h".into(), success_rate: 1.0, total_tasks: 10, ..Default::default() }; + let low = GhostMetrics { ghost: "l".into(), success_rate: 0.0, total_tasks: 10, ..Default::default() }; + assert!(high.rank_score() > low.rank_score()); + } + + #[test] + fn format_leaderboard_empty() { + let lb = test_lb(); + let text = lb.format_leaderboard().unwrap(); + assert!(text.contains("No performance data")); + } + + #[test] + fn compare_two_ghosts() { + let lb = test_lb(); + lb.record(&TaskOutcome { 
session_key: "s".into(), ghost: "alpha".into(), success: true, latency_ms: 800, input_tokens: 300, output_tokens: 100, user_rating: None }).unwrap(); + lb.record(&TaskOutcome { session_key: "s".into(), ghost: "beta".into(), success: false, latency_ms: 1200, input_tokens: 600, output_tokens: 200, user_rating: None }).unwrap(); + let result = lb.compare("alpha", "beta").unwrap(); + assert!(result.contains("alpha")); + assert!(result.contains("beta")); + } + + #[test] + fn promotion_check_insufficient_samples() { + let mut config = LeaderboardConfig::default(); + config.ab_test_ghost = Some("challenger".into()); + config.min_samples_for_recommendation = 50; + let conn = Connection::open_in_memory().unwrap(); + let lb = GhostLeaderboard::new(conn, config).unwrap(); + lb.record(&TaskOutcome { session_key: "s".into(), ghost: "challenger".into(), success: true, latency_ms: 500, input_tokens: 200, output_tokens: 80, user_rating: None }).unwrap(); + // Only 1 sample, need 50 + assert!(lb.check_promotion().unwrap().is_none()); + } +} diff --git a/src/main.rs b/src/main.rs index 2534312..20fedd5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,6 +16,7 @@ mod feature_contract; mod ghost_policy; mod heartbeat; mod introspect; +mod leaderboard; mod knobs; mod kpi; mod langfuse; @@ -257,6 +258,11 @@ enum Commands { #[command(subcommand)] action: SnapshotAction, }, + /// View ghost performance leaderboard and A/B test results + Leaderboard { + #[command(subcommand)] + action: LeaderboardAction, + }, } #[derive(Subcommand)] @@ -282,6 +288,19 @@ enum SnapshotAction { }, } +#[derive(Subcommand)] +enum LeaderboardAction { + /// Show the full leaderboard + Show, + /// Compare two ghosts head-to-head + Compare { + ghost_a: String, + ghost_b: String, + }, + /// Reset all leaderboard data + Reset, +} + #[derive(Subcommand)] enum OpenaiAction { /// Start OAuth login flow @@ -1013,6 +1032,24 @@ async fn main() -> anyhow::Result<()> { } } } + Some(Commands::Leaderboard { action }) => { + 
let db_path = config.db_path()?; + let conn = rusqlite::Connection::open(&db_path)?; + let lb = leaderboard::GhostLeaderboard::new(conn, config.leaderboard.clone())?; + match action { + LeaderboardAction::Show => println!("{}", lb.format_leaderboard()?), + LeaderboardAction::Compare { ghost_a, ghost_b } => println!("{}", lb.compare(&ghost_a, &ghost_b)?), + LeaderboardAction::Reset => { + println!("This will delete all leaderboard data. Are you sure? [y/N]"); + let mut line = String::new(); + std::io::stdin().read_line(&mut line)?; + if line.trim().eq_ignore_ascii_case("y") { + lb.reset()?; + println!("Leaderboard reset."); + } + } + } + } Some(Commands::Chat) | None => run_chat(config, memory, auto_approve).await?, } diff --git a/src/snapshot.rs b/src/snapshot.rs index 46ec6cf..76ca7b0 100644 --- a/src/snapshot.rs +++ b/src/snapshot.rs @@ -274,6 +274,7 @@ fn meta_path_for(snap_path: &Path) -> PathBuf { no_tar.with_extension("json") } + fn extract_snapshot(archive: &Path, dest: &Path) -> Result<()> { let output = Command::new("tar") .arg("xzf") From 7f76e4fd4a8eb9fea7069e300f999a749f7b5de0 Mon Sep 17 00:00:00 2001 From: Enreign Date: Tue, 17 Mar 2026 01:10:49 +0100 Subject: [PATCH 2/2] fix(leaderboard): second review pass - Add TaskOutcome::new() convenience constructor to reduce struct literal verbosity in tests - Fix compare() output: add column headers (ghost names + separator) so each column is identifiable - Use successful_tasks in format_row() ("{success}/{total}") to eliminate dead-field warning - Add four missing tests: ab_route with fraction=1.0, rank_score with zero tokens, reset(), and format_leaderboard with data - Add [leaderboard] section to config.example.toml with all fields documented Co-Authored-By: Claude Sonnet 4.6 --- config.example.toml | 13 +++++++ src/leaderboard.rs | 90 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 88 insertions(+), 15 deletions(-) diff --git a/config.example.toml b/config.example.toml index cb3ab7d..f29c8f4 
100644 --- a/config.example.toml +++ b/config.example.toml @@ -379,3 +379,16 @@ include = ["."] # Paths/patterns to exclude from the snapshot. # (default: target/, .git/, .worktrees/, *.db, *.log) exclude = ["target/", ".git/", ".worktrees/", "*.db", "*.log"] + +# Ghost performance leaderboard and A/B testing. +# Run `sparks leaderboard show` to view rankings, or `sparks leaderboard compare ghost-a ghost-b`. +[leaderboard] +enabled = true +# Ghost name to A/B test against the default ghost. Omit or leave empty to disable A/B routing. +# ab_test_ghost = "experimental-coder" +# Fraction of requests routed to the challenger ghost (0.0 = never, 1.0 = always). +ab_test_fraction = 0.1 +# Minimum recorded samples before a promotion recommendation is surfaced. +min_samples_for_recommendation = 50 +# Success-rate improvement (as a fraction) required to trigger a promotion recommendation. +promotion_threshold = 0.10 diff --git a/src/leaderboard.rs b/src/leaderboard.rs index 90f7e48..965db3f 100644 --- a/src/leaderboard.rs +++ b/src/leaderboard.rs @@ -22,6 +22,21 @@ pub struct TaskOutcome { pub user_rating: Option, // -1 (thumbs down), 0 (neutral), 1 (thumbs up) } +impl TaskOutcome { + /// Convenience constructor for the common case of no user rating. + pub fn new(session_key: impl Into, ghost: impl Into, success: bool, latency_ms: u64, input_tokens: u64, output_tokens: u64) -> Self { + Self { + session_key: session_key.into(), + ghost: ghost.into(), + success, + latency_ms, + input_tokens, + output_tokens, + user_rating: None, + } + } +} + /// Aggregated performance metrics for a ghost. 
#[derive(Debug, Clone, Default)] pub struct GhostMetrics { @@ -57,11 +72,12 @@ impl GhostMetrics { _ => "★☆☆☆☆", }; format!( - "#{} {:20} {} {:.0}% success {:.0}ms avg {} tasks", + "#{} {:20} {} {:.0}% success ({}/{}) {:.0}ms avg", rank, self.ghost, stars, self.success_rate * 100.0, - self.avg_latency_ms, + self.successful_tasks, self.total_tasks, + self.avg_latency_ms, ) } } @@ -261,6 +277,8 @@ impl GhostLeaderboard { Ok(format!( "Ghost Comparison: {} vs {}\n\n\ + {:>30} {:<30}\n\ + {}\n\ {:>30} {:<30}\n\ {:>30} {:<30}\n\ {:>30} {:<30}\n\ @@ -269,6 +287,8 @@ impl GhostLeaderboard { Success rate winner: {}\n\ Speed winner: {}", ghost_a, ghost_b, + ghost_a, ghost_b, + "-".repeat(63), format!("Success: {:.1}%", m_a.success_rate * 100.0), format!("Success: {:.1}%", m_b.success_rate * 100.0), format!("Avg latency: {:.0}ms", m_a.avg_latency_ms), @@ -300,19 +320,15 @@ mod tests { GhostLeaderboard::new(conn, LeaderboardConfig::default()).unwrap() } + fn make_outcome(ghost: &str, success: bool) -> TaskOutcome { + TaskOutcome::new("s", ghost, success, 1000, 500, 200) + } + #[test] fn record_and_rank() { let lb = test_lb(); for i in 0..10 { - lb.record(&TaskOutcome { - session_key: "s".into(), - ghost: "coder".into(), - success: i % 3 != 0, // ~67% success - latency_ms: 1000, - input_tokens: 500, - output_tokens: 200, - user_rating: None, - }).unwrap(); + lb.record(&TaskOutcome::new("s", "coder", i % 3 != 0, 1000, 500, 200)).unwrap(); } let rankings = lb.rankings().unwrap(); assert_eq!(rankings.len(), 1); @@ -333,12 +349,25 @@ mod tests { config.ab_test_fraction = 0.0; let conn = Connection::open_in_memory().unwrap(); let lb = GhostLeaderboard::new(conn, config).unwrap(); - // With fraction 0, should always be Control + // With fraction 0.0, rand::random::() is always >= 0.0, never < 0.0. 
for _ in 0..20 { assert_eq!(lb.ab_route(), AbRoute::Control); } } + #[test] + fn ab_route_always_challenger_when_fraction_one() { + let mut config = LeaderboardConfig::default(); + config.ab_test_ghost = Some("challenger".into()); + config.ab_test_fraction = 1.0; + let conn = Connection::open_in_memory().unwrap(); + let lb = GhostLeaderboard::new(conn, config).unwrap(); + // rand::random::() returns [0.0, 1.0), so < 1.0 is always true. + for _ in 0..20 { + assert_eq!(lb.ab_route(), AbRoute::Challenger("challenger".into())); + } + } + #[test] fn metrics_rank_score_success_dominant() { let high = GhostMetrics { ghost: "h".into(), success_rate: 1.0, total_tasks: 10, ..Default::default() }; @@ -346,6 +375,16 @@ mod tests { assert!(high.rank_score() > low.rank_score()); } + #[test] + fn rank_score_zero_tokens_returns_neutral_efficiency() { + // When avg_input_tokens and avg_output_tokens are both 0.0 (Default), + // efficiency should fall back to 0.5 (neutral) rather than 1.0 (perfect). + let m = GhostMetrics { ghost: "g".into(), success_rate: 0.5, total_tasks: 1, ..Default::default() }; + // score = 0.6*0.5 + 0.2*0.5 + 0.2*0.5 = 0.5 + let score = m.rank_score(); + assert!((score - 0.5).abs() < 1e-9, "expected 0.5, got {score}"); + } + #[test] fn format_leaderboard_empty() { let lb = test_lb(); @@ -353,14 +392,35 @@ mod tests { assert!(text.contains("No performance data")); } + #[test] + fn format_leaderboard_with_data_shows_ghost_name() { + let lb = test_lb(); + lb.record(&make_outcome("scout", true)).unwrap(); + let text = lb.format_leaderboard().unwrap(); + assert!(text.contains("scout")); + assert!(text.contains("Ghost Leaderboard")); + } + #[test] fn compare_two_ghosts() { let lb = test_lb(); - lb.record(&TaskOutcome { session_key: "s".into(), ghost: "alpha".into(), success: true, latency_ms: 800, input_tokens: 300, output_tokens: 100, user_rating: None }).unwrap(); - lb.record(&TaskOutcome { session_key: "s".into(), ghost: "beta".into(), success: false, 
latency_ms: 1200, input_tokens: 600, output_tokens: 200, user_rating: None }).unwrap(); + lb.record(&TaskOutcome::new("s", "alpha", true, 800, 300, 100)).unwrap(); + lb.record(&TaskOutcome::new("s", "beta", false, 1200, 600, 200)).unwrap(); let result = lb.compare("alpha", "beta").unwrap(); assert!(result.contains("alpha")); assert!(result.contains("beta")); + // Column headers must appear in the output so each column is identifiable. + assert!(result.contains("Success rate winner:")); + assert!(result.contains("Speed winner:")); + } + + #[test] + fn reset_clears_all_data() { + let lb = test_lb(); + lb.record(&make_outcome("ghost-x", true)).unwrap(); + assert_eq!(lb.rankings().unwrap().len(), 1); + lb.reset().unwrap(); + assert!(lb.rankings().unwrap().is_empty()); } #[test] @@ -370,7 +430,7 @@ mod tests { config.min_samples_for_recommendation = 50; let conn = Connection::open_in_memory().unwrap(); let lb = GhostLeaderboard::new(conn, config).unwrap(); - lb.record(&TaskOutcome { session_key: "s".into(), ghost: "challenger".into(), success: true, latency_ms: 500, input_tokens: 200, output_tokens: 80, user_rating: None }).unwrap(); + lb.record(&TaskOutcome::new("s", "challenger", true, 500, 200, 80)).unwrap(); // Only 1 sample, need 50 assert!(lb.check_promotion().unwrap().is_none()); }