From b363fb082967778a7a3523a9669ecb2cacee769a Mon Sep 17 00:00:00 2001 From: Enreign Date: Tue, 17 Mar 2026 00:53:41 +0100 Subject: [PATCH 1/2] feat(leaderboard): ghost performance leaderboard and A/B testing - GhostLeaderboard: SQLite-backed outcome tracking per ghost profile - TaskOutcome: records success, latency, token usage, user rating (-1/0/1) - GhostMetrics: aggregate stats with composite rank_score (60% success, 20% user rating, 20% token efficiency) - A/B testing: route configurable fraction of requests to a challenger ghost - Auto-promotion: recommend promoting challenger when it outperforms control by promotion_threshold (default: 10%) over min_samples (default: 50) - ASCII leaderboard with star ratings and head-to-head comparison - 'sparks leaderboard [show|compare |reset]' CLI subcommand - 7 unit tests covering recording, ranking, A/B routing, promotion check Co-Authored-By: Claude Sonnet 4.6 --- src/config.rs | 40 +++++ src/cost.rs | 334 +++++++++++++++++++++++++++++++++++++++ src/leaderboard.rs | 377 +++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 37 +++++ src/snapshot.rs | 1 + 5 files changed, 789 insertions(+) create mode 100644 src/cost.rs create mode 100644 src/leaderboard.rs diff --git a/src/config.rs b/src/config.rs index 5e69d36..c14340f 100644 --- a/src/config.rs +++ b/src/config.rs @@ -73,6 +73,8 @@ pub struct Config { pub sonarqube: SonarqubeConfig, #[serde(default)] pub snapshot: SnapshotConfig, + #[serde(default)] + pub leaderboard: LeaderboardConfig, #[serde(skip)] inline_secret_labels: Vec, } @@ -1505,6 +1507,43 @@ impl ManagerConfig { } } +// ── Leaderboard config ──────────────────────────────────────────────── + +#[derive(Debug, Deserialize, Serialize, Clone)] +pub struct LeaderboardConfig { + /// Enable leaderboard tracking (default: true) + #[serde(default = "default_lb_enabled")] + pub enabled: bool, + /// Ghost name to A/B test against the default ghost (None = disabled) + pub ab_test_ghost: Option, + /// Fraction of 
requests routed to the challenger ghost (0.0-1.0, default: 0.1) + #[serde(default = "default_ab_fraction")] + pub ab_test_fraction: f64, + /// Minimum samples before auto-promotion recommendation (default: 50) + #[serde(default = "default_lb_min_samples")] + pub min_samples_for_recommendation: u64, + /// Success rate improvement threshold for auto-promotion (default: 0.10 = 10%) + #[serde(default = "default_lb_threshold")] + pub promotion_threshold: f64, +} + +fn default_lb_enabled() -> bool { true } +fn default_ab_fraction() -> f64 { 0.1 } +fn default_lb_min_samples() -> u64 { 50 } +fn default_lb_threshold() -> f64 { 0.10 } + +impl Default for LeaderboardConfig { + fn default() -> Self { + Self { + enabled: default_lb_enabled(), + ab_test_ghost: None, + ab_test_fraction: default_ab_fraction(), + min_samples_for_recommendation: default_lb_min_samples(), + promotion_threshold: default_lb_threshold(), + } + } +} + impl Default for Config { fn default() -> Self { Self { @@ -1538,6 +1577,7 @@ impl Default for Config { alerts: AlertsConfig::default(), sonarqube: SonarqubeConfig::default(), snapshot: SnapshotConfig::default(), + leaderboard: LeaderboardConfig::default(), inline_secret_labels: Vec::new(), } } diff --git a/src/cost.rs b/src/cost.rs new file mode 100644 index 0000000..993bebc --- /dev/null +++ b/src/cost.rs @@ -0,0 +1,334 @@ +//! Token usage and cost tracking. +//! +//! Records per-call token counts and calculates USD cost based on model pricing. +//! Enforces daily and per-session budgets when configured. + +use std::collections::HashMap; +use std::sync::Mutex; + +use rusqlite::{Connection, params}; + +use crate::config::CostConfig; +use crate::error::{SparksError, Result}; + +/// Built-in model pricing: (input_per_1m_usd, output_per_1m_usd). +/// Users can override via config.cost.model_prices. 
/// Built-in model pricing table: model id -> (input $/1M tokens, output $/1M tokens).
///
/// Users can override or extend these via `config.cost.model_prices`.
pub fn builtin_prices() -> HashMap<&'static str, (f64, f64)> {
    // (model id, (input $/1M, output $/1M))
    const PRICES: [(&str, (f64, f64)); 10] = [
        // Anthropic
        ("claude-opus-4-6", (15.00, 75.00)),
        ("claude-sonnet-4-6", (3.00, 15.00)),
        ("claude-haiku-4-5", (0.80, 4.00)),
        // OpenAI
        ("gpt-4o", (5.00, 15.00)),
        ("gpt-4o-mini", (0.15, 0.60)),
        ("gpt-4-turbo", (10.00, 30.00)),
        ("o1", (15.00, 60.00)),
        ("o3-mini", (1.10, 4.40)),
        // Common aliases
        ("gpt-4", (30.00, 60.00)),
        ("gpt-3.5-turbo", (0.50, 1.50)),
    ];
    PRICES.iter().copied().collect()
}
= self.by_ghost.iter().collect(); + ghosts.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal)); + for (ghost, cost) in ghosts { + lines.push(format!(" \u{2022} {}: ${:.4}", ghost, cost)); + } + } + + lines.join("\n") + } +} + +/// Calculate USD cost for given token counts and model. +pub fn calculate_cost(model: &str, input_tokens: u64, output_tokens: u64, config: &CostConfig) -> f64 { + // Check config overrides first + let prices = if let Some(prices) = config.model_prices.get(model) { + (prices[0], prices[1]) + } else { + let builtin = builtin_prices(); + // Try exact match, then prefix match + if let Some(&(inp, out)) = builtin.get(model) { + (inp, out) + } else { + // Try prefix: "claude-sonnet-4-6-20251022" -> "claude-sonnet-4-6" + let matched = builtin.iter() + .find(|(k, _)| model.starts_with(*k)) + .map(|(_, v)| *v); + matched.unwrap_or((0.0, 0.0)) + } + }; + (input_tokens as f64 / 1_000_000.0 * prices.0) + + (output_tokens as f64 / 1_000_000.0 * prices.1) +} + +/// Persistent cost tracker backed by SQLite. +pub struct CostTracker { + conn: Mutex, + config: CostConfig, +} + +impl CostTracker { + pub fn new(conn: Connection, config: CostConfig) -> Result { + { + conn.execute_batch( + "CREATE TABLE IF NOT EXISTS cost_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_key TEXT NOT NULL, + model TEXT NOT NULL, + ghost TEXT, + input_tokens INTEGER NOT NULL DEFAULT 0, + output_tokens INTEGER NOT NULL DEFAULT 0, + cost_usd REAL NOT NULL DEFAULT 0.0, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + CREATE INDEX IF NOT EXISTS idx_cost_log_session ON cost_log(session_key); + CREATE INDEX IF NOT EXISTS idx_cost_log_created ON cost_log(created_at);", + )?; + } + Ok(Self { conn: Mutex::new(conn), config }) + } + + /// Record a token usage event and check budget. 
+ pub fn record(&self, usage: &TokenUsage) -> Result<()> { + if !self.config.enabled { + return Ok(()); + } + let conn = self.conn.lock().map_err(|_| SparksError::Internal("cost lock poisoned".into()))?; + conn.execute( + "INSERT INTO cost_log (session_key, model, ghost, input_tokens, output_tokens, cost_usd) + VALUES (?1, ?2, ?3, ?4, ?5, ?6)", + params![ + usage.session_key, + usage.model, + usage.ghost, + usage.input_tokens as i64, + usage.output_tokens as i64, + usage.cost_usd, + ], + )?; + Ok(()) + } + + /// Get cost summary for today. + pub fn today_summary(&self) -> Result { + self.summary_since("date('now')") + } + + /// Get cost summary for a session. + pub fn session_summary(&self, session_key: &str) -> Result { + let conn = self.conn.lock().map_err(|_| SparksError::Internal("cost lock poisoned".into()))?; + let mut stmt = conn.prepare( + "SELECT model, ghost, input_tokens, output_tokens, cost_usd + FROM cost_log WHERE session_key = ?1" + )?; + self.aggregate_rows(&mut stmt, rusqlite::params![session_key]) + } + + fn summary_since(&self, date_expr: &str) -> Result { + let conn = self.conn.lock().map_err(|_| SparksError::Internal("cost lock poisoned".into()))?; + let sql = format!( + "SELECT model, ghost, input_tokens, output_tokens, cost_usd + FROM cost_log WHERE date(created_at) >= {}", date_expr + ); + let mut stmt = conn.prepare(&sql)?; + self.aggregate_rows(&mut stmt, rusqlite::params![]) + } + + fn aggregate_rows( + &self, + stmt: &mut rusqlite::Statement<'_>, + params: &[&dyn rusqlite::ToSql], + ) -> Result { + let mut summary = CostSummary::default(); + let rows = stmt.query_map(params, |row| { + Ok(( + row.get::<_, String>(0)?, // model + row.get::<_, Option>(1)?, // ghost + row.get::<_, i64>(2)? as u64, // input_tokens + row.get::<_, i64>(3)? 
as u64, // output_tokens + row.get::<_, f64>(4)?, // cost_usd + )) + })?; + + for row in rows { + let (model, ghost, input, output, cost) = row?; + summary.total_input_tokens += input; + summary.total_output_tokens += output; + summary.total_cost_usd += cost; + summary.record_count += 1; + *summary.by_model.entry(model).or_default() += cost; + if let Some(g) = ghost { + *summary.by_ghost.entry(g).or_default() += cost; + } + } + Ok(summary) + } + + /// Check if the daily budget is exceeded. Returns Err if block_on_exceeded and over budget. + pub fn check_daily_budget(&self) -> Result<()> { + if self.config.daily_budget_usd <= 0.0 || !self.config.enabled { + return Ok(()); + } + let summary = self.today_summary()?; + if summary.total_cost_usd >= self.config.daily_budget_usd { + let msg = format!( + "Daily cost budget exceeded: ${:.4} >= ${:.2}", + summary.total_cost_usd, self.config.daily_budget_usd + ); + if self.config.on_budget_exceeded == "block" { + return Err(SparksError::Tool(msg)); + } else { + tracing::warn!("{}", msg); + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn calculate_cost_known_model() { + let config = CostConfig::default(); + // claude-sonnet: $3/1M input, $15/1M output + let cost = calculate_cost("claude-sonnet-4-6", 1_000_000, 1_000_000, &config); + assert!((cost - 18.0).abs() < 0.01, "Expected ~$18, got ${}", cost); + } + + #[test] + fn calculate_cost_unknown_model() { + let config = CostConfig::default(); + let cost = calculate_cost("unknown-model-xyz", 1000, 1000, &config); + assert_eq!(cost, 0.0); + } + + #[test] + fn calculate_cost_config_override() { + let mut config = CostConfig::default(); + config.model_prices.insert("my-model".to_string(), [10.0, 20.0]); + let cost = calculate_cost("my-model", 1_000_000, 500_000, &config); + assert!((cost - 20.0).abs() < 0.01); // $10 input + $10 output + } + + #[test] + fn calculate_cost_prefix_match() { + let config = CostConfig::default(); + // Should match 
"claude-sonnet-4-6" prefix + let cost = calculate_cost("claude-sonnet-4-6-20251022", 1_000_000, 0, &config); + assert!((cost - 3.0).abs() < 0.01); + } + + #[test] + fn cost_tracker_record_and_summarize() { + let conn = Connection::open_in_memory().unwrap(); + let tracker = CostTracker::new(conn, CostConfig::default()).unwrap(); + let usage = TokenUsage { + session_key: "test:session".to_string(), + model: "claude-sonnet-4-6".to_string(), + ghost: Some("coder".to_string()), + input_tokens: 1000, + output_tokens: 500, + cost_usd: 0.0105, + }; + tracker.record(&usage).unwrap(); + let summary = tracker.session_summary("test:session").unwrap(); + assert_eq!(summary.record_count, 1); + assert_eq!(summary.total_input_tokens, 1000); + assert!((summary.total_cost_usd - 0.0105).abs() < 0.0001); + assert!(summary.by_ghost.contains_key("coder")); + } + + #[test] + fn cost_summary_format_report() { + let mut summary = CostSummary::default(); + summary.total_cost_usd = 1.2345; + summary.total_input_tokens = 100_000; + summary.total_output_tokens = 50_000; + summary.record_count = 10; + summary.by_model.insert("claude-sonnet-4-6".to_string(), 1.2345); + let report = summary.format_report(); + assert!(report.contains("$1.2345")); + assert!(report.contains("claude-sonnet-4-6")); + } + + #[test] + fn builtin_prices_non_empty() { + let prices = builtin_prices(); + assert!(!prices.is_empty()); + assert!(prices.contains_key("claude-sonnet-4-6")); + assert!(prices.contains_key("gpt-4o")); + } + + #[test] + fn delivery_channel_severity() { + // Verify severity ranking is strictly ordered + assert!(severity_rank("critical") > severity_rank("warning")); + assert!(severity_rank("warning") > severity_rank("info")); + } + + fn severity_rank(s: &str) -> u8 { + match s { + "critical" => 3, + "warning" => 2, + "info" => 1, + _ => 0, + } + } +} diff --git a/src/leaderboard.rs b/src/leaderboard.rs new file mode 100644 index 0000000..90f7e48 --- /dev/null +++ b/src/leaderboard.rs @@ -0,0 +1,377 @@ 
/// A single recorded task outcome for one ghost.
#[derive(Debug, Clone)]
pub struct TaskOutcome {
    pub session_key: String,
    pub ghost: String,
    pub success: bool,
    pub latency_ms: u64,
    pub input_tokens: u64,
    pub output_tokens: u64,
    // -1 (thumbs down), 0 (neutral), 1 (thumbs up); None when unrated.
    // NOTE(review): integer width reconstructed as i8 — stored as i64 in SQLite either way.
    pub user_rating: Option<i8>,
}

/// Aggregated performance metrics for a single ghost profile.
#[derive(Debug, Clone, Default)]
pub struct GhostMetrics {
    pub ghost: String,
    pub total_tasks: u64,
    pub successful_tasks: u64,
    pub success_rate: f64,
    pub avg_latency_ms: f64,
    pub avg_input_tokens: f64,
    pub avg_output_tokens: f64,
    pub avg_rating: f64,
    pub rated_tasks: u64,
}

impl GhostMetrics {
    /// Composite ranking score in [0, 1]:
    /// 60% success rate, 20% normalized user rating, 20% token efficiency.
    ///
    /// Missing signals fall back to a neutral 0.5 so a ghost is neither
    /// rewarded nor punished for unknowns (no ratings, no token data).
    pub fn rank_score(&self) -> f64 {
        let avg_tokens = self.avg_input_tokens + self.avg_output_tokens;
        // Efficiency: 1.0 at ~zero tokens, linearly down to 0.0 at >= 10k avg tokens.
        let efficiency = if avg_tokens > 0.0 {
            1.0 - (avg_tokens / 10_000.0).min(1.0)
        } else {
            0.5
        };
        // Map the average rating from [-1, 1] onto [0, 1]; neutral when unrated.
        let rating = if self.rated_tasks > 0 {
            (self.avg_rating + 1.0) / 2.0
        } else {
            0.5
        };
        0.6 * self.success_rate + 0.2 * rating + 0.2 * efficiency
    }

    /// Render one leaderboard row, e.g.
    /// `#1 coder ... ★★★★☆ 83% success 1200ms avg 12 tasks`.
    pub fn format_row(&self, rank: usize) -> String {
        // Star rating from the truncated success-rate decile (0..=10).
        let decile = (self.success_rate * 10.0) as u32;
        let stars = if decile >= 9 {
            "★★★★★"
        } else if decile >= 7 {
            "★★★★☆"
        } else if decile >= 5 {
            "★★★☆☆"
        } else if decile >= 3 {
            "★★☆☆☆"
        } else {
            "★☆☆☆☆"
        };
        format!(
            "#{} {:20} {} {:.0}% success {:.0}ms avg {} tasks",
            rank,
            self.ghost,
            stars,
            self.success_rate * 100.0,
            self.avg_latency_ms,
            self.total_tasks,
        )
    }
}
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum AbRoute { + Control, + Challenger(String), +} + +/// The leaderboard store backed by SQLite. +pub struct GhostLeaderboard { + conn: Mutex, + config: LeaderboardConfig, +} + +impl GhostLeaderboard { + pub fn new(conn: Connection, config: LeaderboardConfig) -> Result { + { + conn.execute_batch( + "CREATE TABLE IF NOT EXISTS ghost_outcomes ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_key TEXT NOT NULL, + ghost TEXT NOT NULL, + success INTEGER NOT NULL DEFAULT 0, + latency_ms INTEGER NOT NULL DEFAULT 0, + input_tokens INTEGER NOT NULL DEFAULT 0, + output_tokens INTEGER NOT NULL DEFAULT 0, + user_rating INTEGER, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ); + CREATE INDEX IF NOT EXISTS idx_ghost_outcomes_ghost ON ghost_outcomes(ghost); + CREATE INDEX IF NOT EXISTS idx_ghost_outcomes_created ON ghost_outcomes(created_at);", + )?; + } + Ok(Self { conn: Mutex::new(conn), config }) + } + + /// Record a task outcome. + pub fn record(&self, outcome: &TaskOutcome) -> Result<()> { + if !self.config.enabled { + return Ok(()); + } + let conn = self.conn.lock().map_err(|_| SparksError::Internal("lb lock".into()))?; + conn.execute( + "INSERT INTO ghost_outcomes (session_key, ghost, success, latency_ms, input_tokens, output_tokens, user_rating) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)", + params![ + outcome.session_key, + outcome.ghost, + outcome.success as i32, + outcome.latency_ms as i64, + outcome.input_tokens as i64, + outcome.output_tokens as i64, + outcome.user_rating.map(|r| r as i64), + ], + )?; + Ok(()) + } + + /// Get metrics for all ghosts, sorted by rank score. 
+ pub fn rankings(&self) -> Result> { + let conn = self.conn.lock().map_err(|_| SparksError::Internal("lb lock".into()))?; + let mut stmt = conn.prepare( + "SELECT ghost, + COUNT(*) as total, + SUM(success) as successes, + AVG(latency_ms) as avg_latency, + AVG(input_tokens) as avg_input, + AVG(output_tokens) as avg_output, + AVG(CASE WHEN user_rating IS NOT NULL THEN CAST(user_rating AS REAL) END) as avg_rating, + COUNT(user_rating) as rated + FROM ghost_outcomes + GROUP BY ghost + ORDER BY total DESC" + )?; + + let rows = stmt.query_map([], |row| { + Ok(( + row.get::<_, String>(0)?, + row.get::<_, i64>(1)? as u64, + row.get::<_, i64>(2)? as u64, + row.get::<_, f64>(3)?, + row.get::<_, f64>(4)?, + row.get::<_, f64>(5)?, + row.get::<_, Option>(6)?, + row.get::<_, i64>(7)? as u64, + )) + })?; + + let mut metrics = Vec::new(); + for row in rows { + let (ghost, total, successes, avg_latency, avg_input, avg_output, avg_rating, rated) = row?; + let success_rate = if total > 0 { successes as f64 / total as f64 } else { 0.0 }; + metrics.push(GhostMetrics { + ghost, + total_tasks: total, + successful_tasks: successes, + success_rate, + avg_latency_ms: avg_latency, + avg_input_tokens: avg_input, + avg_output_tokens: avg_output, + avg_rating: avg_rating.unwrap_or(0.0), + rated_tasks: rated, + }); + } + + metrics.sort_by(|a, b| b.rank_score().partial_cmp(&a.rank_score()).unwrap_or(std::cmp::Ordering::Equal)); + Ok(metrics) + } + + /// Get metrics for a specific ghost. + pub fn ghost_metrics(&self, ghost: &str) -> Result> { + Ok(self.rankings()?.into_iter().find(|m| m.ghost == ghost)) + } + + /// Determine A/B route for an incoming request. 
+ pub fn ab_route(&self) -> AbRoute { + let challenger = match &self.config.ab_test_ghost { + Some(g) if !g.is_empty() => g.clone(), + _ => return AbRoute::Control, + }; + if rand::random::() < self.config.ab_test_fraction { + AbRoute::Challenger(challenger) + } else { + AbRoute::Control + } + } + + /// Check if the challenger ghost should be promoted based on performance. + pub fn check_promotion(&self) -> Result> { + let challenger = match &self.config.ab_test_ghost { + Some(g) if !g.is_empty() => g.clone(), + _ => return Ok(None), + }; + + let rankings = self.rankings()?; + let challenger_metrics = rankings.iter().find(|m| m.ghost == challenger); + let Some(challenger_m) = challenger_metrics else { return Ok(None); }; + + if challenger_m.total_tasks < self.config.min_samples_for_recommendation { + return Ok(None); + } + + // Find the best non-challenger ghost (control) + let control_metrics = rankings.iter().find(|m| m.ghost != challenger); + let Some(control_m) = control_metrics else { return Ok(None); }; + + let improvement = challenger_m.success_rate - control_m.success_rate; + if improvement >= self.config.promotion_threshold { + Ok(Some(format!( + "Recommendation: promote ghost '{}' -- {:.1}% success rate vs {:.1}% for '{}' (+{:.1}% over {}+ tasks)", + challenger, + challenger_m.success_rate * 100.0, + control_m.success_rate * 100.0, + control_m.ghost, + improvement * 100.0, + challenger_m.total_tasks, + ))) + } else { + Ok(None) + } + } + + /// Format the leaderboard as an ASCII table. + pub fn format_leaderboard(&self) -> Result { + let rankings = self.rankings()?; + if rankings.is_empty() { + return Ok("No performance data recorded yet. 
Complete some tasks to see the leaderboard.".to_string()); + } + + let mut lines = vec![ + "Ghost Leaderboard".to_string(), + "-".repeat(60), + ]; + for (i, metrics) in rankings.iter().enumerate() { + lines.push(metrics.format_row(i + 1)); + } + lines.push("-".repeat(60)); + + if let Ok(Some(promo)) = self.check_promotion() { + lines.push(String::new()); + lines.push(promo); + } + + Ok(lines.join("\n")) + } + + /// Compare two ghosts head-to-head. + pub fn compare(&self, ghost_a: &str, ghost_b: &str) -> Result { + let m_a = self.ghost_metrics(ghost_a)?.ok_or_else(|| SparksError::Tool(format!("No data for ghost '{}'", ghost_a)))?; + let m_b = self.ghost_metrics(ghost_b)?.ok_or_else(|| SparksError::Tool(format!("No data for ghost '{}'", ghost_b)))?; + + let winner_success = if m_a.success_rate >= m_b.success_rate { ghost_a } else { ghost_b }; + let winner_speed = if m_a.avg_latency_ms <= m_b.avg_latency_ms { ghost_a } else { ghost_b }; + + Ok(format!( + "Ghost Comparison: {} vs {}\n\n\ + {:>30} {:<30}\n\ + {:>30} {:<30}\n\ + {:>30} {:<30}\n\ + {:>30} {:<30}\n\ + {:>30} {:<30}\n\n\ + Success rate winner: {}\n\ + Speed winner: {}", + ghost_a, ghost_b, + format!("Success: {:.1}%", m_a.success_rate * 100.0), + format!("Success: {:.1}%", m_b.success_rate * 100.0), + format!("Avg latency: {:.0}ms", m_a.avg_latency_ms), + format!("Avg latency: {:.0}ms", m_b.avg_latency_ms), + format!("Tasks: {}", m_a.total_tasks), + format!("Tasks: {}", m_b.total_tasks), + format!("Avg input: {:.0} tok", m_a.avg_input_tokens), + format!("Avg input: {:.0} tok", m_b.avg_input_tokens), + format!("Score: {:.3}", m_a.rank_score()), + format!("Score: {:.3}", m_b.rank_score()), + winner_success, winner_speed, + )) + } + + /// Reset all leaderboard data. 
+ pub fn reset(&self) -> Result<()> { + let conn = self.conn.lock().map_err(|_| SparksError::Internal("lb lock".into()))?; + conn.execute("DELETE FROM ghost_outcomes", [])?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_lb() -> GhostLeaderboard { + let conn = Connection::open_in_memory().unwrap(); + GhostLeaderboard::new(conn, LeaderboardConfig::default()).unwrap() + } + + #[test] + fn record_and_rank() { + let lb = test_lb(); + for i in 0..10 { + lb.record(&TaskOutcome { + session_key: "s".into(), + ghost: "coder".into(), + success: i % 3 != 0, // ~67% success + latency_ms: 1000, + input_tokens: 500, + output_tokens: 200, + user_rating: None, + }).unwrap(); + } + let rankings = lb.rankings().unwrap(); + assert_eq!(rankings.len(), 1); + assert_eq!(rankings[0].ghost, "coder"); + assert_eq!(rankings[0].total_tasks, 10); + } + + #[test] + fn ab_route_control_when_no_challenger() { + let lb = test_lb(); + assert_eq!(lb.ab_route(), AbRoute::Control); + } + + #[test] + fn ab_route_always_control_when_fraction_zero() { + let mut config = LeaderboardConfig::default(); + config.ab_test_ghost = Some("challenger".into()); + config.ab_test_fraction = 0.0; + let conn = Connection::open_in_memory().unwrap(); + let lb = GhostLeaderboard::new(conn, config).unwrap(); + // With fraction 0, should always be Control + for _ in 0..20 { + assert_eq!(lb.ab_route(), AbRoute::Control); + } + } + + #[test] + fn metrics_rank_score_success_dominant() { + let high = GhostMetrics { ghost: "h".into(), success_rate: 1.0, total_tasks: 10, ..Default::default() }; + let low = GhostMetrics { ghost: "l".into(), success_rate: 0.0, total_tasks: 10, ..Default::default() }; + assert!(high.rank_score() > low.rank_score()); + } + + #[test] + fn format_leaderboard_empty() { + let lb = test_lb(); + let text = lb.format_leaderboard().unwrap(); + assert!(text.contains("No performance data")); + } + + #[test] + fn compare_two_ghosts() { + let lb = test_lb(); + lb.record(&TaskOutcome { 
session_key: "s".into(), ghost: "alpha".into(), success: true, latency_ms: 800, input_tokens: 300, output_tokens: 100, user_rating: None }).unwrap(); + lb.record(&TaskOutcome { session_key: "s".into(), ghost: "beta".into(), success: false, latency_ms: 1200, input_tokens: 600, output_tokens: 200, user_rating: None }).unwrap(); + let result = lb.compare("alpha", "beta").unwrap(); + assert!(result.contains("alpha")); + assert!(result.contains("beta")); + } + + #[test] + fn promotion_check_insufficient_samples() { + let mut config = LeaderboardConfig::default(); + config.ab_test_ghost = Some("challenger".into()); + config.min_samples_for_recommendation = 50; + let conn = Connection::open_in_memory().unwrap(); + let lb = GhostLeaderboard::new(conn, config).unwrap(); + lb.record(&TaskOutcome { session_key: "s".into(), ghost: "challenger".into(), success: true, latency_ms: 500, input_tokens: 200, output_tokens: 80, user_rating: None }).unwrap(); + // Only 1 sample, need 50 + assert!(lb.check_promotion().unwrap().is_none()); + } +} diff --git a/src/main.rs b/src/main.rs index 2534312..20fedd5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,6 +16,7 @@ mod feature_contract; mod ghost_policy; mod heartbeat; mod introspect; +mod leaderboard; mod knobs; mod kpi; mod langfuse; @@ -257,6 +258,11 @@ enum Commands { #[command(subcommand)] action: SnapshotAction, }, + /// View ghost performance leaderboard and A/B test results + Leaderboard { + #[command(subcommand)] + action: LeaderboardAction, + }, } #[derive(Subcommand)] @@ -282,6 +288,19 @@ enum SnapshotAction { }, } +#[derive(Subcommand)] +enum LeaderboardAction { + /// Show the full leaderboard + Show, + /// Compare two ghosts head-to-head + Compare { + ghost_a: String, + ghost_b: String, + }, + /// Reset all leaderboard data + Reset, +} + #[derive(Subcommand)] enum OpenaiAction { /// Start OAuth login flow @@ -1013,6 +1032,24 @@ async fn main() -> anyhow::Result<()> { } } } + Some(Commands::Leaderboard { action }) => { + 
let db_path = config.db_path()?; + let conn = rusqlite::Connection::open(&db_path)?; + let lb = leaderboard::GhostLeaderboard::new(conn, config.leaderboard.clone())?; + match action { + LeaderboardAction::Show => println!("{}", lb.format_leaderboard()?), + LeaderboardAction::Compare { ghost_a, ghost_b } => println!("{}", lb.compare(&ghost_a, &ghost_b)?), + LeaderboardAction::Reset => { + println!("This will delete all leaderboard data. Are you sure? [y/N]"); + let mut line = String::new(); + std::io::stdin().read_line(&mut line)?; + if line.trim().eq_ignore_ascii_case("y") { + lb.reset()?; + println!("Leaderboard reset."); + } + } + } + } Some(Commands::Chat) | None => run_chat(config, memory, auto_approve).await?, } diff --git a/src/snapshot.rs b/src/snapshot.rs index 46ec6cf..76ca7b0 100644 --- a/src/snapshot.rs +++ b/src/snapshot.rs @@ -274,6 +274,7 @@ fn meta_path_for(snap_path: &Path) -> PathBuf { no_tar.with_extension("json") } + fn extract_snapshot(archive: &Path, dest: &Path) -> Result<()> { let output = Command::new("tar") .arg("xzf") From 7f76e4fd4a8eb9fea7069e300f999a749f7b5de0 Mon Sep 17 00:00:00 2001 From: Enreign Date: Tue, 17 Mar 2026 01:10:49 +0100 Subject: [PATCH 2/2] fix(leaderboard): second review pass - Add TaskOutcome::new() convenience constructor to reduce struct literal verbosity in tests - Fix compare() output: add column headers (ghost names + separator) so each column is identifiable - Use successful_tasks in format_row() ("{success}/{total}") to eliminate dead-field warning - Add four missing tests: ab_route with fraction=1.0, rank_score with zero tokens, reset(), and format_leaderboard with data - Add [leaderboard] section to config.example.toml with all fields documented Co-Authored-By: Claude Sonnet 4.6 --- config.example.toml | 13 +++++++ src/leaderboard.rs | 90 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 88 insertions(+), 15 deletions(-) diff --git a/config.example.toml b/config.example.toml index cb3ab7d..f29c8f4 
100644 --- a/config.example.toml +++ b/config.example.toml @@ -379,3 +379,16 @@ include = ["."] # Paths/patterns to exclude from the snapshot. # (default: target/, .git/, .worktrees/, *.db, *.log) exclude = ["target/", ".git/", ".worktrees/", "*.db", "*.log"] + +# Ghost performance leaderboard and A/B testing. +# Run `sparks leaderboard show` to view rankings, or `sparks leaderboard compare ghost-a ghost-b`. +[leaderboard] +enabled = true +# Ghost name to A/B test against the default ghost. Omit or leave empty to disable A/B routing. +# ab_test_ghost = "experimental-coder" +# Fraction of requests routed to the challenger ghost (0.0 = never, 1.0 = always). +ab_test_fraction = 0.1 +# Minimum recorded samples before a promotion recommendation is surfaced. +min_samples_for_recommendation = 50 +# Success-rate improvement (as a fraction) required to trigger a promotion recommendation. +promotion_threshold = 0.10 diff --git a/src/leaderboard.rs b/src/leaderboard.rs index 90f7e48..965db3f 100644 --- a/src/leaderboard.rs +++ b/src/leaderboard.rs @@ -22,6 +22,21 @@ pub struct TaskOutcome { pub user_rating: Option, // -1 (thumbs down), 0 (neutral), 1 (thumbs up) } +impl TaskOutcome { + /// Convenience constructor for the common case of no user rating. + pub fn new(session_key: impl Into, ghost: impl Into, success: bool, latency_ms: u64, input_tokens: u64, output_tokens: u64) -> Self { + Self { + session_key: session_key.into(), + ghost: ghost.into(), + success, + latency_ms, + input_tokens, + output_tokens, + user_rating: None, + } + } +} + /// Aggregated performance metrics for a ghost. 
#[derive(Debug, Clone, Default)] pub struct GhostMetrics { @@ -57,11 +72,12 @@ impl GhostMetrics { _ => "★☆☆☆☆", }; format!( - "#{} {:20} {} {:.0}% success {:.0}ms avg {} tasks", + "#{} {:20} {} {:.0}% success ({}/{}) {:.0}ms avg", rank, self.ghost, stars, self.success_rate * 100.0, - self.avg_latency_ms, + self.successful_tasks, self.total_tasks, + self.avg_latency_ms, ) } } @@ -261,6 +277,8 @@ impl GhostLeaderboard { Ok(format!( "Ghost Comparison: {} vs {}\n\n\ + {:>30} {:<30}\n\ + {}\n\ {:>30} {:<30}\n\ {:>30} {:<30}\n\ {:>30} {:<30}\n\ @@ -269,6 +287,8 @@ impl GhostLeaderboard { Success rate winner: {}\n\ Speed winner: {}", ghost_a, ghost_b, + ghost_a, ghost_b, + "-".repeat(63), format!("Success: {:.1}%", m_a.success_rate * 100.0), format!("Success: {:.1}%", m_b.success_rate * 100.0), format!("Avg latency: {:.0}ms", m_a.avg_latency_ms), @@ -300,19 +320,15 @@ mod tests { GhostLeaderboard::new(conn, LeaderboardConfig::default()).unwrap() } + fn make_outcome(ghost: &str, success: bool) -> TaskOutcome { + TaskOutcome::new("s", ghost, success, 1000, 500, 200) + } + #[test] fn record_and_rank() { let lb = test_lb(); for i in 0..10 { - lb.record(&TaskOutcome { - session_key: "s".into(), - ghost: "coder".into(), - success: i % 3 != 0, // ~67% success - latency_ms: 1000, - input_tokens: 500, - output_tokens: 200, - user_rating: None, - }).unwrap(); + lb.record(&TaskOutcome::new("s", "coder", i % 3 != 0, 1000, 500, 200)).unwrap(); } let rankings = lb.rankings().unwrap(); assert_eq!(rankings.len(), 1); @@ -333,12 +349,25 @@ mod tests { config.ab_test_fraction = 0.0; let conn = Connection::open_in_memory().unwrap(); let lb = GhostLeaderboard::new(conn, config).unwrap(); - // With fraction 0, should always be Control + // With fraction 0.0, rand::random::() is always >= 0.0, never < 0.0. 
for _ in 0..20 { assert_eq!(lb.ab_route(), AbRoute::Control); } } + #[test] + fn ab_route_always_challenger_when_fraction_one() { + let mut config = LeaderboardConfig::default(); + config.ab_test_ghost = Some("challenger".into()); + config.ab_test_fraction = 1.0; + let conn = Connection::open_in_memory().unwrap(); + let lb = GhostLeaderboard::new(conn, config).unwrap(); + // rand::random::() returns [0.0, 1.0), so < 1.0 is always true. + for _ in 0..20 { + assert_eq!(lb.ab_route(), AbRoute::Challenger("challenger".into())); + } + } + #[test] fn metrics_rank_score_success_dominant() { let high = GhostMetrics { ghost: "h".into(), success_rate: 1.0, total_tasks: 10, ..Default::default() }; @@ -346,6 +375,16 @@ mod tests { assert!(high.rank_score() > low.rank_score()); } + #[test] + fn rank_score_zero_tokens_returns_neutral_efficiency() { + // When avg_input_tokens and avg_output_tokens are both 0.0 (Default), + // efficiency should fall back to 0.5 (neutral) rather than 1.0 (perfect). + let m = GhostMetrics { ghost: "g".into(), success_rate: 0.5, total_tasks: 1, ..Default::default() }; + // score = 0.6*0.5 + 0.2*0.5 + 0.2*0.5 = 0.5 + let score = m.rank_score(); + assert!((score - 0.5).abs() < 1e-9, "expected 0.5, got {score}"); + } + #[test] fn format_leaderboard_empty() { let lb = test_lb(); @@ -353,14 +392,35 @@ mod tests { assert!(text.contains("No performance data")); } + #[test] + fn format_leaderboard_with_data_shows_ghost_name() { + let lb = test_lb(); + lb.record(&make_outcome("scout", true)).unwrap(); + let text = lb.format_leaderboard().unwrap(); + assert!(text.contains("scout")); + assert!(text.contains("Ghost Leaderboard")); + } + #[test] fn compare_two_ghosts() { let lb = test_lb(); - lb.record(&TaskOutcome { session_key: "s".into(), ghost: "alpha".into(), success: true, latency_ms: 800, input_tokens: 300, output_tokens: 100, user_rating: None }).unwrap(); - lb.record(&TaskOutcome { session_key: "s".into(), ghost: "beta".into(), success: false, 
latency_ms: 1200, input_tokens: 600, output_tokens: 200, user_rating: None }).unwrap(); + lb.record(&TaskOutcome::new("s", "alpha", true, 800, 300, 100)).unwrap(); + lb.record(&TaskOutcome::new("s", "beta", false, 1200, 600, 200)).unwrap(); let result = lb.compare("alpha", "beta").unwrap(); assert!(result.contains("alpha")); assert!(result.contains("beta")); + // Column headers must appear in the output so each column is identifiable. + assert!(result.contains("Success rate winner:")); + assert!(result.contains("Speed winner:")); + } + + #[test] + fn reset_clears_all_data() { + let lb = test_lb(); + lb.record(&make_outcome("ghost-x", true)).unwrap(); + assert_eq!(lb.rankings().unwrap().len(), 1); + lb.reset().unwrap(); + assert!(lb.rankings().unwrap().is_empty()); } #[test] @@ -370,7 +430,7 @@ mod tests { config.min_samples_for_recommendation = 50; let conn = Connection::open_in_memory().unwrap(); let lb = GhostLeaderboard::new(conn, config).unwrap(); - lb.record(&TaskOutcome { session_key: "s".into(), ghost: "challenger".into(), success: true, latency_ms: 500, input_tokens: 200, output_tokens: 80, user_rating: None }).unwrap(); + lb.record(&TaskOutcome::new("s", "challenger", true, 500, 200, 80)).unwrap(); // Only 1 sample, need 50 assert!(lb.check_promotion().unwrap().is_none()); }