-
Notifications
You must be signed in to change notification settings - Fork 702
feat: ensemble decision pipeline — panel, arbiter, judge, ChromaDB feedback #721
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,156 @@ | ||
| import { commandExists } from '../agent/commands/index.js'; | ||
|
|
||
export class Arbiter {
    /**
     * Heuristic arbiter: scores each successful proposal from the panel,
     * boosts proposals that agree on a majority command, penalizes latency,
     * and returns the highest-scoring proposal. Also tracks the confidence
     * margin between the top two scores so the controller can decide whether
     * to escalate to the LLM judge.
     *
     * @param {Object} config
     * @param {string} [config.strategy] - "heuristic" (Phase 1) or "llm_judge" (Phase 2).
     *     NOTE(review): currently informational only — pick() always runs the
     *     heuristic; strategy dispatch is a Phase 2 follow-up.
     * @param {number} [config.majority_bonus] - score boost for majority command (default 0.2)
     * @param {number} [config.latency_penalty_per_sec] - penalty per second of latency (default 0.02)
     * @param {number} [config.confidence_threshold] - margin below which a pick is low-confidence (default 0.08)
     */
    constructor(config = {}) {
        this.strategy = config.strategy || 'heuristic';
        this.majorityBonus = config.majority_bonus ?? 0.2;
        this.latencyPenalty = config.latency_penalty_per_sec ?? 0.02;
        this._confidenceThreshold = config.confidence_threshold ?? 0.08;
        this._lastConfidence = 1.0; // margin between top-2 scores, updated by pick()
    }

    /**
     * Confidence threshold for triggering the LLM judge.
     * If the top 2 scores are within this margin, the pick is "low confidence".
     * @returns {number}
     */
    get confidenceThreshold() {
        // The constructor always assigns _confidenceThreshold, so no fallback is needed here.
        return this._confidenceThreshold;
    }

    /**
     * Pick the best proposal from the panel's responses.
     * Mutates each successful proposal by setting `score`, and sets
     * `this._lastConfidence` for the controller to check via isLowConfidence().
     * @param {Proposal[]} proposals - all proposals (may include failures)
     * @returns {Proposal} the winning proposal with `score` and `winReason` set,
     *     or a synthetic fallback proposal when every panel member failed
     */
    pick(proposals) {
        const successful = proposals.filter(p => p.status === 'success');

        // Every panel member failed: return a synthetic apology proposal.
        if (successful.length === 0) {
            return {
                agentId: 'none',
                modelName: 'none',
                response: "I'm having trouble thinking right now. Let me try again in a moment.",
                command: null,
                commandArgs: null,
                preCommandText: '',
                latencyMs: 0,
                status: 'error',
                error: 'All panel members failed',
                score: 0,
                winReason: 'fallback'
            };
        }

        // Base heuristic score per proposal.
        for (const p of successful) {
            p.score = this._scoreProposal(p);
        }

        // Agreement bonus: proposals matching the majority command get a boost.
        const majorityCommand = this._findMajorityCommand(successful);
        if (majorityCommand) {
            for (const p of successful) {
                if (p.command === majorityCommand) {
                    p.score += this.majorityBonus;
                }
            }
        }

        // Latency penalty (acts as a soft tiebreaker toward faster models).
        for (const p of successful) {
            p.score -= this.latencyPenalty * (p.latencyMs / 1000);
        }

        // Sort: highest score first; near-ties (within 0.001) fall back to lowest latency.
        successful.sort((a, b) => {
            if (Math.abs(b.score - a.score) > 0.001) return b.score - a.score;
            return a.latencyMs - b.latencyMs;
        });

        const winner = successful[0];
        winner.winReason = majorityCommand && winner.command === majorityCommand
            ? 'majority+highest_score'
            : 'highest_score';

        // Confidence = margin between the top two scores (1.0 when only one candidate).
        this._lastConfidence = successful.length >= 2
            ? successful[0].score - successful[1].score
            : 1.0;

        return winner;
    }

    /**
     * Returns true if the last pick() result had low confidence
     * and an LLM judge should be consulted.
     * @returns {boolean}
     */
    isLowConfidence() {
        // Use the public getter so threshold semantics live in one place.
        return this._lastConfidence < this.confidenceThreshold;
    }

    /**
     * Compute the heuristic score for a proposal. Each check adds a fixed
     * weight; the maximum base score is 1.0 (before the majority bonus).
     * @param {Proposal} proposal
     * @returns {number} score between 0.0 and ~1.0
     */
    _scoreProposal(proposal) {
        let score = 0;
        const r = proposal.response || '';

        // Non-empty response
        if (r.trim().length > 0) score += 0.10;

        // Contains a command
        if (proposal.command) score += 0.25;

        // Command exists in the game's registry
        if (proposal.command && commandExists(proposal.command)) score += 0.15;

        // No hallucination markers
        const hallucinations = ['(FROM OTHER BOT)', 'My brain disconnected', 'Error:'];
        if (!hallucinations.some(h => r.includes(h))) score += 0.15;

        // Reasonable length (not too short, not too long)
        if (r.length > 5 && r.length < 2000) score += 0.10;

        // Not a tab-only or whitespace-only response
        if (r.trim().length > 1) score += 0.10;

        // Has pre-command reasoning text (shows the model "thought")
        if (proposal.preCommandText && proposal.preCommandText.trim().length > 0) score += 0.05;

        // Response contains actual content words (not just a command)
        if (r.replace(/![a-zA-Z]+\(.*?\)/g, '').trim().length > 3) score += 0.10;

        return score;
    }

    /**
     * Find the command that appears most among proposals.
     * @param {Proposal[]} proposals - successful proposals only
     * @returns {string|null} majority command, or null when there is no clear majority
     */
    _findMajorityCommand(proposals) {
        const commands = proposals.map(p => p.command).filter(Boolean);
        if (commands.length === 0) return null;

        const counts = {};
        for (const c of commands) {
            counts[c] = (counts[c] || 0) + 1;
        }

        const sorted = Object.entries(counts).sort((a, b) => b[1] - a[1]);
        // Majority: top command appears more than once AND strictly more than runner-up
        if (sorted[0][1] > 1 && (sorted.length === 1 || sorted[0][1] > sorted[1][1])) {
            return sorted[0][0];
        }
        return null;
    }
}
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,190 @@ | ||||||||||||||
| import { Panel } from './panel.js'; | ||||||||||||||
| import { Arbiter } from './arbiter.js'; | ||||||||||||||
| import { LLMJudge } from './judge.js'; | ||||||||||||||
| import { EnsembleLogger } from './logger.js'; | ||||||||||||||
| import { FeedbackCollector } from './feedback.js'; | ||||||||||||||
|
|
||||||||||||||
| /** | ||||||||||||||
| * EnsembleModel — implements the same interface as any single model class | ||||||||||||||
| * (Gemini, Grok, etc.) so it can be used as a drop-in replacement for chat_model | ||||||||||||||
| * in the Prompter class. | ||||||||||||||
| * | ||||||||||||||
| * Instead of a single LLM call, it queries a panel of models in parallel, | ||||||||||||||
| * runs an arbiter to pick the best response, and returns the winning string. | ||||||||||||||
| */ | ||||||||||||||
| export class EnsembleModel { | ||||||||||||||
| static prefix = 'ensemble'; | ||||||||||||||
|
|
||||||||||||||
| /** | ||||||||||||||
|
Comment on lines
+15
to
+18
|
||||||||||||||
| * @param {Object} ensembleConfig - the profile.ensemble configuration block | ||||||||||||||
| * @param {Object} profile - the full bot profile (for context/name) | ||||||||||||||
| */ | ||||||||||||||
| constructor(ensembleConfig, profile) { | ||||||||||||||
| this.model_name = 'ensemble'; | ||||||||||||||
| this.profile = profile; | ||||||||||||||
|
|
||||||||||||||
| this.panel = new Panel( | ||||||||||||||
| ensembleConfig.panel, | ||||||||||||||
| ensembleConfig.timeout_ms || 15000 | ||||||||||||||
| ); | ||||||||||||||
| this.arbiter = new Arbiter(ensembleConfig.arbiter || {}); | ||||||||||||||
| this.judge = ensembleConfig.judge !== false | ||||||||||||||
| ? new LLMJudge(ensembleConfig.judge || {}) | ||||||||||||||
| : null; | ||||||||||||||
| this.logger = new EnsembleLogger(profile.name); | ||||||||||||||
| this.feedback = new FeedbackCollector(); | ||||||||||||||
|
|
||||||||||||||
| this.minResponses = ensembleConfig.min_responses || 2; | ||||||||||||||
| this.logDecisions = ensembleConfig.log_decisions !== false; | ||||||||||||||
|
|
||||||||||||||
| // Usage tracking compatibility (Prompter reads this after each call) | ||||||||||||||
| this._lastUsage = null; | ||||||||||||||
| this._lastUsageByModel = null; | ||||||||||||||
|
|
||||||||||||||
| console.log(`[Ensemble] Initialized for ${profile.name}: ${this.panel.members.length} panel members`); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| /** | ||||||||||||||
| * Phase 3: inject the shared embedding model into FeedbackCollector. | ||||||||||||||
| * Called by Prompter after both chat_model and embedding_model are ready. | ||||||||||||||
| */ | ||||||||||||||
| setEmbeddingModel(embeddingModel) { | ||||||||||||||
| this.feedback.setEmbeddingModel(embeddingModel); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| /** | ||||||||||||||
| * Standard model interface — called by Prompter.promptConvo(). | ||||||||||||||
| * Queries all panel members, arbitrates, returns winning response. | ||||||||||||||
| * | ||||||||||||||
| * @param {Array<{role:string, content:string}>} turns - conversation history | ||||||||||||||
| * @param {string} systemMessage - the built system prompt | ||||||||||||||
| * @returns {Promise<string>} - the winning response text | ||||||||||||||
| */ | ||||||||||||||
| async sendRequest(turns, systemMessage) { | ||||||||||||||
| const startTime = Date.now(); | ||||||||||||||
|
|
||||||||||||||
| // Phase 3: retrieve similar past experiences to augment context | ||||||||||||||
| let augmentedSystem = systemMessage; | ||||||||||||||
| if (this.feedback.isReady) { | ||||||||||||||
| const situationText = turns.filter(t => t.role === 'user').slice(-2) | ||||||||||||||
| .map(t => t.content).join(' '); | ||||||||||||||
| const experiences = await this.feedback.getSimilar(situationText, 3); | ||||||||||||||
| if (experiences.length > 0) { | ||||||||||||||
| const memBlock = experiences.map(e => { | ||||||||||||||
| const m = e.metadata; | ||||||||||||||
| const outcome = m.outcome && m.outcome !== 'pending' ? ` (outcome: ${m.outcome})` : ''; | ||||||||||||||
| return `- Situation: "${e.document.slice(0, 120)}" → action: ${m.winner_command || 'chat'}${outcome}`; | ||||||||||||||
| }).join('\n'); | ||||||||||||||
| augmentedSystem = systemMessage + `\n\n[PAST EXPERIENCE - similar situations]\n${memBlock}`; | ||||||||||||||
| console.log(`[Ensemble] Injected ${experiences.length} past experience(s) into context`); | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| // Query all panel members in parallel | ||||||||||||||
| const proposals = await this.panel.queryAll(turns, augmentedSystem); | ||||||||||||||
|
|
||||||||||||||
| const successful = proposals.filter(p => p.status === 'success'); | ||||||||||||||
| const failed = proposals.filter(p => p.status !== 'success'); | ||||||||||||||
|
|
||||||||||||||
| if (failed.length > 0) { | ||||||||||||||
| const failSummary = failed.map(p => `${p.agentId}:${p.status}`).join(', '); | ||||||||||||||
| console.log(`[Ensemble] Panel failures: ${failSummary}`); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| if (successful.length < this.minResponses) { | ||||||||||||||
| console.warn(`[Ensemble] Only ${successful.length}/${this.panel.members.length} responses (need ${this.minResponses})`); | ||||||||||||||
| if (successful.length === 0) { | ||||||||||||||
| this._lastUsage = null; | ||||||||||||||
| return "I'm having trouble processing right now. Let me try again."; | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| // Heuristic arbiter — always runs first | ||||||||||||||
| let winner = this.arbiter.pick(proposals); | ||||||||||||||
| let judgeUsed = false; | ||||||||||||||
|
|
||||||||||||||
| // Phase 2: LLM judge fallback when heuristic confidence is low | ||||||||||||||
| if (this.judge && this.arbiter.isLowConfidence() && successful.length >= 2) { | ||||||||||||||
| console.log(`[Ensemble] Low confidence (margin=${this.arbiter._lastConfidence.toFixed(3)}), consulting LLM judge...`); | ||||||||||||||
|
||||||||||||||
| console.log(`[Ensemble] Low confidence (margin=${this.arbiter._lastConfidence.toFixed(3)}), consulting LLM judge...`); | |
| console.log('[Ensemble] Low confidence detected, consulting LLM judge...'); |
Copilot
AI
Mar 4, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
logDecision() is async but is called without await/.catch(). If the logger's _ready promise rejects (e.g., mkdir fails), this becomes an unhandled promise rejection. Either await the call, or explicitly handle it with .catch()/void plus internal error handling to keep logging failures from surfacing.
| this.logger.logDecision(proposals, winner); | |
| try { | |
| await this.logger.logDecision(proposals, winner); | |
| } catch (err) { | |
| console.warn('[Ensemble] Failed to log decision:', err); | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`config.strategy` is stored on the instance but never used by the arbiter logic. If strategy selection is planned, implement it; otherwise remove it to avoid dead config options and reduce maintenance overhead.