Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 156 additions & 0 deletions src/ensemble/arbiter.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import { commandExists } from '../agent/commands/index.js';

export class Arbiter {
    /**
     * Heuristic arbiter: scores a panel's proposals and picks a winner.
     *
     * @param {Object} config
     * @param {string} [config.strategy='heuristic'] - "heuristic" (Phase 1) or "llm_judge" (Phase 2).
     *        Currently informational only; strategy dispatch is not yet implemented here —
     *        the controller decides when to consult the LLM judge.
     * @param {number} [config.majority_bonus=0.2] - score boost for the majority command
     * @param {number} [config.latency_penalty_per_sec=0.02] - penalty per second of latency
     * @param {number} [config.confidence_threshold=0.08] - top-2 score margin below which
     *        the pick is considered "low confidence" (see isLowConfidence())
     */
    constructor(config = {}) {
        this.strategy = config.strategy || 'heuristic';
        this.majorityBonus = config.majority_bonus ?? 0.2;
        this.latencyPenalty = config.latency_penalty_per_sec ?? 0.02;
        this._confidenceThreshold = config.confidence_threshold ?? 0.08;
        this._lastConfidence = 1.0; // margin between top-2 scores; updated by every pick()
    }

    /**
     * Confidence threshold for triggering the LLM judge.
     * If the top 2 scores are within this margin, the pick is "low confidence".
     * (The constructor already guarantees a numeric value, so no fallback is needed here.)
     */
    get confidenceThreshold() {
        return this._confidenceThreshold;
    }

    /**
     * Margin between the top-2 scores of the most recent pick().
     * Public accessor so callers don't have to reach into the private field.
     */
    get lastConfidence() {
        return this._lastConfidence;
    }

    /**
     * Pick the best proposal from the panel's responses.
     * Mutates the successful proposals in place (sets `score`) and updates
     * `this._lastConfidence` for the controller to check.
     *
     * @param {Proposal[]} proposals - all proposals (may include failures)
     * @returns {Proposal} the winning proposal with `score` and `winReason` set
     */
    pick(proposals) {
        const successful = proposals.filter(p => p.status === 'success');

        if (successful.length === 0) {
            // No decision was made, so there is nothing to be confident about.
            // Previously this path left a stale confidence from the prior pick().
            this._lastConfidence = 0;
            return {
                agentId: 'none',
                modelName: 'none',
                response: "I'm having trouble thinking right now. Let me try again in a moment.",
                command: null,
                commandArgs: null,
                preCommandText: '',
                latencyMs: 0,
                status: 'error',
                error: 'All panel members failed',
                score: 0,
                winReason: 'fallback'
            };
        }

        // Base heuristic score for each proposal
        for (const p of successful) {
            p.score = this._scoreProposal(p);
        }

        // Boost every proposal that agrees with the majority command (if any)
        const majorityCommand = this._findMajorityCommand(successful);
        if (majorityCommand) {
            for (const p of successful) {
                if (p.command === majorityCommand) {
                    p.score += this.majorityBonus;
                }
            }
        }

        // Latency penalty — acts as a soft tiebreaker between similar responses
        for (const p of successful) {
            p.score -= this.latencyPenalty * (p.latencyMs / 1000);
        }

        // Sort: highest score first; near-ties (within 0.001) fall back to lowest latency
        successful.sort((a, b) => {
            if (Math.abs(b.score - a.score) > 0.001) return b.score - a.score;
            return a.latencyMs - b.latencyMs;
        });

        const winner = successful[0];
        winner.winReason = majorityCommand && winner.command === majorityCommand
            ? 'majority+highest_score'
            : 'highest_score';

        // Confidence = margin between the top-2 scores (1.0 when unopposed)
        this._lastConfidence = successful.length >= 2
            ? successful[0].score - successful[1].score
            : 1.0;

        return winner;
    }

    /**
     * Returns true if the last pick() result had low confidence
     * and an LLM judge should be consulted.
     */
    isLowConfidence() {
        return this._lastConfidence < this._confidenceThreshold;
    }

    /**
     * Compute the heuristic score for a proposal. Criteria are additive and
     * the total lands roughly in 0.0–1.0 before bonuses/penalties.
     * @param {Proposal} proposal
     * @returns {number} heuristic score
     */
    _scoreProposal(proposal) {
        let score = 0;
        const r = proposal.response || '';

        // Non-empty response
        if (r.trim().length > 0) score += 0.10;

        // Contains a command
        if (proposal.command) score += 0.25;

        // Command exists in the game's registry
        if (proposal.command && commandExists(proposal.command)) score += 0.15;

        // No hallucination markers
        const hallucinations = ['(FROM OTHER BOT)', 'My brain disconnected', 'Error:'];
        if (!hallucinations.some(h => r.includes(h))) score += 0.15;

        // Reasonable length (not too short, not too long)
        if (r.length > 5 && r.length < 2000) score += 0.10;

        // More than a single visible character.
        // NOTE(review): overlaps heavily with the non-empty check above (they only
        // differ for 1-character responses); kept as-is to preserve score weights.
        if (r.trim().length > 1) score += 0.10;

        // Has pre-command reasoning text (shows the model "thought")
        if (proposal.preCommandText && proposal.preCommandText.trim().length > 0) score += 0.05;

        // Response still has content words after stripping `!command(...)` patterns
        if (r.replace(/![a-zA-Z]+\(.*?\)/g, '').trim().length > 3) score += 0.10;

        return score;
    }

    /**
     * Find the command that appears most among proposals.
     * A "majority" requires the top command to appear more than once AND
     * strictly more often than the runner-up (no ties).
     * @param {Proposal[]} proposals - successful proposals only
     * @returns {string|null} majority command or null
     */
    _findMajorityCommand(proposals) {
        const commands = proposals.map(p => p.command).filter(Boolean);
        if (commands.length === 0) return null;

        const counts = {};
        for (const c of commands) {
            counts[c] = (counts[c] || 0) + 1;
        }

        const sorted = Object.entries(counts).sort((a, b) => b[1] - a[1]);
        if (sorted[0][1] > 1 && (sorted.length === 1 || sorted[0][1] > sorted[1][1])) {
            return sorted[0][0];
        }
        return null;
    }
}
190 changes: 190 additions & 0 deletions src/ensemble/controller.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
import { Panel } from './panel.js';
import { Arbiter } from './arbiter.js';
import { LLMJudge } from './judge.js';
import { EnsembleLogger } from './logger.js';
import { FeedbackCollector } from './feedback.js';

/**
* EnsembleModel — implements the same interface as any single model class
* (Gemini, Grok, etc.) so it can be used as a drop-in replacement for chat_model
* in the Prompter class.
*
* Instead of a single LLM call, it queries a panel of models in parallel,
* runs an arbiter to pick the best response, and returns the winning string.
*/
export class EnsembleModel {
    static prefix = 'ensemble';

    /**
     * @param {Object} ensembleConfig - the profile.ensemble configuration block
     * @param {Object} profile - the full bot profile (for context/name)
     */
    constructor(ensembleConfig, profile) {
        this.model_name = 'ensemble';
        this.profile = profile;

        this.panel = new Panel(
            ensembleConfig.panel,
            ensembleConfig.timeout_ms || 15000
        );
        this.arbiter = new Arbiter(ensembleConfig.arbiter || {});
        // `judge: false` disables the Phase 2 LLM judge entirely
        this.judge = ensembleConfig.judge !== false
            ? new LLMJudge(ensembleConfig.judge || {})
            : null;
        this.logger = new EnsembleLogger(profile.name);
        this.feedback = new FeedbackCollector();

        this.minResponses = ensembleConfig.min_responses || 2;
        this.logDecisions = ensembleConfig.log_decisions !== false;

        // Usage tracking compatibility (Prompter reads these after each call)
        this._lastUsage = null;
        this._lastUsageByModel = null;

        console.log(`[Ensemble] Initialized for ${profile.name}: ${this.panel.members.length} panel members`);
    }

    /**
     * Phase 3: inject the shared embedding model into FeedbackCollector.
     * Called by Prompter after both chat_model and embedding_model are ready.
     */
    setEmbeddingModel(embeddingModel) {
        this.feedback.setEmbeddingModel(embeddingModel);
    }

    /**
     * Standard model interface — called by Prompter.promptConvo().
     * Queries all panel members, arbitrates, returns winning response.
     *
     * @param {Array<{role:string, content:string}>} turns - conversation history
     * @param {string} systemMessage - the built system prompt
     * @returns {Promise<string>} the winning response text
     */
    async sendRequest(turns, systemMessage) {
        const startTime = Date.now();

        // Text of the most recent user turns; used both for experience retrieval
        // and for recording this decision (previously computed twice inline).
        const situationText = this._lastUserText(turns);

        // Phase 3: best-effort augmentation with similar past experiences
        const augmentedSystem = await this._augmentWithExperiences(systemMessage, situationText);

        // Query all panel members in parallel
        const proposals = await this.panel.queryAll(turns, augmentedSystem);

        const successful = proposals.filter(p => p.status === 'success');
        const failed = proposals.filter(p => p.status !== 'success');

        if (failed.length > 0) {
            const failSummary = failed.map(p => `${p.agentId}:${p.status}`).join(', ');
            console.log(`[Ensemble] Panel failures: ${failSummary}`);
        }

        if (successful.length < this.minResponses) {
            console.warn(`[Ensemble] Only ${successful.length}/${this.panel.members.length} responses (need ${this.minResponses})`);
            if (successful.length === 0) {
                // Reset BOTH usage trackers (previously _lastUsageByModel kept a stale value)
                this._lastUsage = null;
                this._lastUsageByModel = null;
                return "I'm having trouble processing right now. Let me try again.";
            }
        }

        // Heuristic arbiter — always runs first
        let winner = this.arbiter.pick(proposals);
        let judgeUsed = false;

        // Phase 2: LLM judge fallback when heuristic confidence is low
        if (this.judge && this.arbiter.isLowConfidence() && successful.length >= 2) {
            // Prefer the public accessor when the arbiter exposes one
            const margin = this.arbiter.lastConfidence ?? this.arbiter._lastConfidence;
            console.log(`[Ensemble] Low confidence (margin=${margin.toFixed(3)}), consulting LLM judge...`);
            try {
                const judgeId = await this.judge.judge(successful, systemMessage, turns);
                const judgeWinner = judgeId
                    ? successful.find(p => p.agentId === judgeId)
                    : null;
                if (judgeWinner) {
                    judgeWinner.winReason = 'llm_judge';
                    winner = judgeWinner;
                    judgeUsed = true;
                    console.log(`[Ensemble] Judge overruled heuristic: winner=${judgeId}`);
                }
            } catch (err) {
                console.warn(`[Ensemble] Judge error, keeping heuristic winner: ${err.message}`);
            }
        }

        const totalMs = Date.now() - startTime;
        console.log(
            `[Ensemble] Decision in ${totalMs}ms: ` +
            `${successful.length}/${this.panel.members.length} responded, ` +
            `winner=${winner.agentId} (${winner.command || 'chat'}, score=${winner.score?.toFixed(2)})` +
            (judgeUsed ? ' [judge]' : '')
        );

        // Log the decision. logDecision() is async; an unawaited call could
        // become an unhandled rejection, and a logging failure must never
        // break the response.
        if (this.logDecisions) {
            try {
                await this.logger.logDecision(proposals, winner);
            } catch (err) {
                console.warn(`[Ensemble] Failed to log decision: ${err.message}`);
            }
        }

        // Phase 3: record decision in ChromaDB for continuous learning (best-effort)
        this._recordDecisionSafe(winner, proposals, situationText);

        // Aggregate usage from all successful members
        this._lastUsage = this._aggregateUsage(successful);
        this._lastUsageByModel = this._buildUsageBreakdown(successful);

        return winner.response;
    }

    /**
     * Join the content of the last two user turns into one situation string.
     * @param {Array<{role:string, content:string}>} turns
     * @returns {string}
     */
    _lastUserText(turns) {
        return turns
            .filter(t => t.role === 'user')
            .slice(-2)
            .map(t => t.content)
            .join(' ');
    }

    /**
     * Phase 3: append similar past experiences to the system prompt.
     * Retrieval errors are logged and swallowed — memory is an enhancement,
     * not a requirement, so a feedback failure must not block the response.
     * @param {string} systemMessage - the base system prompt
     * @param {string} situationText - recent user text to match against
     * @returns {Promise<string>} the (possibly augmented) system message
     */
    async _augmentWithExperiences(systemMessage, situationText) {
        if (!this.feedback.isReady) return systemMessage;
        try {
            const experiences = await this.feedback.getSimilar(situationText, 3);
            if (experiences.length === 0) return systemMessage;
            const memBlock = experiences.map(e => {
                const m = e.metadata;
                const outcome = m.outcome && m.outcome !== 'pending' ? ` (outcome: ${m.outcome})` : '';
                return `- Situation: "${e.document.slice(0, 120)}" → action: ${m.winner_command || 'chat'}${outcome}`;
            }).join('\n');
            console.log(`[Ensemble] Injected ${experiences.length} past experience(s) into context`);
            return systemMessage + `\n\n[PAST EXPERIENCE - similar situations]\n${memBlock}`;
        } catch (err) {
            console.warn(`[Ensemble] Experience retrieval failed: ${err.message}`);
            return systemMessage;
        }
    }

    /**
     * Record a decision without letting feedback errors — synchronous throws
     * or async rejections — surface to the caller.
     */
    _recordDecisionSafe(winner, proposals, situationText) {
        try {
            // recordDecision may return a promise; attach a handler so a
            // rejection never becomes an unhandled rejection.
            Promise.resolve(this.feedback.recordDecision({
                winner,
                proposals,
                timestamp: Date.now(),
                situationText
            })).catch(err => console.warn(`[Ensemble] Failed to record decision: ${err.message}`));
        } catch (err) {
            console.warn(`[Ensemble] Failed to record decision: ${err.message}`);
        }
    }

    /**
     * Embeddings are not supported by the ensemble — the Prompter uses
     * a separate embedding model configured in the profile.
     * @throws {Error} always
     */
    async embed(_text) {
        throw new Error('Embeddings not supported by EnsembleModel. Configure a separate embedding model in the profile.');
    }

    /**
     * Sum token usage across all panel members for cost tracking.
     * @param {Proposal[]} proposals - successful proposals
     * @returns {{prompt_tokens:number, completion_tokens:number, total_tokens:number}|null}
     *          null when no member reported any usage
     */
    _aggregateUsage(proposals) {
        let prompt = 0, completion = 0;
        for (const p of proposals) {
            if (p.usage) {
                prompt += p.usage.prompt_tokens || 0;
                completion += p.usage.completion_tokens || 0;
            }
        }
        if (prompt === 0 && completion === 0) return null;
        return { prompt_tokens: prompt, completion_tokens: completion, total_tokens: prompt + completion };
    }

    /**
     * Per-model usage breakdown for cost attribution.
     * @param {Proposal[]} proposals - successful proposals
     * @returns {Array<{modelName:string, provider:string, usage:Object}>|null}
     */
    _buildUsageBreakdown(proposals) {
        const breakdown = [];
        for (const p of proposals) {
            if (!p.usage) continue;
            breakdown.push({
                modelName: p.modelName || 'unknown',
                provider: p.provider || 'unknown',
                usage: p.usage
            });
        }
        return breakdown.length > 0 ? breakdown : null;
    }
}
Loading
Loading