-
Notifications
You must be signed in to change notification settings - Fork 702
feat: ensemble decision pipeline — panel, arbiter, judge, ChromaDB feedback #721
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,156 @@ | ||
| import { commandExists } from '../agent/commands/index.js'; | ||
|
|
||
export class Arbiter {
    /**
     * Heuristic arbiter: scores each successful proposal from the panel,
     * boosts proposals that agree on a majority command, penalizes latency,
     * and returns the highest-scoring proposal. Also tracks the confidence
     * margin between the top two scores so the controller can decide whether
     * to escalate to the LLM judge.
     *
     * @param {Object} config
     * @param {string} [config.strategy] - "heuristic" (Phase 1) or "llm_judge" (Phase 2).
     *     NOTE(review): currently informational only — pick() always runs the
     *     heuristic; strategy dispatch is a Phase 2 follow-up.
     * @param {number} [config.majority_bonus] - score boost for majority command (default 0.2)
     * @param {number} [config.latency_penalty_per_sec] - penalty per second of latency (default 0.02)
     * @param {number} [config.confidence_threshold] - margin below which a pick is low-confidence (default 0.08)
     */
    constructor(config = {}) {
        this.strategy = config.strategy || 'heuristic';
        this.majorityBonus = config.majority_bonus ?? 0.2;
        this.latencyPenalty = config.latency_penalty_per_sec ?? 0.02;
        this._confidenceThreshold = config.confidence_threshold ?? 0.08;
        this._lastConfidence = 1.0; // margin between top-2 scores, updated by pick()
    }

    /**
     * Confidence threshold for triggering the LLM judge.
     * If the top 2 scores are within this margin, the pick is "low confidence".
     * @returns {number}
     */
    get confidenceThreshold() {
        // The constructor always assigns _confidenceThreshold, so no fallback is needed here.
        return this._confidenceThreshold;
    }

    /**
     * Pick the best proposal from the panel's responses.
     * Mutates each successful proposal by setting `score`, and sets
     * `this._lastConfidence` for the controller to check via isLowConfidence().
     * @param {Proposal[]} proposals - all proposals (may include failures)
     * @returns {Proposal} the winning proposal with `score` and `winReason` set,
     *     or a synthetic fallback proposal when every panel member failed
     */
    pick(proposals) {
        const successful = proposals.filter(p => p.status === 'success');

        // Every panel member failed: return a synthetic apology proposal.
        if (successful.length === 0) {
            return {
                agentId: 'none',
                modelName: 'none',
                response: "I'm having trouble thinking right now. Let me try again in a moment.",
                command: null,
                commandArgs: null,
                preCommandText: '',
                latencyMs: 0,
                status: 'error',
                error: 'All panel members failed',
                score: 0,
                winReason: 'fallback'
            };
        }

        // Base heuristic score per proposal.
        for (const p of successful) {
            p.score = this._scoreProposal(p);
        }

        // Agreement bonus: proposals matching the majority command get a boost.
        const majorityCommand = this._findMajorityCommand(successful);
        if (majorityCommand) {
            for (const p of successful) {
                if (p.command === majorityCommand) {
                    p.score += this.majorityBonus;
                }
            }
        }

        // Latency penalty (acts as a soft tiebreaker toward faster models).
        for (const p of successful) {
            p.score -= this.latencyPenalty * (p.latencyMs / 1000);
        }

        // Sort: highest score first; near-ties (within 0.001) fall back to lowest latency.
        successful.sort((a, b) => {
            if (Math.abs(b.score - a.score) > 0.001) return b.score - a.score;
            return a.latencyMs - b.latencyMs;
        });

        const winner = successful[0];
        winner.winReason = majorityCommand && winner.command === majorityCommand
            ? 'majority+highest_score'
            : 'highest_score';

        // Confidence = margin between the top two scores (1.0 when only one candidate).
        this._lastConfidence = successful.length >= 2
            ? successful[0].score - successful[1].score
            : 1.0;

        return winner;
    }

    /**
     * Returns true if the last pick() result had low confidence
     * and an LLM judge should be consulted.
     * @returns {boolean}
     */
    isLowConfidence() {
        // Use the public getter so threshold semantics live in one place.
        return this._lastConfidence < this.confidenceThreshold;
    }

    /**
     * Compute the heuristic score for a proposal. Each check adds a fixed
     * weight; the maximum base score is 1.0 (before the majority bonus).
     * @param {Proposal} proposal
     * @returns {number} score between 0.0 and ~1.0
     */
    _scoreProposal(proposal) {
        let score = 0;
        const r = proposal.response || '';

        // Non-empty response
        if (r.trim().length > 0) score += 0.10;

        // Contains a command
        if (proposal.command) score += 0.25;

        // Command exists in the game's registry
        if (proposal.command && commandExists(proposal.command)) score += 0.15;

        // No hallucination markers
        const hallucinations = ['(FROM OTHER BOT)', 'My brain disconnected', 'Error:'];
        if (!hallucinations.some(h => r.includes(h))) score += 0.15;

        // Reasonable length (not too short, not too long)
        if (r.length > 5 && r.length < 2000) score += 0.10;

        // Not a tab-only or whitespace-only response
        if (r.trim().length > 1) score += 0.10;

        // Has pre-command reasoning text (shows the model "thought")
        if (proposal.preCommandText && proposal.preCommandText.trim().length > 0) score += 0.05;

        // Response contains actual content words (not just a command)
        if (r.replace(/![a-zA-Z]+\(.*?\)/g, '').trim().length > 3) score += 0.10;

        return score;
    }

    /**
     * Find the command that appears most among proposals.
     * @param {Proposal[]} proposals - successful proposals only
     * @returns {string|null} majority command, or null when there is no clear majority
     */
    _findMajorityCommand(proposals) {
        const commands = proposals.map(p => p.command).filter(Boolean);
        if (commands.length === 0) return null;

        const counts = {};
        for (const c of commands) {
            counts[c] = (counts[c] || 0) + 1;
        }

        const sorted = Object.entries(counts).sort((a, b) => b[1] - a[1]);
        // Majority: top command appears more than once AND strictly more than runner-up
        if (sorted[0][1] > 1 && (sorted.length === 1 || sorted[0][1] > sorted[1][1])) {
            return sorted[0][0];
        }
        return null;
    }
}
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,190 @@ | ||||||||||||||
| import { Panel } from './panel.js'; | ||||||||||||||
| import { Arbiter } from './arbiter.js'; | ||||||||||||||
| import { LLMJudge } from './judge.js'; | ||||||||||||||
| import { EnsembleLogger } from './logger.js'; | ||||||||||||||
| import { FeedbackCollector } from './feedback.js'; | ||||||||||||||
|
|
||||||||||||||
| /** | ||||||||||||||
| * EnsembleModel — implements the same interface as any single model class | ||||||||||||||
| * (Gemini, Grok, etc.) so it can be used as a drop-in replacement for chat_model | ||||||||||||||
| * in the Prompter class. | ||||||||||||||
| * | ||||||||||||||
| * Instead of a single LLM call, it queries a panel of models in parallel, | ||||||||||||||
| * runs an arbiter to pick the best response, and returns the winning string. | ||||||||||||||
| */ | ||||||||||||||
| export class EnsembleModel { | ||||||||||||||
| static prefix = 'ensemble'; | ||||||||||||||
|
|
||||||||||||||
| /** | ||||||||||||||
|
Comment on lines
+15
to
+18
|
||||||||||||||
| * @param {Object} ensembleConfig - the profile.ensemble configuration block | ||||||||||||||
| * @param {Object} profile - the full bot profile (for context/name) | ||||||||||||||
| */ | ||||||||||||||
| constructor(ensembleConfig, profile) { | ||||||||||||||
| this.model_name = 'ensemble'; | ||||||||||||||
| this.profile = profile; | ||||||||||||||
|
|
||||||||||||||
| this.panel = new Panel( | ||||||||||||||
| ensembleConfig.panel, | ||||||||||||||
| ensembleConfig.timeout_ms || 15000 | ||||||||||||||
| ); | ||||||||||||||
| this.arbiter = new Arbiter(ensembleConfig.arbiter || {}); | ||||||||||||||
| this.judge = ensembleConfig.judge !== false | ||||||||||||||
| ? new LLMJudge(ensembleConfig.judge || {}) | ||||||||||||||
| : null; | ||||||||||||||
| this.logger = new EnsembleLogger(profile.name); | ||||||||||||||
| this.feedback = new FeedbackCollector(); | ||||||||||||||
|
|
||||||||||||||
| this.minResponses = ensembleConfig.min_responses || 2; | ||||||||||||||
| this.logDecisions = ensembleConfig.log_decisions !== false; | ||||||||||||||
|
|
||||||||||||||
| // Usage tracking compatibility (Prompter reads this after each call) | ||||||||||||||
| this._lastUsage = null; | ||||||||||||||
| this._lastUsageByModel = null; | ||||||||||||||
|
|
||||||||||||||
| console.log(`[Ensemble] Initialized for ${profile.name}: ${this.panel.members.length} panel members`); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| /** | ||||||||||||||
| * Phase 3: inject the shared embedding model into FeedbackCollector. | ||||||||||||||
| * Called by Prompter after both chat_model and embedding_model are ready. | ||||||||||||||
| */ | ||||||||||||||
| setEmbeddingModel(embeddingModel) { | ||||||||||||||
| this.feedback.setEmbeddingModel(embeddingModel); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| /** | ||||||||||||||
| * Standard model interface — called by Prompter.promptConvo(). | ||||||||||||||
| * Queries all panel members, arbitrates, returns winning response. | ||||||||||||||
| * | ||||||||||||||
| * @param {Array<{role:string, content:string}>} turns - conversation history | ||||||||||||||
| * @param {string} systemMessage - the built system prompt | ||||||||||||||
| * @returns {Promise<string>} - the winning response text | ||||||||||||||
| */ | ||||||||||||||
| async sendRequest(turns, systemMessage) { | ||||||||||||||
| const startTime = Date.now(); | ||||||||||||||
|
|
||||||||||||||
| // Phase 3: retrieve similar past experiences to augment context | ||||||||||||||
| let augmentedSystem = systemMessage; | ||||||||||||||
| if (this.feedback.isReady) { | ||||||||||||||
| const situationText = turns.filter(t => t.role === 'user').slice(-2) | ||||||||||||||
| .map(t => t.content).join(' '); | ||||||||||||||
| const experiences = await this.feedback.getSimilar(situationText, 3); | ||||||||||||||
| if (experiences.length > 0) { | ||||||||||||||
| const memBlock = experiences.map(e => { | ||||||||||||||
| const m = e.metadata; | ||||||||||||||
| const outcome = m.outcome && m.outcome !== 'pending' ? ` (outcome: ${m.outcome})` : ''; | ||||||||||||||
| return `- Situation: "${e.document.slice(0, 120)}" → action: ${m.winner_command || 'chat'}${outcome}`; | ||||||||||||||
| }).join('\n'); | ||||||||||||||
| augmentedSystem = systemMessage + `\n\n[PAST EXPERIENCE - similar situations]\n${memBlock}`; | ||||||||||||||
| console.log(`[Ensemble] Injected ${experiences.length} past experience(s) into context`); | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| // Query all panel members in parallel | ||||||||||||||
| const proposals = await this.panel.queryAll(turns, augmentedSystem); | ||||||||||||||
|
|
||||||||||||||
| const successful = proposals.filter(p => p.status === 'success'); | ||||||||||||||
| const failed = proposals.filter(p => p.status !== 'success'); | ||||||||||||||
|
|
||||||||||||||
| if (failed.length > 0) { | ||||||||||||||
| const failSummary = failed.map(p => `${p.agentId}:${p.status}`).join(', '); | ||||||||||||||
| console.log(`[Ensemble] Panel failures: ${failSummary}`); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| if (successful.length < this.minResponses) { | ||||||||||||||
| console.warn(`[Ensemble] Only ${successful.length}/${this.panel.members.length} responses (need ${this.minResponses})`); | ||||||||||||||
| if (successful.length === 0) { | ||||||||||||||
| this._lastUsage = null; | ||||||||||||||
| return "I'm having trouble processing right now. Let me try again."; | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| // Heuristic arbiter — always runs first | ||||||||||||||
| let winner = this.arbiter.pick(proposals); | ||||||||||||||
| let judgeUsed = false; | ||||||||||||||
|
|
||||||||||||||
| // Phase 2: LLM judge fallback when heuristic confidence is low | ||||||||||||||
| if (this.judge && this.arbiter.isLowConfidence() && successful.length >= 2) { | ||||||||||||||
| console.log(`[Ensemble] Low confidence (margin=${this.arbiter._lastConfidence.toFixed(3)}), consulting LLM judge...`); | ||||||||||||||
|
||||||||||||||
| console.log(`[Ensemble] Low confidence (margin=${this.arbiter._lastConfidence.toFixed(3)}), consulting LLM judge...`); | |
| console.log('[Ensemble] Low confidence detected, consulting LLM judge...'); |
Copilot
AI
Mar 4, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
logDecision() is async but is called without await/.catch(). If the logger's _ready promise rejects (e.g., mkdir fails), this becomes an unhandled promise rejection. Either await the call, or explicitly handle it with .catch()/void plus internal error handling to keep logging failures from surfacing.
| this.logger.logDecision(proposals, winner); | |
| try { | |
| await this.logger.logDecision(proposals, winner); | |
| } catch (err) { | |
| console.warn('[Ensemble] Failed to log decision:', err); | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`config.strategy` is stored on the instance but never used by the arbiter logic. If strategy selection is planned, implement it; otherwise remove it to avoid dead config options and reduce maintenance overhead.