diff --git a/crates/mcp-brain-server/src/gist.rs b/crates/mcp-brain-server/src/gist.rs index 89f96de1c..2eff3254f 100644 --- a/crates/mcp-brain-server/src/gist.rs +++ b/crates/mcp-brain-server/src/gist.rs @@ -15,26 +15,26 @@ use parking_lot::Mutex; use serde::{Deserialize, Serialize}; // ── Novelty thresholds ── -// VERY aggressive: only publish when something genuinely new is discovered. -// With ~3100 memories and 2.8M edges, the bar must be HIGH to avoid noise. -// Target: ~1 gist per WEEK, only for real innovations. -/// Minimum new inferences: must derive many non-trivial forward-chained claims -const MIN_NEW_INFERENCES: usize = 10; -/// Minimum evidence observations — need substantial data -const MIN_EVIDENCE: usize = 1000; -/// Minimum strange loop quality score — high bar for self-aware reasoning -const MIN_STRANGE_LOOP_SCORE: f32 = 0.1; +// Tuned April 2026: brain has 10K+ memories and 38M edges. +// Previous thresholds were too aggressive — no gists were ever published. +// Target: ~1 gist per day, with genuinely interesting content. 
+/// Minimum new inferences: at least some non-trivial forward-chained claims +const MIN_NEW_INFERENCES: usize = 3; +/// Minimum evidence observations — brain has 10K+, so this is easy +const MIN_EVIDENCE: usize = 100; +/// Minimum strange loop quality score — lower bar to start publishing +const MIN_STRANGE_LOOP_SCORE: f32 = 0.01; /// Minimum propositions extracted in this cycle -const MIN_PROPOSITIONS: usize = 20; +const MIN_PROPOSITIONS: usize = 5; /// Minimum SONA patterns — require at least some SONA learning const MIN_SONA_PATTERNS: usize = 1; -/// Minimum Pareto front growth — evolution must find multiple new solutions -const MIN_PARETO_GROWTH: usize = 3; +/// Minimum Pareto front growth — any new solution counts +const MIN_PARETO_GROWTH: usize = 1; /// Minimum confidence for ANY inference to be included in a discovery -const MIN_INFERENCE_CONFIDENCE: f64 = 0.70; +const MIN_INFERENCE_CONFIDENCE: f64 = 0.60; /// Minimum number of UNIQUE categories across strong propositions -/// (prevents "debug-architecture-geopolitics" recycling) -const MIN_UNIQUE_CATEGORIES: usize = 4; +/// (prevents single-domain noise — but 2 domains is enough for cross-domain) +const MIN_UNIQUE_CATEGORIES: usize = 2; /// A discovery worthy of publishing. 
/// @@ -165,8 +165,8 @@ impl Discovery { && self.propositions_extracted >= MIN_PROPOSITIONS && self.sona_patterns >= MIN_SONA_PATTERNS && self.pareto_growth >= MIN_PARETO_GROWTH - && strong.len() >= 3 // Must have at least 3 non-trivial inferences - && strong_props.len() >= 5 // Must have at least 5 substantive propositions + && strong.len() >= 1 // Must have at least 1 non-trivial inference + && strong_props.len() >= 2 // Must have at least 2 substantive propositions && diversity >= MIN_UNIQUE_CATEGORIES // Must span multiple domains } @@ -228,7 +228,7 @@ impl GistPublisher { Some(Self { token, last_publish: Mutex::new(None), - min_interval: Duration::from_secs(259200), // 3 day minimum between gists + min_interval: Duration::from_secs(86400), // 1 day minimum between gists published_count: Mutex::new(0), published_titles: Mutex::new(Vec::new()), }) diff --git a/crates/mcp-brain-server/src/routes.rs b/crates/mcp-brain-server/src/routes.rs index 7b1a06abd..95313f113 100644 --- a/crates/mcp-brain-server/src/routes.rs +++ b/crates/mcp-brain-server/src/routes.rs @@ -5980,20 +5980,45 @@ async fn notify_digest( let topic = body["topic"].as_str(); let hours = body["hours"].as_u64().unwrap_or(24); - // Gather recent discoveries from the store + // Gather recent discoveries from the store — excluding debug/training noise let cutoff = chrono::Utc::now() - chrono::Duration::hours(hours as i64); let mut all = state.store.all_memories(); all.sort_by(|a, b| b.created_at.cmp(&a.created_at)); - // Filter by recency and optionally by topic + // Filter out noise: training cycles, self-reflections, debug entries, + // and low-signal web scraping results + let noise_patterns: &[&str] = &[ + "Self-reflection: training cycle", + "Fact Check: Self-reflection", + "vTools Events", + "Executive Committee Meeting", + "DailyMed", + "AccessGUDID", + "Site en construction", + ]; + let filtered: Vec<_> = all.iter() .filter(|m| { if m.created_at < cutoff { return false; } + // Skip 
debug/auto-generated training noise + if matches!(m.category, crate::types::BrainCategory::Debug) { + return false; + } + // Skip known noise patterns in titles + let title_lower = m.title.to_lowercase(); + if noise_patterns.iter().any(|p| title_lower.contains(&p.to_lowercase())) { + return false; + } + // Skip very short content (likely scraping artifacts) + if m.content.len() < 50 { + return false; + } + // Apply optional topic filter topic.map_or(true, |t| { let t_lower = t.to_lowercase(); - m.title.to_lowercase().contains(&t_lower) + title_lower.contains(&t_lower) || m.content.to_lowercase().contains(&t_lower) || m.tags.iter().any(|tag| tag.to_lowercase().contains(&t_lower)) }) @@ -6009,27 +6034,52 @@ async fn notify_digest( }))); } - // Build HTML rows + // Build HTML rows — human-readable format let mut rows = String::new(); + let category_emoji = |cat: &crate::types::BrainCategory| -> &str { + use crate::types::BrainCategory::*; + match cat { + Architecture => "🏗️", + Pattern => "🔄", + Solution => "💡", + Security => "🔒", + Convention => "📐", + Performance => "⚡", + Tooling => "🔧", + Debug => "🐛", + _ => "📝", + } + }; + for (i, m) in filtered.iter().enumerate() { - let title = if m.title.len() > 100 { &m.title[..100] } else { &m.title }; - let content = if m.content.len() > 200 { &m.content[..200] } else { &m.content }; - let quality = m.quality_score.mean(); - let tags_html: Vec<_> = m.tags.iter().take(4).map(|t| { - format!("{}", t) - }).collect(); + let title = if m.title.len() > 120 { &m.title[..120] } else { &m.title }; + // Take first ~250 chars but break at sentence boundary + let content_raw = if m.content.len() > 250 { &m.content[..250] } else { &m.content }; + let content = match content_raw.rfind(". 
") { + Some(pos) if pos > 80 => &content_raw[..pos + 1], + _ => content_raw, + }; + let emoji = category_emoji(&m.category); + let tags_html: Vec<_> = m.tags.iter() + .filter(|t| !t.contains("auto-generated") && !t.contains("training-cycle")) + .take(3) + .map(|t| { + format!("{}", t) + }).collect(); rows.push_str(&format!( - r#" - -{num}. {title}
-{cat} | quality: {quality:.2} {tags}
-{content}... + r#" + +
{emoji} {title}
+
{tags}
+
{content}
"#, - num = i + 1, + emoji = emoji, title = title, - cat = m.category, - quality = quality, - tags = tags_html.join(""), + tags = if tags_html.is_empty() { + format!("{:?}", m.category) + } else { + tags_html.join("") + }, content = content, )); } @@ -6042,16 +6092,21 @@ async fn notify_digest( let edges = state.graph.read().edge_count(); let html = format!( - r#"
-

Daily Discovery Digest

-

Last {hours}h | {count} discoveries | {total} total memories | {edges} edges

+ r#"
+

What the Brain Learned Today

+

+{count} new discoveries in the last {hours} hours. +The brain now holds {total} memories connected by {edges} relationships. +

{topic_line} -{rows}
-
-

Reply with search <query> to explore | help for commands

+{rows}
+
+

Explore the brain

+

Reply search seizure prediction to find related knowledge, +or help for all commands.

-pi.ruv.io | Powered by Resend +pi.ruv.io — the shared brain for collective intelligence
"#, hours = hours, count = filtered.len(), @@ -6062,8 +6117,8 @@ async fn notify_digest( ); let subject = match topic { - Some(t) => format!("[pi.ruv.io/discovery] Daily Digest: {}", t), - None => "[pi.ruv.io/discovery] Daily Discovery Digest".into(), + Some(t) => format!("[pi brain] {} — {} new discoveries", t, filtered.len()), + None => format!("[pi brain] {} new discoveries today", filtered.len()), }; match notifier.send("discovery", &subject, &html).await { diff --git a/docs/adr/ADR-148-brain-hypothesis-engine.md b/docs/adr/ADR-148-brain-hypothesis-engine.md new file mode 100644 index 000000000..64bb8dead --- /dev/null +++ b/docs/adr/ADR-148-brain-hypothesis-engine.md @@ -0,0 +1,238 @@ +# ADR-148: Brain Hypothesis Engine — Self-Improving Knowledge System with Gemini, DiskANN, and Auto-Experimentation + +## Status + +Proposed + +## Date + +2026-04-13 + +## Context + +The pi.ruv.io brain (10,300+ memories, 38M graph edges, LoRA epoch 41) stores and retrieves knowledge but cannot: +1. Generate hypotheses from cross-domain connections +2. Evaluate quality beyond embedding similarity (quality scores mostly 0.0) +3. Filter noise from curated knowledge (random IEEE events alongside real patterns) +4. Measure whether LoRA training actually improves search quality + +The brain runs on Google Cloud Run (`ruvbrain` service, us-central1) backed by `crates/mcp-brain-server/` (Rust/Axum). Current embedding: `ruvllm::RlmEmbedder` at 128-dim. Current index: flat HNSW. + +## Decision + +Add four capabilities as **additive layers** — no changes to the running brain's core path. All new code is behind feature flags or in separate Cloud Run services. 
+ +### Architecture: Three New Components + +``` +┌─────────────────────────────────────────────────────────┐ +│ EXISTING (untouched) │ +│ mcp-brain-server: store, search, graph, drift, LoRA │ +│ Embedder: ruvllm::RlmEmbedder (128-dim) │ +│ Index: flat HNSW │ +└──────────────┬──────────────────────────────────────────┘ + │ (reads from, writes back to) + v +┌─────────────────────────────────────────────────────────┐ +│ NEW: Hypothesis Engine (separate Cloud Run service) │ +│ │ +│ 1. HYPOTHESIS GENERATOR │ +│ - Watches for new cross-domain graph edges │ +│ - Templates: "If X works in domain A, │ +│ then X should work in domain B" │ +│ - Uses Gemini 2.5 Flash for hypothesis formulation │ +│ and experiment design │ +│ - Stores hypotheses as "untested" memories │ +│ │ +│ 2. QUALITY SCORER │ +│ - DiskANN index over all 10K+ memory embeddings │ +│ - PageRank via ruvector-solver ForwardPush │ +│ - Multi-signal: centrality + citations + verdicts │ +│ + contributor rep + temporal + surprise │ +│ - Updates quality field via brain API │ +│ │ +│ 3. NOISE FILTER │ +│ - Ingestion gate: regex + embedding dedup │ +│ - Weekly cleanup: archive orphan low-quality │ +│ - Meta-mincut: ruvector-mincut on knowledge graph │ +│ to find noise partition │ +│ │ +│ 4. 
BENCHMARK SUITE │ +│ - 50 curated test queries with known-good answers │ +│ - Runs before/after each LoRA epoch │ +│ - Tracks MRR, precision@5, cross-domain recall │ +│ - Auto-rollback if MRR drops > 5% │ +│ │ +└─────────────────────────────────────────────────────────┘ +``` + +### Component Details + +#### Gemini 2.5 Flash for Hypothesis Generation + +**Why Gemini, not local LLM:** +- Hypothesis generation is infrequent (triggered by new cross-domain edges, ~10/day) +- Requires reasoning about domain transfer ("if mincut detects seizures, could it detect X?") +- Gemini 2.5 Flash: fast, cheap (~$0.15/1M input tokens), 1M context window +- Local RLM embedder stays for indexing (it's tuned to the corpus) — Gemini is for reasoning only + +**API integration:** +```rust +// New module: crates/mcp-brain-server/src/hypothesis.rs +// Feature-gated: #[cfg(feature = "hypothesis")] + +use google_generativeai::Client; // or raw REST via reqwest + +async fn generate_hypothesis(edge: &CrossDomainEdge) -> Hypothesis { + let prompt = format!( + "Given this cross-domain connection:\n\ + Domain A: {}\nDomain B: {}\nBridge concept: {}\n\n\ + Generate a testable hypothesis: if the pattern from domain A \ + works, what specific prediction does it make in domain B? \ + Include: hypothesis statement, test method, expected outcome, \ + null hypothesis, required data.", + edge.domain_a, edge.domain_b, edge.bridge_concept + ); + // Call Gemini 2.5 Flash + let response = gemini_client.generate(&prompt).await?; + parse_hypothesis(response) +} +``` + +**Cost estimate:** ~10 hypotheses/day × ~500 tokens each = ~5K tokens/day = ~$0.001/day. Negligible. 
+ +#### DiskANN for Scalable Quality Scoring + +**Why DiskANN, not current flat HNSW:** +- Current HNSW is in-memory, fine for 10K memories +- At 100K+ memories (projected within months), memory pressure becomes real +- DiskANN stores the graph on SSD, loads only neighbors on demand +- Product Quantization (PQ) compresses vectors 4-8x for candidate filtering +- `ruvector-diskann` already implements Vamana graph + PQ (ADR-146) + +**Integration plan:** +```rust +// New module: crates/mcp-brain-server/src/diskann_index.rs +// Feature-gated: #[cfg(feature = "diskann")] + +use ruvector_diskann::{DiskAnnIndex, DiskAnnConfig}; + +pub struct HybridIndex { + hnsw: HnswIndex, // Existing, stays as primary for <50K + diskann: DiskAnnIndex, // New, activates at >50K memories + threshold: usize, // Switch point (default: 50_000) +} + +impl HybridIndex { + pub fn search(&self, query: &[f32], k: usize) -> Vec<(usize, f32)> { + if self.hnsw.len() < self.threshold { + self.hnsw.search(query, k) + } else { + self.diskann.search(query, k) + } + } +} +``` + +**Benchmark plan:** Run both HNSW and DiskANN on the current 10K corpus, measure: +- Recall@10 (should be >95% for both) +- Query latency (HNSW: ~1ms, DiskANN: ~5-10ms expected) +- Memory usage (HNSW: ~50MB, DiskANN: ~5MB + SSD) +- Index build time + +#### Quality Scorer with ForwardPush PageRank + +```rust +// crates/mcp-brain-server/src/quality.rs + +pub fn compute_quality_scores(brain: &Brain) -> Vec<(MemoryId, f64)> { + // 1. Build CSR graph from memory edges + let graph = brain.graph_to_csr(); + + // 2. Run ForwardPush PageRank (sublinear, O(1/epsilon)) + let pr = ForwardPushSolver::new(0.85, 0.001); + let pagerank = pr.solve(&graph)?; + + // 3. 
Compute multi-signal quality + brain.memories().map(|m| { + let centrality = pagerank[m.id]; + let citations = m.inbound_edge_count as f64 / max_citations; + let verdict = match m.verdict { + Confirmed => 1.0, + Refuted => -0.5, + Untested => 0.0, + }; + let surprise = 1.0 - m.max_similarity_to_existing; + let temporal = recency_weight(m.created_at); + let bridge = if m.crosses_domains { 0.3 } else { 0.0 }; + + let quality = 0.25 * centrality + + 0.20 * citations + + 0.20 * verdict + + 0.15 * surprise + + 0.10 * temporal + + 0.10 * bridge; + + (m.id, quality.clamp(0.0, 1.0)) + }).collect() +} +``` + +### Safety Constraints (don't break the running system) + +1. **All new code is feature-gated.** The existing `mcp-brain-server` binary is unchanged unless `--features hypothesis,diskann,benchmark` is explicitly enabled. + +2. **Hypothesis engine runs as a SEPARATE Cloud Run service.** It calls the brain's API; it doesn't modify the brain's process. If it crashes, the brain keeps running. + +3. **DiskANN is a fallback, not a replacement.** HNSW stays as primary for <50K memories. DiskANN only activates when memory count exceeds the threshold. Both can be queried in parallel for benchmark comparison. + +4. **Quality scores are written to a NEW field (`quality_v2`).** The existing `quality` field is untouched until v2 scores are validated. + +5. **Noise filtering is archive-only.** Memories are archived (moved to cold storage), never deleted. Full rollback possible. + +6. **Benchmark auto-rollback.** If LoRA epoch N+1 degrades MRR by >5%, the epoch is discarded and the EWC checkpoint is restored automatically. + +7. **Gemini API key stored in gcloud secrets.** Already available as `GEMINI_API_KEY`. Rate-limited to 10 calls/hour to avoid cost surprises. 
+
+### Implementation Phases
+
+| Phase | What | Risk | Timeline |
+|-------|------|------|----------|
+| **P0: ADR + Branch** | This document + feature branch | None | Done |
+| **P1: Benchmark suite** | 50 test queries, MRR tracking | None (read-only) | 3 days |
+| **P2: Quality scorer** | PageRank + multi-signal scoring | Low (writes to new field) | 1 week |
+| **P3: Noise filter** | Ingestion gate + weekly cleanup | Low (archive-only) | 3 days |
+| **P4: DiskANN integration** | Hybrid index behind feature flag | Low (fallback only) | 1 week |
+| **P5: Hypothesis engine** | Gemini integration + auto-test | Medium (new service) | 2 weeks |
+
+**Total: ~5 weeks, phased. P1-P3 can run in parallel.**
+
+## Consequences
+
+### Positive
+- Brain evolves from "smart database" to "scientific reasoner"
+- Quality scores become meaningful (currently all 0.0)
+- Noise filtering reduces graph pollution
+- LoRA training becomes measurable and rollback-safe
+- DiskANN prepares for 100K+ memory scale
+- Gemini hypothesis generation is the first step toward autonomous discovery
+
+### Negative
+- New dependency: Google Gemini API (adds cost; the input-token estimate above is ~$0.001/day, budgeted conservatively at ~$0.03/day to cover output tokens and retries)
+- DiskANN adds complexity to the index path
+- Hypothesis engine needs curation — false hypotheses could pollute if not filtered
+- More Cloud Run services to monitor
+
+### Risks
+- Gemini may generate low-quality hypotheses → mitigated by verdict system (untested until confirmed)
+- DiskANN recall may be lower than HNSW at small corpus → mitigated by hybrid approach with threshold
+- Quality scoring may be gamed by circular citations → mitigated by PageRank dampening
+
+## References
+
+- ADR-146: DiskANN Vamana Implementation
+- ADR-131: Consciousness Metrics Crate
+- ADR-048: Sublinear Graph Attention
+- Subramanya et al., "DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node" (NeurIPS 2019)
+- Google Gemini API: https://ai.google.dev/gemini-api
+- ForwardPush PPR: Andersen, Chung, Lang, "Local Graph Partitioning using PageRank Vectors" (FOCS 2006)