kenforthewin · bk-ty · May 1, 2026 · May 1, 2026 · May 1, 2026 · May 1, 2026
diff --git a/crates/atomic-core/src/boilerplate.rs b/crates/atomic-core/src/boilerplate.rs
@@ -0,0 +1,156 @@
+//! Boilerplate-aware embedding filter.
+//!
+//! Detects chunks shared across multiple atoms and excludes them from
+//! semantic search vectors (vec_chunks). The stored atom content
+//! (atom_chunks.content) is never modified — only the embeddings change.
+
+use sha2::{Digest, Sha256};
+use std::collections::{HashMap, HashSet};
+
+/// Normalize chunk text for boilerplate fingerprinting.
+///
+/// For each line: drop leading whitespace, strip the leading run of `#`
+/// (markdown heading markers, also when the heading is indented), then
+/// collapse every whitespace run to a single space and lowercase the result.
+/// A leading `#` is stripped even without a following space (e.g. `#tag`).
+pub(crate) fn normalize_for_dedup(text: &str) -> String {
+    text.lines()
+        .map(|line| line.trim_start().trim_start_matches('#'))
+        .flat_map(str::split_whitespace)
+        .collect::<Vec<_>>()
+        .join(" ")
+        .to_lowercase()
+}
+
+/// Fingerprint a chunk: lowercase hex SHA-256 of its normalized text.
+/// Normalization is [`normalize_for_dedup`], so chunks differing only in
+/// case, whitespace, or leading `#` markers share one 64-character digest.
+pub(crate) fn content_hash(text: &str) -> String {
+    let digest = Sha256::digest(normalize_for_dedup(text).as_bytes());
+    format!("{digest:x}")
+}
+
+/// Pick out the chunks of one atom that count as shared boilerplate.
+///
+/// `counts` maps a [`content_hash`] digest to the number of distinct atoms
+/// containing that chunk; digests missing from the map count as zero. An
+/// index is returned when its count is >= `min_atom_threshold`; a threshold
+/// of zero or less disables filtering.
+/// **Fallback:** if all chunks qualify, returns an empty set so an atom made
+/// entirely of boilerplate still gets embedded.
+pub(crate) fn boilerplate_indices(
+    chunks: &[String],
+    counts: &HashMap<String, i64>,
+    min_atom_threshold: i64,
+) -> HashSet<usize> {
+    let mut flagged = HashSet::new();
+    if min_atom_threshold <= 0 {
+        return flagged;
+    }
+    for (idx, chunk) in chunks.iter().enumerate() {
+        let seen_in = counts.get(&content_hash(chunk)).copied().unwrap_or(0);
+        if seen_in >= min_atom_threshold {
+            flagged.insert(idx);
+        }
+    }
+    // Never strip everything: an all-boilerplate atom keeps all its chunks.
+    if flagged.len() == chunks.len() {
+        flagged.clear();
+    }
+    flagged
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_normalize_strips_heading_markers() {
+        assert_eq!(normalize_for_dedup("# My Header"), "my header");
+        assert_eq!(normalize_for_dedup("## Section"), "section");
+    }
+
+    #[test]
+    fn test_normalize_collapses_whitespace() {
+        assert_eq!(normalize_for_dedup("  hello   world  "), "hello world");
+    }
+
+    #[test]
+    fn test_normalize_lowercases() {
+        assert_eq!(normalize_for_dedup("Hello World"), "hello world");
+    }
+
+    #[test]
+    fn test_content_hash_deterministic() {
+        let h1 = content_hash("# My Header");
+        let h2 = content_hash("# My Header");
+        assert_eq!(h1, h2);
+        assert_eq!(h1.len(), 64); // 32-byte SHA-256 digest → 64 hex chars
+    }
+
+    #[test]
+    fn test_content_hash_normalizes_heading_variants() {
+        // Heading level (`#` vs `##`) is stripped by normalization → same hash
+        let h1 = content_hash("# Terms of Service");
+        let h2 = content_hash("## Terms of Service");
+        assert_eq!(h1, h2);
+    }
+
+    #[test]
+    fn test_boilerplate_indices_all_unique() {
+        let chunks = vec![
+            "unique content a".to_string(),
+            "unique content b".to_string(),
+        ];
+        let counts: HashMap<String, i64> = HashMap::new(); // every lookup counts as 0
+        let indices = boilerplate_indices(&chunks, &counts, 5);
+        assert!(indices.is_empty());
+    }
+
+    #[test]
+    fn test_boilerplate_indices_shared_chunks() {
+        let chunks = vec![
+            "shared header".to_string(),
+            "unique body content".to_string(),
+            "shared footer".to_string(),
+        ];
+        let mut counts = HashMap::new();
+        counts.insert(content_hash("shared header"), 10i64);
+        counts.insert(content_hash("shared footer"), 8i64);
+        let indices = boilerplate_indices(&chunks, &counts, 5);
+        assert_eq!(indices, HashSet::from([0, 2])); // body chunk (index 1) is kept
+    }
+
+    #[test]
+    fn test_boilerplate_indices_fallback_all_boilerplate() {
+        let chunks = vec![
+            "shared chunk a".to_string(),
+            "shared chunk b".to_string(),
+        ];
+        let mut counts = HashMap::new();
+        counts.insert(content_hash("shared chunk a"), 20i64);
+        counts.insert(content_hash("shared chunk b"), 15i64);
+        // All chunks are boilerplate → fallback: return empty set
+        let indices = boilerplate_indices(&chunks, &counts, 5);
+        assert!(indices.is_empty(), "should fall back to empty when all chunks are boilerplate");
+    }
+
+    #[test]
+    fn test_boilerplate_below_threshold_not_filtered() {
+        let chunks = vec!["shared".to_string(), "unique".to_string()];
+        let mut counts = HashMap::new();
+        counts.insert(content_hash("shared"), 3i64); // below threshold of 5
+        let indices = boilerplate_indices(&chunks, &counts, 5);
+        assert!(indices.is_empty()); // two chunks, so the fallback cannot mask a hit
+    }
+
+    #[test]
+    fn test_boilerplate_threshold_zero_disabled() {
+        let chunks = vec!["any content".to_string()];
+        let mut counts = HashMap::new();
+        counts.insert(content_hash("any content"), 100i64);
+        // threshold = 0 means disabled → nothing filtered
+        let indices = boilerplate_indices(&chunks, &counts, 0);
+        assert!(indices.is_empty());
+    }
+}
diff --git a/crates/atomic-core/src/briefing/agentic.rs b/crates/atomic-core/src/briefing/agentic.rs
@@ -650,6 +650,7 @@ mod tests {
                 tagging_status: "complete".to_string(),
                 embedding_error: None,
                 tagging_error: None,
+                is_locked: false,
             },
             tags: vec![],
         }

diff --git a/crates/atomic-core/src/briefing/mod.rs b/crates/atomic-core/src/briefing/mod.rs
@@ -103,10 +103,25 @@ pub async fn run_briefing(
     }
 
     // Run the agent loop.
-    let (content, citations) = agentic::generate(core, &since, &new_atoms, total_new)
+    let (mut content, citations) = agentic::generate(core, &since, &new_atoms, total_new)
         .await
         .map_err(AtomicCoreError::Wiki)?;
 
+    // Best-effort: append a health summary when the score is below 85; errors are ignored.
+    if let Ok(report) = crate::health::compute_health(core).await {
+        if report.overall_score < 85 {
+            let health_section = format!(
+                "\n\n## Knowledge Health\n\n\
+                Your knowledge base health score is **{}/100** ({}).\n\n\
+                {} issues can be auto-fixed via the dashboard.",
+                report.overall_score,
+                report.overall_status,
+                report.auto_fixable
+            );
+            content.push_str(&health_section);
+        }
+    }
+
     let id = uuid::Uuid::new_v4().to_string();
     let now = Utc::now().to_rfc3339();
     let briefing = Briefing {

diff --git a/crates/atomic-core/src/db.rs b/crates/atomic-core/src/db.rs
@@ -211,7 +211,7 @@ impl Database {
     ///   1. Add a new `if version < N` block at the end (before the virtual-table section)
     ///   2. End the block with `PRAGMA user_version = N;`
     ///   3. Bump LATEST_VERSION
-    const LATEST_VERSION: i32 = 16;
+    const LATEST_VERSION: i32 = 21;
 
     pub fn run_migrations(conn: &Connection) -> Result<(), AtomicCoreError> {
         Self::run_migrations_internal(conn, false)
@@ -816,6 +816,119 @@ impl Database {
             conn.execute_batch("PRAGMA user_version = 16;")?;
         }
 
+        // --- V16 → V17: Knowledge health tables ---
+        if version < 17 {
+            conn.execute_batch(
+                r#"
+                CREATE TABLE IF NOT EXISTS health_reports (
+                    id TEXT PRIMARY KEY,
+                    computed_at TEXT NOT NULL,
+                    overall_score INTEGER NOT NULL,
+                    check_scores TEXT NOT NULL,
+                    atom_count INTEGER NOT NULL,
+                    auto_fixes_applied INTEGER NOT NULL DEFAULT 0,
+                    report_json TEXT NOT NULL
+                );
+                CREATE INDEX IF NOT EXISTS idx_health_reports_computed
+                    ON health_reports(computed_at DESC);
+
+                CREATE TABLE IF NOT EXISTS health_fix_log (
+                    id TEXT PRIMARY KEY,
+                    check_name TEXT NOT NULL,
+                    action TEXT NOT NULL,
+                    tier TEXT NOT NULL,
+                    atom_ids TEXT,
+                    tag_ids TEXT,
+                    before_state TEXT NOT NULL DEFAULT '{}',
+                    after_state TEXT NOT NULL DEFAULT '{}',
+                    llm_prompt TEXT,
+                    llm_response TEXT,
+                    executed_at TEXT NOT NULL,
+                    undone_at TEXT
+                );
+                CREATE INDEX IF NOT EXISTS idx_health_fix_log_executed
+                    ON health_fix_log(executed_at DESC);
+                CREATE INDEX IF NOT EXISTS idx_health_fix_log_check
+                    ON health_fix_log(check_name);
+
+                PRAGMA user_version = 17;
+                "#,
+            )?;
+        }
+
+        // --- V17 → V18: content_hash column on atom_chunks for boilerplate detection ---
+        if version < 18 {
+            // SQLite has no ADD COLUMN IF NOT EXISTS, so the result is discarded to survive
+            // re-runs. NOTE(review): this swallows every error, not just "duplicate column".
+            let _ = conn.execute(
+                "ALTER TABLE atom_chunks ADD COLUMN content_hash TEXT",
+                [],
+            );
+            conn.execute_batch(
+                r#"
+                CREATE INDEX IF NOT EXISTS idx_atom_chunks_content_hash
+                    ON atom_chunks(content_hash);
+                PRAGMA user_version = 18;
+                "#,
+            )?;
+        }
+
+        // --- V18 → V19: persistent dismissals for the review queue ---
+        if version < 19 {
+            conn.execute_batch(
+                r#"
+                CREATE TABLE IF NOT EXISTS health_dismissals (
+                    id TEXT PRIMARY KEY,
+                    check_name TEXT NOT NULL,
+                    item_key TEXT NOT NULL,
+                    reason TEXT NOT NULL,
+                    dismissed_at TEXT NOT NULL,
+                    expires_at TEXT
+                );
+                CREATE UNIQUE INDEX IF NOT EXISTS idx_health_dismissals_lookup
+                    ON health_dismissals(check_name, item_key);
+                PRAGMA user_version = 19;
+                "#,
+            )?;
+        }
+
+        // --- V19 → V20: tag_proposals table ---
+        if version < 20 {
+            conn.execute_batch(
+                r#"
+                CREATE TABLE IF NOT EXISTS tag_proposals (
+                    id TEXT PRIMARY KEY,
+                    summary TEXT NOT NULL,
+                    actions_json TEXT NOT NULL,
+                    created_at TEXT NOT NULL,
+                    applied_at TEXT
+                );
+                CREATE INDEX IF NOT EXISTS idx_tag_proposals_created
+                    ON tag_proposals(created_at DESC);
+                PRAGMA user_version = 20;
+                "#,
+            )?;
+        }
+
+        // --- V20 → V21: atoms.is_locked flag ---
+        //
+        // Locked atoms are protected from automated mutation by health fixes
+        // (strip-boilerplate, auto-merge-duplicate, auto-resolve-contradiction,
+        // relink-broken-link). They remain readable and editable through the
+        // normal UI. Use for source-of-truth material (books, studies, primary
+        // research) where automated "correction" would do more harm than good.
+        if version < 21 {
+            // ALTER TABLE ADD COLUMN has no IF NOT EXISTS in SQLite. The result is
+            // discarded so the migration stays idempotent when a test resets
+            // user_version to a pre-V21 value on an already-migrated DB.
+            // NOTE(review): this swallows every error, not just "duplicate column".
+            let _ = conn.execute(
+                "ALTER TABLE atoms ADD COLUMN is_locked INTEGER NOT NULL DEFAULT 0",
+                [],
+            );
+            conn.execute_batch("PRAGMA user_version = 21;")?;
+        }
+
         // --- Triggers (recreated every startup to stay current) ---
         conn.execute_batch(
             "DROP TRIGGER IF EXISTS atom_tags_insert_count;
@@ -1012,6 +1125,40 @@ impl Database {
         // legacy seed rows so the resolver's "any per-DB row is an override"
         // rule stays correct.
 
+        // ---------------------------------------------------------------
+        // Self-healing: idempotent column checks.
+        //
+        // Runs on every migration pass regardless of version. Exists because
+        // a rebase/renumber in the past let some DBs tick their user_version
+        // past the migration that added `tags.autotag_description` without
+        // ever executing the ALTER. Any query joining that column then errors
+        // at runtime ("no such column: t.autotag_description"). Cheap enough
+        // to always verify; keeps migration drift from bricking a DB.
+        //
+        // When adding a new column, prefer listing it here in addition to the
+        // versioned migration step — belt and braces.
+        const EXPECTED_COLUMNS: &[(&str, &str, &str)] = &[
+            // (table, column, DDL to add)
+            ("tags", "autotag_description", "ALTER TABLE tags ADD COLUMN autotag_description TEXT NOT NULL DEFAULT ''"),
+        ];
+        for (table, column, ddl) in EXPECTED_COLUMNS {
+            let has_col: bool = conn
+                .query_row(
+                    "SELECT 1 FROM pragma_table_info(?1) WHERE name = ?2",
+                    rusqlite::params![table, column],
+                    |_| Ok(true),
+                )
+                .unwrap_or(false);
+            if !has_col {
+                tracing::warn!(
+                    table,
+                    column,
+                    "healing missing column (migration drift); running late ALTER"
+                );
+                conn.execute_batch(ddl)?;
+            }
+        }
+
         Ok(())
     }
 }