diff --git a/crates/atomic-core/src/boilerplate.rs b/crates/atomic-core/src/boilerplate.rs new file mode 100644 index 00000000..5c42f30e --- /dev/null +++ b/crates/atomic-core/src/boilerplate.rs @@ -0,0 +1,156 @@ +//! Boilerplate-aware embedding filter. +//! +//! Detects chunks shared across multiple atoms and excludes them from +//! semantic search vectors (vec_chunks). The stored atom content +//! (atom_chunks.content) is never modified — only the embeddings change. + +use sha2::{Digest, Sha256}; +use std::collections::{HashMap, HashSet}; + +/// Normalize chunk text for boilerplate fingerprinting. +/// Strips markdown heading markers, collapses whitespace, lowercases. +pub(crate) fn normalize_for_dedup(text: &str) -> String { + let stripped: String = text + .lines() + .map(|l| l.trim_start_matches('#').trim()) + .collect::>() + .join(" "); + stripped + .split_whitespace() + .collect::>() + .join(" ") + .to_lowercase() +} + +/// Compute SHA-256 hex digest of the normalized chunk text. +pub(crate) fn content_hash(text: &str) -> String { + let normalized = normalize_for_dedup(text); + let mut hasher = Sha256::new(); + hasher.update(normalized.as_bytes()); + format!("{:x}", hasher.finalize()) +} + +/// Given a map of `hash → distinct_atom_count`, return the set of chunk +/// indices that are boilerplate (count >= min_atom_threshold). +/// +/// **Fallback:** if every chunk would be filtered, returns an empty set +/// so atoms with 100% boilerplate content still get embedded. +pub(crate) fn boilerplate_indices( + chunks: &[String], + counts: &HashMap, + min_atom_threshold: i64, +) -> HashSet { + if min_atom_threshold <= 0 { + return HashSet::new(); + } + let indices: HashSet = chunks + .iter() + .enumerate() + .filter_map(|(i, chunk)| { + let h = content_hash(chunk); + let count = counts.get(&h).copied().unwrap_or(0); + (count >= min_atom_threshold).then_some(i) + }) + .collect(); + // Fallback: never strip all chunks + if indices.len() == chunks.len() && !chunks.is_empty() { + HashSet::new() + } else { + indices + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_normalize_strips_heading_markers() { + assert_eq!(normalize_for_dedup("# My Header"), "my header"); + assert_eq!(normalize_for_dedup("## Section"), "section"); + } + + #[test] + fn test_normalize_collapses_whitespace() { + assert_eq!(normalize_for_dedup(" hello world "), "hello world"); + } + + #[test] + fn test_normalize_lowercases() { + assert_eq!(normalize_for_dedup("Hello World"), "hello world"); + } + + #[test] + fn test_content_hash_deterministic() { + let h1 = content_hash("# My Header"); + let h2 = content_hash("# My Header"); + assert_eq!(h1, h2); + assert_eq!(h1.len(), 64); // SHA-256 hex + } + + #[test] + fn test_content_hash_normalizes_heading_variants() { + // Different markdown levels with same text → same hash after normalization + let h1 = content_hash("# Terms of Service"); + let h2 = content_hash("## Terms of Service"); + assert_eq!(h1, h2); + } + + #[test] + fn test_boilerplate_indices_all_unique() { + let chunks = vec![ + "unique content a".to_string(), + "unique content b".to_string(), + ]; + let counts: HashMap = HashMap::new(); + let indices = boilerplate_indices(&chunks, &counts, 5); + assert!(indices.is_empty()); + } + + #[test] + fn test_boilerplate_indices_shared_chunks() { + let chunks = vec![ + "shared header".to_string(), + "unique body content".to_string(), + "shared footer".to_string(), + ]; + let mut counts = HashMap::new(); + counts.insert(content_hash("shared header"), 10i64); + counts.insert(content_hash("shared footer"), 8i64); + let indices = boilerplate_indices(&chunks, &counts, 5); + assert_eq!(indices, HashSet::from([0, 2])); + } + + #[test] + fn test_boilerplate_indices_fallback_all_boilerplate() { + let chunks = vec![ + "shared chunk a".to_string(), + "shared chunk b".to_string(), + ]; + let mut counts = HashMap::new(); + counts.insert(content_hash("shared chunk a"), 20i64); + counts.insert(content_hash("shared chunk b"), 15i64); + // All chunks are boilerplate → fallback: return empty set + let indices = boilerplate_indices(&chunks, &counts, 5); + assert!(indices.is_empty(), "should fall back to empty when all chunks are boilerplate"); + } + + #[test] + fn test_boilerplate_below_threshold_not_filtered() { + let chunks = vec!["shared header".to_string()]; + let mut counts = HashMap::new(); + counts.insert(content_hash("shared header"), 3i64); // below threshold of 5 + let indices = boilerplate_indices(&chunks, &counts, 5); + assert!(indices.is_empty()); + } + + #[test] + fn test_boilerplate_threshold_zero_disabled() { + let chunks = vec!["any content".to_string()]; + let mut counts = HashMap::new(); + counts.insert(content_hash("any content"), 100i64); + // threshold = 0 means disabled → nothing filtered + let indices = boilerplate_indices(&chunks, &counts, 0); + assert!(indices.is_empty()); + } +} diff --git a/crates/atomic-core/src/briefing/agentic.rs b/crates/atomic-core/src/briefing/agentic.rs index e2b5fc69..99558484 100644 --- a/crates/atomic-core/src/briefing/agentic.rs +++ b/crates/atomic-core/src/briefing/agentic.rs @@ -650,6 +650,7 @@ mod tests { tagging_status: "complete".to_string(), embedding_error: None, tagging_error: None, + is_locked: false, }, tags: vec![], } diff --git a/crates/atomic-core/src/briefing/mod.rs b/crates/atomic-core/src/briefing/mod.rs index 35d8f14d..adc9eafc 100644 --- a/crates/atomic-core/src/briefing/mod.rs +++ b/crates/atomic-core/src/briefing/mod.rs @@ -103,10 +103,25 @@ pub async fn run_briefing( } // Run the agent loop. - let (content, citations) = agentic::generate(core, &since, &new_atoms, total_new) + let (mut content, citations) = agentic::generate(core, &since, &new_atoms, total_new) .await .map_err(AtomicCoreError::Wiki)?; + // Append health summary when score is concerning + if let Ok(report) = crate::health::compute_health(core).await { + if report.overall_score < 85 { + let health_section = format!( + "\n\n## Knowledge Health\n\n\ + Your knowledge base health score is **{}/100** ({}).\n\n\ + {} issues can be auto-fixed via the dashboard.", + report.overall_score, + report.overall_status, + report.auto_fixable + ); + content.push_str(&health_section); + } + } + let id = uuid::Uuid::new_v4().to_string(); let now = Utc::now().to_rfc3339(); let briefing = Briefing { diff --git a/crates/atomic-core/src/db.rs b/crates/atomic-core/src/db.rs index a1d37f03..99450a5c 100644 --- a/crates/atomic-core/src/db.rs +++ b/crates/atomic-core/src/db.rs @@ -211,7 +211,7 @@ impl Database { /// 1. Add a new `if version < N` block at the end (before the virtual-table section) /// 2. End the block with `PRAGMA user_version = N;` /// 3. Bump LATEST_VERSION - const LATEST_VERSION: i32 = 16; + const LATEST_VERSION: i32 = 21; pub fn run_migrations(conn: &Connection) -> Result<(), AtomicCoreError> { Self::run_migrations_internal(conn, false) @@ -816,6 +816,119 @@ impl Database { conn.execute_batch("PRAGMA user_version = 16;")?; } + // --- V16 → V17: Knowledge health tables --- + if version < 17 { + conn.execute_batch( + r#" + CREATE TABLE IF NOT EXISTS health_reports ( + id TEXT PRIMARY KEY, + computed_at TEXT NOT NULL, + overall_score INTEGER NOT NULL, + check_scores TEXT NOT NULL, + atom_count INTEGER NOT NULL, + auto_fixes_applied INTEGER NOT NULL DEFAULT 0, + report_json TEXT NOT NULL + ); + CREATE INDEX IF NOT EXISTS idx_health_reports_computed + ON health_reports(computed_at DESC); + + CREATE TABLE IF NOT EXISTS health_fix_log ( + id TEXT PRIMARY KEY, + check_name TEXT NOT NULL, + action TEXT NOT NULL, + tier TEXT NOT NULL, + atom_ids TEXT, + tag_ids TEXT, + before_state TEXT NOT NULL DEFAULT '{}', + after_state TEXT NOT NULL DEFAULT '{}', + llm_prompt TEXT, + llm_response TEXT, + executed_at TEXT NOT NULL, + undone_at TEXT + ); + CREATE INDEX IF NOT EXISTS idx_health_fix_log_executed + ON health_fix_log(executed_at DESC); + CREATE INDEX IF NOT EXISTS idx_health_fix_log_check + ON health_fix_log(check_name); + + PRAGMA user_version = 17; + "#, + )?; + } + + // --- V17 → V18: content_hash column on atom_chunks for boilerplate detection --- + if version < 18 { + // ALTER TABLE ADD COLUMN has no IF NOT EXISTS in SQLite; ignore the error + // if the column was already added (e.g. during a test migration re-run). + let _ = conn.execute( + "ALTER TABLE atom_chunks ADD COLUMN content_hash TEXT", + [], + ); + conn.execute_batch( + r#" + CREATE INDEX IF NOT EXISTS idx_atom_chunks_content_hash + ON atom_chunks(content_hash); + PRAGMA user_version = 18; + "#, + )?; + } + + // --- V18 → V19: persistent dismissals for the review queue --- + if version < 19 { + conn.execute_batch( + r#" + CREATE TABLE IF NOT EXISTS health_dismissals ( + id TEXT PRIMARY KEY, + check_name TEXT NOT NULL, + item_key TEXT NOT NULL, + reason TEXT NOT NULL, + dismissed_at TEXT NOT NULL, + expires_at TEXT + ); + CREATE UNIQUE INDEX IF NOT EXISTS idx_health_dismissals_lookup + ON health_dismissals(check_name, item_key); + PRAGMA user_version = 19; + "#, + )?; + } + + // --- V19 → V20: tag_proposals table --- + if version < 20 { + conn.execute_batch( + r#" + CREATE TABLE IF NOT EXISTS tag_proposals ( + id TEXT PRIMARY KEY, + summary TEXT NOT NULL, + actions_json TEXT NOT NULL, + created_at TEXT NOT NULL, + applied_at TEXT + ); + CREATE INDEX IF NOT EXISTS idx_tag_proposals_created + ON tag_proposals(created_at DESC); + PRAGMA user_version = 20; + "#, + )?; + } + + // --- V20 → V21: atoms.is_locked flag --- + // + // Locked atoms are protected from automated mutation by health fixes + // (strip-boilerplate, auto-merge-duplicate, auto-resolve-contradiction, + // relink-broken-link). They remain readable and editable through the + // normal UI. Use for source-of-truth material (books, studies, primary + // research) where automated "correction" would do more harm than good. + if version < 21 { + // ALTER TABLE ADD COLUMN has no IF NOT EXISTS in SQLite. Ignore the + // "duplicate column" error so migration stays idempotent when a + // test resets user_version to a pre-V21 value on a DB whose table + // was already migrated by the initial open. + let _ = conn.execute( + "ALTER TABLE atoms ADD COLUMN is_locked INTEGER NOT NULL DEFAULT 0", + [], + ); + conn.execute_batch("PRAGMA user_version = 21;")?; + } + // --- Triggers (recreated every startup to stay current) --- conn.execute_batch( "DROP TRIGGER IF EXISTS atom_tags_insert_count; @@ -1012,6 +1125,40 @@ impl Database { // legacy seed rows so the resolver's "any per-DB row is an override" // rule stays correct. + // --------------------------------------------------------------- + // Self-healing: idempotent column checks. + // + // Runs on every migration pass regardless of version. Exists because + // a rebase/renumber in the past let some DBs tick their user_version + // past the migration that added `tags.autotag_description` without + // ever executing the ALTER. Any query joining that column then errors + // at runtime ("no such column: t.autotag_description"). Cheap enough + // to always verify; keeps migration drift from bricking a DB. + // + // When adding a new column, prefer listing it here in addition to the + // versioned migration step — belt and braces. + const EXPECTED_COLUMNS: &[(&str, &str, &str)] = &[ + // (table, column, DDL to add) + ("tags", "autotag_description", "ALTER TABLE tags ADD COLUMN autotag_description TEXT NOT NULL DEFAULT ''"), + ]; + for (table, column, ddl) in EXPECTED_COLUMNS { + let has_col: bool = conn + .query_row( + "SELECT 1 FROM pragma_table_info(?1) WHERE name = ?2", + rusqlite::params![table, column], + |_| Ok(true), + ) + .unwrap_or(false); + if !has_col { + tracing::warn!( + table, + column, + "healing missing column (migration drift); running late ALTER" + ); + conn.execute_batch(ddl)?; + } + } + Ok(()) } } diff --git a/crates/atomic-core/src/embedding.rs b/crates/atomic-core/src/embedding.rs index 71ddc019..28246bd9 100644 --- a/crates/atomic-core/src/embedding.rs +++ b/crates/atomic-core/src/embedding.rs @@ -582,16 +582,49 @@ async fn process_embedding_only_inner( return Ok(()); } - // Use adaptive batching so provider batch-size limits (e.g. DashScope's - // max 10) are handled by splitting, same as the bulk embedding path. - let pending: Vec = chunks + // ---- Boilerplate filtering ---- + // Exclude chunks shared across >= threshold distinct atoms from vec_chunks. + // They are still saved to atom_chunks (for FTS/display); only embedding is skipped. + let threshold = settings_map + .get("boilerplate_min_atom_count") + .and_then(|v| v.parse::().ok()) + .unwrap_or(5); + let boilerplate_set: std::collections::HashSet = if threshold > 0 && !chunks.is_empty() { + let hashes: Vec = chunks + .iter() + .map(|c| crate::boilerplate::content_hash(c)) + .collect(); + let counts = storage + .count_chunk_hash_occurrences_sync(&hashes) + .await + .unwrap_or_default(); + crate::boilerplate::boilerplate_indices(&chunks, &counts, threshold) + } else { + std::collections::HashSet::new() + }; + if !boilerplate_set.is_empty() { + tracing::debug!( + atom_id, + stripped = boilerplate_set.len(), + total = chunks.len(), + "Boilerplate filter: excluding shared chunks from embedding" + ); + } + + // Partition chunks: embed only non-boilerplate ones. + // Boilerplate chunks are saved to atom_chunks with empty embedding (skipped from vec_chunks). + let (embed_chunks, skip_chunks): (Vec<(usize, String)>, Vec<(usize, String)>) = chunks .into_iter() .enumerate() + .partition(|(index, _)| !boilerplate_set.contains(index)); + + let pending: Vec = embed_chunks + .iter() .map(|(index, chunk)| PendingChunk { atom_id: atom_id.to_string(), existing_chunk_id: None, - chunk_index: index, - content: chunk, + chunk_index: *index, + content: chunk.clone(), }) .collect(); @@ -607,12 +640,16 @@ async fn process_embedding_only_inner( } // Store chunks and embeddings - let chunks_with_embeddings: Vec<(String, Vec)> = embedded + // Boilerplate chunks (skip_chunks) are saved with empty vec → atom_chunks only, no vec_chunks. + let mut all_chunks_for_save: Vec<(String, Vec)> = embedded .into_iter() .map(|(chunk, emb)| (chunk.content, emb)) .collect(); + for (_, boilerplate_content) in skip_chunks { + all_chunks_for_save.push((boilerplate_content, vec![])); + } storage - .save_chunks_and_embeddings_sync(atom_id, &chunks_with_embeddings) + .save_chunks_and_embeddings_sync(atom_id, &all_chunks_for_save) .await .map_err(|e| format!("Failed to store chunks: {}", e))?; @@ -1708,6 +1745,53 @@ where chunks.sort_by_key(|chunk| chunk.chunk_index); } + + // ---- Boilerplate filtering for re-embed path ---- + { + let threshold = settings_map + .get("boilerplate_min_atom_count") + .and_then(|v| v.parse::().ok()) + .unwrap_or(5); + if threshold > 0 { + let all_hashes: Vec = { + let hash_set: std::collections::HashSet = atom_groups + .iter() + .flat_map(|(_, chunks)| chunks.iter().map(|c| crate::boilerplate::content_hash(&c.content))) + .collect(); + hash_set.into_iter().collect() + }; + let occurrence_counts = storage + .count_chunk_hash_occurrences_sync(&all_hashes) + .await + .unwrap_or_default(); + let mut boilerplate_chunk_ids: Vec = Vec::new(); + for (_, chunks) in &mut atom_groups { + let texts: Vec = chunks.iter().map(|c| c.content.clone()).collect(); + let bp_indices = crate::boilerplate::boilerplate_indices(&texts, &occurrence_counts, threshold); + if !bp_indices.is_empty() { + for idx in &bp_indices { + if let Some(chunk) = chunks.get(*idx) { + if let Some(ref id) = chunk.existing_chunk_id { + boilerplate_chunk_ids.push(id.clone()); + } + } + } + let kept: Vec = chunks + .drain(..) + .enumerate() + .filter(|(i, _)| !bp_indices.contains(i)) + .map(|(_, c)| c) + .collect(); + *chunks = kept; + } + } + if !boilerplate_chunk_ids.is_empty() { + if let Err(e) = storage.delete_vec_chunks_by_ids_sync(&boilerplate_chunk_ids).await { + tracing::warn!(error = %e, "Failed to delete boilerplate vec_chunks entries"); + } + } + } + } let mut chunk_groups: Vec)>> = Vec::new(); let mut current_group = Vec::new(); let mut current_chunk_count = 0usize; diff --git a/crates/atomic-core/src/health/audit.rs b/crates/atomic-core/src/health/audit.rs new file mode 100644 index 00000000..474a6414 --- /dev/null +++ b/crates/atomic-core/src/health/audit.rs @@ -0,0 +1,201 @@ +//! Fix audit log and undo capability. +//! +//! Every auto-fix action is recorded in `health_fix_log` with a JSON +//! `before_state` snapshot. `undo_fix` reads that snapshot and restores the +//! affected atoms / tags. + +use crate::error::AtomicCoreError; +use crate::AtomicCore; +use serde::{Deserialize, Serialize}; + +/// A persisted record of one fix action (stored in `health_fix_log`). +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthFixLog { + pub id: String, + pub check_name: String, + pub action: String, + /// "safe" | "low" | "medium" | "high" + pub tier: String, + /// JSON array of atom IDs touched by this fix. + pub atom_ids: Option>, + /// JSON array of tag IDs touched by this fix. + pub tag_ids: Option>, + /// Full JSON snapshot before the fix was applied. Used by undo. + pub before_state: String, + /// Full JSON snapshot after the fix was applied. + pub after_state: String, + pub llm_prompt: Option, + pub llm_response: Option, + pub executed_at: String, + pub undone_at: Option, +} + +/// Lightweight record stored in `health_reports`. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StoredHealthReport { + pub id: String, + pub computed_at: String, + pub overall_score: u32, + /// JSON map check_name → score. + pub check_scores: String, + pub atom_count: i32, + pub auto_fixes_applied: i32, + /// Full serialised `HealthReport`. + pub report_json: String, +} + +/// Record a fix action in the audit log, returning the generated `id`. +#[allow(clippy::too_many_arguments)] +pub async fn log_fix( + core: &AtomicCore, + check_name: &str, + action: &str, + tier: &str, + atom_ids: Option<&[String]>, + tag_ids: Option<&[String]>, + before_state: serde_json::Value, + after_state: serde_json::Value, + llm_prompt: Option<&str>, + llm_response: Option<&str>, +) -> Result { + let id = uuid::Uuid::new_v4().to_string(); + let log = HealthFixLog { + id: id.clone(), + check_name: check_name.to_string(), + action: action.to_string(), + tier: tier.to_string(), + atom_ids: atom_ids.map(|ids| ids.to_vec()), + tag_ids: tag_ids.map(|ids| ids.to_vec()), + before_state: serde_json::to_string(&before_state).unwrap_or_default(), + after_state: serde_json::to_string(&after_state).unwrap_or_default(), + llm_prompt: llm_prompt.map(|s| s.to_string()), + llm_response: llm_response.map(|s| s.to_string()), + executed_at: chrono::Utc::now().to_rfc3339(), + undone_at: None, + }; + core.storage().log_fix_action_sync(&log).await?; + Ok(id) +} + +/// Undo a previously applied fix using the stored `before_state` snapshot. +/// +/// Currently supports: +/// - Recreating deleted atoms (JSON array of `{id, content, source_url, tags}`) +/// - Recreating deleted tags (JSON array of `{id, name, parent_id}`) +/// - Restoring updated atom content (JSON array of `{id, content}`) +pub async fn undo(core: &AtomicCore, fix_id: &str) -> Result<(), AtomicCoreError> { + let log = core + .storage() + .get_fix_log_sync(fix_id) + .await? + .ok_or_else(|| { + AtomicCoreError::NotFound(format!("health fix log {fix_id} not found")) + })?; + + if log.undone_at.is_some() { + return Err(AtomicCoreError::Validation(format!( + "fix {fix_id} has already been undone" + ))); + } + + let before: serde_json::Value = serde_json::from_str(&log.before_state) + .unwrap_or(serde_json::json!({})); + + match log.action.as_str() { + "deleted_tags" => { + // before_state: [ { "id": "...", "name": "...", "parent_id": null } ] + if let Some(tags) = before.as_array() { + for tag in tags { + let name = tag + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or_default(); + let parent_id = tag.get("parent_id").and_then(|v| v.as_str()); + // Re-create tag (ID won't be preserved, but name + parent will match) + if !name.is_empty() { + let _ = core.storage().create_tag_impl(name, parent_id).await; + } + } + } + } + "deleted_atoms" => { + // before_state: [ { "id": "...", "content": "...", "source_url": null, "tag_ids": [...] } ] + if let Some(atoms) = before.as_array() { + for atom_snap in atoms { + let content = atom_snap + .get("content") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let source_url = atom_snap + .get("source_url") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + let tag_ids: Vec = atom_snap + .get("tag_ids") + .and_then(|v| v.as_array()) + .map(|arr| { + arr.iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .collect() + }) + .unwrap_or_default(); + let req = crate::CreateAtomRequest { + content, + source_url, + tag_ids, + ..Default::default() + }; + let _ = core.create_atom(req, |_| {}).await; + } + } + } + "updated_atoms" => { + // before_state: [ { "id": "...", "content": "...", "source_url": null } ] + if let Some(atoms) = before.as_array() { + for atom_snap in atoms { + let id = atom_snap + .get("id") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let content = atom_snap + .get("content") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let source_url = atom_snap + .get("source_url") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + if !id.is_empty() { + let upd = crate::UpdateAtomRequest { + content, + source_url, + published_at: None, + tag_ids: None, + }; + let _ = core.update_atom(&id, upd, |_| {}).await; + } + } + } + } + _ => { + tracing::warn!(action = %log.action, "undo not implemented for this action type"); + } + } + + // Mark fix as undone + core.storage().mark_fix_undone_sync(fix_id).await?; + Ok(()) +} + +/// Fetch recent fix log entries (most recent first). +pub async fn get_recent_fixes( + core: &AtomicCore, + limit: i32, +) -> Result, AtomicCoreError> { + core.storage().get_recent_fixes_sync(limit).await +} diff --git a/crates/atomic-core/src/health/checks.rs b/crates/atomic-core/src/health/checks.rs new file mode 100644 index 00000000..84cc853b --- /dev/null +++ b/crates/atomic-core/src/health/checks.rs @@ -0,0 +1,491 @@ +//! Individual health check implementations. +//! +//! Each check takes a `&HealthRawData` snapshot (fetched once) and returns a +//! `HealthCheckResult` with a 0–100 score and check-specific JSON data. + +use super::{DuplicatePair, HealthCheckResult, HealthThresholds, WikiGap, WikiStaleEntry}; +use crate::storage::sqlite::health::HealthRawData; +use serde_json::json; +use std::collections::HashMap; + +/// Run all 10 checks against pre-fetched raw data. +pub fn run_all( + raw: &HealthRawData, + thresholds: &HealthThresholds, +) -> HashMap { + let mut map = HashMap::new(); + map.insert("embedding_coverage".to_string(), embedding_coverage(raw)); + map.insert("tagging_coverage".to_string(), tagging_coverage(raw)); + map.insert("source_uniqueness".to_string(), source_uniqueness(raw)); + map.insert("orphan_tags".to_string(), orphan_tags(raw)); + map.insert( + "semantic_graph_freshness".to_string(), + semantic_graph_freshness(raw, thresholds), + ); + map.insert("wiki_coverage".to_string(), wiki_coverage(raw)); + map.insert("content_quality".to_string(), content_quality(raw)); + map.insert("tag_health".to_string(), tag_health(raw, thresholds)); + map.insert("content_overlap".to_string(), content_overlap(raw)); + map.insert( + "contradiction_detection".to_string(), + contradiction_detection(raw), + ); + // Diagnostic check — not included in CHECK_WEIGHTS, doesn't affect score. + // Surfaces boilerplate-dominated atoms to the UI without penalising the KB. + map.insert( + "boilerplate_pollution".to_string(), + boilerplate_pollution(raw, thresholds), + ); + map +} + +// ==================== Individual checks ==================== + +pub fn embedding_coverage(raw: &HealthRawData) -> HealthCheckResult { + let total = raw.total_atoms; + let complete = raw.embedding_complete; + let pending = raw.embedding_pending; + let processing = raw.embedding_processing; + let failed = raw.embedding_failed; + + let score = if total == 0 { + 100 + } else { + let pct = (complete as f64 / total as f64 * 100.0) as u32; + if failed > 0 { + pct.min(50) + } else { + pct + } + }; + let status = if score == 100 { + "ok" + } else if score >= 80 { + "warning" + } else { + "error" + }; + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: failed > 0 || pending > 0, + requires_review: false, + informational: false, + fix_action: Some("retry_failed_and_process_pending".to_string()), + data: json!({ + "total": total, + "complete": complete, + "pending": pending, + "processing": processing, + "failed": failed + }), + } +} + +pub fn tagging_coverage(raw: &HealthRawData) -> HealthCheckResult { + let total = raw.total_atoms; + let failed = raw.tagging_failed; + let pending = raw.tagging_pending; + let untagged = raw.untagged_complete; + // skipped_untagged: the tagger skipped these atoms AND they have 0 tags. + // Atoms with tagging_status='skipped' that DO have tags are fine — + // they were imported with existing tags and the tagger deliberately skipped. + let skipped_untagged = raw.skipped_untagged; + + // Only count actually-problematic atoms: failed, truly pending, complete-but-untagged, + // and skipped-with-no-tags. Skipped atoms that HAVE tags are fine. + let bad = (failed + pending + untagged + skipped_untagged).min(total); + let tagged = (total - bad).max(0); + + let score = if total == 0 { + 100 + } else { + (tagged as f64 / total as f64 * 100.0) as u32 + }; + let status = if score == 100 { + "ok" + } else if score >= 80 { + "warning" + } else { + "error" + }; + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: failed > 0 || pending > 0 || untagged > 0 || skipped_untagged > 0, + requires_review: false, + informational: false, + fix_action: Some("retry_tagging_pipeline".to_string()), + data: json!({ + "total": total, + "tagged": tagged, + "untagged_complete": untagged, + "skipped_untagged": skipped_untagged, + "failed": failed, + "pending": pending, + "skipped_with_tags": raw.tagging_skipped - skipped_untagged + }), + } +} + +pub fn source_uniqueness(raw: &HealthRawData) -> HealthCheckResult { + let dup_count = raw.duplicate_sources.len() as i32; + let score = (100i32 - dup_count * 15).max(0) as u32; + let status = if dup_count == 0 { "ok" } else { "warning" }; + let pairs: Vec = raw + .duplicate_sources + .iter() + .map(|(url, ids)| { + json!({ + "source_url": url, + "atom_count": ids.len(), + "atom_ids": ids + }) + }) + .collect(); + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: dup_count > 0, + requires_review: false, + informational: false, + fix_action: Some("merge_exact_source_duplicates".to_string()), + data: json!({ + "count": dup_count, + "pairs": pairs + }), + } +} + +pub fn orphan_tags(raw: &HealthRawData) -> HealthCheckResult { + let count = raw.orphan_tags.len() as i32; + let score = (100i32 - count * 2).max(0) as u32; + let status = if count == 0 { "ok" } else { "warning" }; + let tag_list: Vec = raw + .orphan_tags + .iter() + .map(|(id, name)| json!({ "id": id, "name": name })) + .collect(); + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: count > 0, + requires_review: false, + informational: false, + fix_action: Some("delete_orphan_tags".to_string()), + data: json!({ "count": count, "tags": tag_list }), + } +} + +pub fn semantic_graph_freshness( + raw: &HealthRawData, + thresholds: &HealthThresholds, +) -> HealthCheckResult { + let atoms_since = raw.atoms_since_edge_rebuild; + let score = (100i32 - atoms_since * 2).max(0) as u32; + let status = if atoms_since == 0 { + "ok" + } else if atoms_since <= thresholds.semantic_graph_freshness_warning { + "warning" + } else { + "error" + }; + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: atoms_since > 0, + requires_review: false, + informational: false, + fix_action: Some("rebuild_semantic_edges".to_string()), + data: json!({ + "last_rebuilt": raw.newest_edge_created_at, + "newest_atom": raw.newest_atom_updated_at, + "atoms_since_rebuild": atoms_since + }), + } +} + +pub fn wiki_coverage(raw: &HealthRawData) -> HealthCheckResult { + let eligible = raw.wiki_eligible_count; + let with_wiki = raw.wiki_present_count; + let stale = raw.wiki_stale_count; + let without_wiki = eligible - with_wiki; + + let score = if eligible == 0 { + 100 + } else { + let coverage_pct = (with_wiki as f64 / eligible as f64) * 70.0; + let freshness_pct = if with_wiki == 0 { + 30.0 + } else { + let non_stale = (with_wiki - stale).max(0); + (non_stale as f64 / with_wiki as f64) * 30.0 + }; + (coverage_pct + freshness_pct).round() as u32 + }; + let status = if score >= 90 { + "ok" + } else if score >= 60 { + "warning" + } else { + "error" + }; + + let gaps: Vec = raw + .wiki_gaps + .iter() + .map(|g: &WikiGap| json!({ "tag_id": g.tag_id, "tag_name": g.tag_name, "atom_count": g.atom_count })) + .collect(); + let stale_list: Vec = raw + .wiki_stale + .iter() + .map(|s: &WikiStaleEntry| { + json!({ "tag_id": s.tag_id, "tag_name": s.tag_name, "new_atoms": s.new_atom_count }) + }) + .collect(); + + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: without_wiki > 0 || stale > 0, + requires_review: false, + informational: true, + fix_action: Some("generate_missing_wikis".to_string()), + data: json!({ + "eligible_tags": eligible, + "with_wiki": with_wiki, + "without_wiki": without_wiki, + "stale_wikis": stale, + "gaps": gaps, + "stale": stale_list + }), + } +} + +pub fn content_quality(raw: &HealthRawData) -> HealthCheckResult { + let mut issues = 0; + if !raw.very_short_atoms.is_empty() { + issues += 1; + } + if !raw.very_long_atoms.is_empty() { + issues += 1; + } + if !raw.no_heading_atoms.is_empty() { + issues += 1; + } + if !raw.no_source_atoms.is_empty() { + issues += 1; + } + + let score = (85u32).saturating_sub(issues * 5); + let status = if issues == 0 { "ok" } else { "info" }; + + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: !raw.very_short_atoms.is_empty() + || !raw.very_long_atoms.is_empty() + || !raw.no_heading_atoms.is_empty(), + requires_review: !raw.no_source_atoms.is_empty(), + informational: true, + fix_action: None, + data: json!({ + "total": raw.total_atoms, + "issues": { + "very_short": { + "count": raw.very_short_atoms.len(), + "auto_fixable": true, + "atoms": raw.very_short_atoms + }, + "very_long": { + "count": raw.very_long_atoms.len(), + "auto_fixable": true, + "atoms": raw.very_long_atoms + }, + "no_headings": { + "count": raw.no_heading_atoms.len(), + "auto_fixable": true, + "atoms": raw.no_heading_atoms + }, + "no_source": { + "count": raw.no_source_atoms.len(), + "auto_fixable": false, + "atoms": raw.no_source_atoms.iter().map(|a| json!({ + "id": a.id, + "title": a.title, + "created_at": a.created_at + })).collect::>() + } + } + }), + } +} + +pub fn tag_health( + raw: &HealthRawData, + thresholds: &HealthThresholds, +) -> HealthCheckResult { + let single = raw.single_atom_tags; + let rootless = raw.rootless_tags; + let similar = raw.similar_name_pair_count; + let single_thresh = thresholds.tag_health_single_atom_threshold; + + let issues = (single > single_thresh) as u32 + (rootless > 0) as u32 + (similar > 0) as u32; + let score = (100u32).saturating_sub(issues * 10); + let status = if issues == 0 { "ok" } else { "warning" }; + + // auto_fixable only when there are auto-tag single-atom tags above the threshold. + // fix_tag_health_single_atom targets is_autotag=true entries only. + let autotag_single = raw.single_atom_tag_list.iter().filter(|t| t.is_autotag).count() as i32; + let auto_fixable = autotag_single > single_thresh; + + let single_atom_truncated = raw.single_atom_tags > raw.single_atom_tag_list.len() as i32; + + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable, + requires_review: rootless > 0 || similar > 0 || single > single_thresh, + informational: false, + fix_action: None, + data: json!({ + "single_atom_tags": single, + "rootless_tags": rootless, + "similar_name_pairs": similar, + "rootless_tag_list": raw.rootless_tag_list.iter().map(|t| json!({ + "id": t.id, + "name": t.name, + "atom_count": t.atom_count + })).collect::>(), + "similar_name_pair_list": raw.similar_name_pairs_list.iter().map(|(a_id, a_name, b_id, b_name)| json!({ + "pair_id": format!("{}__{}", a_id, b_id), + "a_id": a_id, "a_name": a_name, + "b_id": b_id, "b_name": b_name, + })).collect::>(), + "single_atom_tag_list": raw.single_atom_tag_list.iter().map(|t| json!({ + "id": t.id, + "name": t.name, + "is_autotag": t.is_autotag, + })).collect::>(), + "single_atom_tag_list_truncated": single_atom_truncated, + }), + } +} + +pub fn content_overlap(raw: &HealthRawData) -> HealthCheckResult { + let overlaps = raw.duplicate_pairs.len() as i32; + let exact_dupes = raw.duplicate_sources.len() as i32; + let template_clones = raw.boilerplate_affected_atoms.len() as i32; + + // Score: deduct for unreviewed cross-source overlaps. + // Exact dupes handled by source_uniqueness, template clones by boilerplate_pollution. + let score = (100i32 - overlaps * 8).max(0) as u32; + let status = if overlaps == 0 { "ok" } else { "warning" }; + + let pairs: Vec = raw + .duplicate_pairs + .iter() + .map(|p: &DuplicatePair| { + json!({ + "pair_id": p.pair_id, + "atom_a": { "id": p.atom_a_id, "title": p.atom_a_title, "source": p.atom_a_source, "created_at": p.atom_a_created_at }, + "atom_b": { "id": p.atom_b_id, "title": p.atom_b_title, "source": p.atom_b_source, "created_at": p.atom_b_created_at }, + "similarity": p.similarity, + "shared_tag_count": p.shared_tag_count, + "available_actions": ["merge_with_llm", "keep_both", "delete_older", "mark_complementary"] + }) + }) + .collect(); + + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: false, + requires_review: overlaps > 0, + informational: false, + fix_action: None, + data: json!({ + "exact_duplicates": exact_dupes, + "template_clones": template_clones, + "cross_source_overlaps": overlaps, + "count": overlaps, + "pairs": pairs + }), + } +} + +pub fn contradiction_detection(raw: &HealthRawData) -> HealthCheckResult { + let pair_count = raw.contradiction_pairs.len() as i32; + let score = (100i32 - pair_count * 8).max(0) as u32; + let status = if pair_count == 0 { "ok" } else { "warning" }; + + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: false, + requires_review: pair_count > 0, + informational: true, + fix_action: None, + data: json!({ + "pairs_checked": raw.contradiction_pairs_checked, + "potential_contradictions": pair_count, + "pairs": raw.contradiction_pairs.iter().map(|p| json!({ + "pair_id": p.pair_id, + "atom_a": { "id": p.atom_a.id, "title": p.atom_a.title, "source": p.atom_a.source, "created_at": p.atom_a.created_at }, + "atom_b": { "id": p.atom_b.id, "title": p.atom_b.title, "source": p.atom_b.source, "created_at": p.atom_b.created_at }, + "similarity": p.similarity, + "shared_tag_count": p.shared_tag_count + })).collect::>() + }), + } +} + + +/// Diagnostic check: atoms whose embeddings are dominated by shared boilerplate. +/// +/// An atom is flagged when it has >= thresholds.boilerplate_min_clones semantic edges +/// at similarity >= thresholds.boilerplate_similarity. +/// That means the vector space can't distinguish it from multiple other atoms, +/// so semantic search will return the wrong runbook / article for those queries. +/// +/// This check does NOT affect the overall score (not in CHECK_WEIGHTS). +/// Fix: re-chunk excluding boilerplate sections, or re-embed with a unique-content prefix. +pub fn boilerplate_pollution( + raw: &HealthRawData, + thresholds: &HealthThresholds, +) -> HealthCheckResult { + let count = raw.boilerplate_affected_atoms.len() as u32; + let status = if count == 0 { "ok" } else { "warning" }; + // Score reflects detection health: 0 affected = 100, degrades ~3/atom, floor at 50. + // Does NOT contribute to overall KB score (excluded from CHECK_WEIGHTS), + // but row-level score now matches user expectation: if there are issues, + // the score should not be 100. + let score: u32 = 100u32.saturating_sub(count.saturating_mul(3).min(50)).max(50); + + HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: false, + requires_review: count > 0, + informational: true, + fix_action: None, + data: json!({ + "count": count, + "affected_atoms": raw.boilerplate_affected_atoms.iter().map(|a| json!({ + "id": a.id, + "title": a.title, + "clone_count": a.clone_count + })).collect::>(), + "threshold_similarity": thresholds.boilerplate_similarity, + "threshold_min_clones": thresholds.boilerplate_min_clones, + "description": format!( + "Atoms with >= {} near-identical edges (similarity >= {:.2}). \ + Shared boilerplate text drowns out unique content in their \ + embeddings. Semantic search cannot reliably distinguish \ + these atoms from each other.", + thresholds.boilerplate_min_clones, + thresholds.boilerplate_similarity, + ) + }), + } +} \ No newline at end of file diff --git a/crates/atomic-core/src/health/compute.rs b/crates/atomic-core/src/health/compute.rs new file mode 100644 index 00000000..2438b7a0 --- /dev/null +++ b/crates/atomic-core/src/health/compute.rs @@ -0,0 +1,555 @@ +//! Health computation pipeline. +//! +//! Responsible for: +//! - `compute_health` — run every enabled check, aggregate score, +//! apply dismissals, persist the snapshot. +//! - `compute_single_check` — run a single check in isolation. +//! - `compute_link_check` — async broken-internal-links check. +//! - `apply_dismissals` — filter dismissed entries out of check data. +//! +//! The fix orchestrator lives in `run_fix.rs`. Shared types live in +//! `types.rs`, scoring in `score.rs`. + +use super::score::aggregate_score; +use super::types::{HealthCheckOverride, HealthCheckResult, HealthReport, HealthStatus}; +use super::{checks, custom, link_resolution, pair_key}; +use crate::error::AtomicCoreError; +use crate::AtomicCore; +use std::collections::HashMap; + +/// Run all health checks and return a complete `HealthReport`. +/// +/// Completes in < 2s for databases with up to ~1,000 atoms. Contradiction +/// detection is a stub (no LLM call) so it won't time out on large graphs. +pub async fn compute_health(core: &AtomicCore) -> Result { + let computed_at = chrono::Utc::now().to_rfc3339(); + + // Load per-DB health config (fall back to defaults on error — health + // should never fail because of a corrupt config). + let config = core.get_health_config().await.unwrap_or_default(); + + // Fetch all raw data in a single spawn_blocking pass + let raw = core.storage().health_check_data_sync(config.thresholds.clone()).await?; + + // Run all synchronous checks + let mut checks = checks::run_all(&raw, &config.thresholds); + + // Run async link-resolution check (needs DB lookups per candidate atom) + match compute_link_check(core).await { + Ok(link_check) => { + checks.insert("broken_internal_links".to_string(), link_check); + } + Err(e) => { + tracing::warn!(error = %e, "broken_internal_links check failed"); + } + } + + // Apply persistent dismissals to review-producing checks + let reviewable = ["content_overlap", "contradiction_detection", "boilerplate_pollution", "content_quality", "tag_health", "broken_internal_links"]; + for check_name in reviewable { + let dismissed_pairs = core.storage().list_dismissed_keys_sync(check_name).await.unwrap_or_default(); + if dismissed_pairs.is_empty() { + continue; + } + let dismissed: std::collections::HashSet = + dismissed_pairs.into_iter().map(|(k, _)| k).collect(); + if let Some(result) = checks.get_mut(check_name) { + apply_dismissals(check_name, result, &dismissed); + } + } + + // Run user-defined custom checks (per-DB, synchronous, SqliteStorage-only). + // Failures are logged, not propagated — a bad custom rule must not take down + // the built-in health report. + let mut effective_config = config.clone(); + if let Some(sqlite) = core.storage.as_sqlite() { + match core.get_custom_health_checks().await { + Ok(custom_checks) if !custom_checks.is_empty() => { + match custom::run_all(sqlite, &custom_checks) { + Ok(results) => { + for (key, result, check) in results { + // Feed each custom check's weight into the aggregate-score + // config so zero-weight rules stay informational and + // positive-weight rules contribute at the requested weight. + effective_config.overrides.insert( + key.clone(), + HealthCheckOverride { + enabled: check.enabled, + weight: Some(check.weight), + }, + ); + checks.insert(key, result); + } + } + Err(e) => tracing::warn!(error = %e, "custom health checks failed"), + } + } + Ok(_) => {} + Err(e) => tracing::warn!(error = %e, "load custom health checks failed"), + } + } + + // Drop disabled checks entirely (config-driven). + checks.retain(|name, _| { + effective_config + .overrides + .get(name) + .map(|o| o.enabled) + .unwrap_or(true) + }); + + // Aggregate score + let overall_score = aggregate_score(&checks, Some(&effective_config)); + let overall_status = HealthStatus::from_score(overall_score).as_str().to_string(); + + // Count auto-fixable vs requires-review + let auto_fixable = checks + .values() + .filter(|c| c.auto_fixable && c.status != "ok") + .count() as i32; + let requires_review = checks + .values() + .filter(|c| c.requires_review && c.status != "ok") + .count() as i32; + + let atom_count = raw.total_atoms; + + // Fetch previous report for trending (before storing the current one) + let (previous_score, previous_check_scores) = + match core.get_latest_health_report().await { + Ok(Some(prev)) => { + let check_scores: HashMap = + prev.checks.iter().map(|(k, v)| (k.clone(), v.score)).collect(); + (Some(prev.overall_score), Some(check_scores)) + } + _ => (None, None), + }; + + let report = HealthReport { + overall_score, + overall_status, + computed_at: computed_at.clone(), + atom_count, + checks, + auto_fixable, + requires_review, + previous_score, + previous_check_scores, + }; + + // Persist for trending (fire-and-forget; ignore errors) + let _ = store_report(core, &report).await; + + Ok(report) +} + +/// Compute a single named health check in isolation. +/// +/// Accepts any check name from the standard set. For the async +/// `broken_internal_links` check, runs `compute_link_check` directly. +/// Returns `(check_name, HealthCheckResult)` so callers can update +/// a cached `HealthReport` in place. +pub async fn compute_single_check( + core: &AtomicCore, + check_name: &str, +) -> Result<(String, HealthCheckResult), AtomicCoreError> { + let mut result = match check_name { + // Async check — requires per-atom DB lookups + "broken_internal_links" => compute_link_check(core).await?, + // Sync checks — fetch raw data once, dispatch to the appropriate fn + "embedding_coverage" + | "tagging_coverage" + | "content_overlap" + | "source_uniqueness" + | "wiki_coverage" + | "semantic_graph_freshness" + | "content_quality" + | "orphan_tags" + | "tag_health" + | "contradiction_detection" + | "boilerplate_pollution" => { + let config = core.get_health_config().await.unwrap_or_default(); + let raw = core + .storage() + .health_check_data_sync(config.thresholds.clone()) + .await?; + let t = &config.thresholds; + match check_name { + "embedding_coverage" => checks::embedding_coverage(&raw), + "tagging_coverage" => checks::tagging_coverage(&raw), + "content_overlap" => checks::content_overlap(&raw), + "source_uniqueness" => checks::source_uniqueness(&raw), + "wiki_coverage" => checks::wiki_coverage(&raw), + "semantic_graph_freshness" => checks::semantic_graph_freshness(&raw, t), + "content_quality" => checks::content_quality(&raw), + "orphan_tags" => checks::orphan_tags(&raw), + "tag_health" => checks::tag_health(&raw, t), + "contradiction_detection" => checks::contradiction_detection(&raw), + "boilerplate_pollution" => checks::boilerplate_pollution(&raw, t), + _ => unreachable!(), + } + } + _ => { + return Err(AtomicCoreError::Validation(format!( + "Unknown health check: {check_name}" + ))) + } + }; + // Apply persistent dismissals + if matches!(check_name, "content_overlap" | "contradiction_detection" | "boilerplate_pollution" | "content_quality" | "tag_health") { + let dismissed_pairs = core.storage().list_dismissed_keys_sync(check_name).await.unwrap_or_default(); + if !dismissed_pairs.is_empty() { + let dismissed: std::collections::HashSet = + dismissed_pairs.into_iter().map(|(k, _)| k).collect(); + apply_dismissals(check_name, &mut result, &dismissed); + } + } + Ok((check_name.to_string(), result)) +} + +/// Store a completed report in the health_reports table. +async fn store_report( + core: &AtomicCore, + report: &HealthReport, +) -> Result<(), AtomicCoreError> { + use crate::health::audit::StoredHealthReport; + let check_scores: HashMap = report + .checks + .iter() + .map(|(k, v)| (k.clone(), v.score)) + .collect(); + let stored = StoredHealthReport { + id: uuid::Uuid::new_v4().to_string(), + computed_at: report.computed_at.clone(), + overall_score: report.overall_score, + check_scores: serde_json::to_string(&check_scores).unwrap_or_default(), + atom_count: report.atom_count, + auto_fixes_applied: 0, + report_json: serde_json::to_string(report).unwrap_or_default(), + }; + core.storage().store_health_report_sync(&stored).await +} + +/// Per-link detail within a broken atom. +#[derive(serde::Serialize, Clone)] +struct BrokenLinkDetail { + raw: String, + target: String, + kind: String, +} + +/// Atom-level summary of broken links. +#[derive(serde::Serialize, Clone)] +struct BrokenLinkItem { + atom_id: String, + atom_title: String, + links: Vec, +} + +pub(crate) fn title_preview(content: &str) -> String { + for line in content.lines() { + let clean = line.trim().trim_start_matches('#').trim(); + if !clean.is_empty() { + return if clean.len() > 80 { + format!("{}\u{2026}", &clean[..80]) + } else { + clean.to_string() + }; + } + } + String::new() +} + +async fn compute_link_check(core: &AtomicCore) -> Result { + use link_resolution::{extract_internal_links, vault_root}; + + let candidates = core.storage().get_link_candidate_atoms_sync().await?; + if candidates.is_empty() { + return Ok(HealthCheckResult { + status: "ok".to_string(), + score: 100, + auto_fixable: false, + requires_review: false, + informational: false, + fix_action: None, + data: serde_json::json!({ "broken_count": 0, "affected_atoms": 0, "broken_link_list": [], "broken_link_list_truncated": false }), + }); + } + + let mut broken_count = 0i32; + let mut affected_atoms = 0i32; + let mut broken_items: Vec = Vec::new(); + + for (atom_id, content, source_url) in &candidates { + let links = extract_internal_links(content, source_url.as_deref()); + if links.is_empty() { + continue; + } + + let candidate_urls: Vec = links + .iter() + .flat_map(|l| l.candidate_source_urls.iter().cloned()) + .collect(); + + let url_map = core + .storage() + .find_atoms_by_source_urls_sync(candidate_urls) + .await + .unwrap_or_default(); + + let vault_pfx = source_url + .as_deref() + .and_then(vault_root) + .map(|s| s.to_string()); + + let mut atom_broken = false; + let mut atom_link_details: Vec = Vec::new(); + for link in &links { + let resolved_by_url = link + .candidate_source_urls + .iter() + .any(|u| url_map.contains_key(u)); + + if resolved_by_url { + continue; + } + + let resolved_by_name = if let Some(pfx) = &vault_pfx { + let name = link + .wikilink_name + .clone() + .or_else(|| link_resolution::markdown_stem_fallback(&link.href)); + if let Some(name) = name { + core.storage() + .find_atom_by_wikilink_name_sync(name, pfx.clone()) + .await + .unwrap_or(None) + .is_some() + } else { + false + } + } else { + false + }; + + if !resolved_by_name { + broken_count += 1; + atom_broken = true; + let kind = if link.wikilink_name.is_some() { + "wikilink".to_string() + } else { + "markdown".to_string() + }; + let target = link + .wikilink_name + .as_deref() + .unwrap_or(&link.href) + .to_string(); + atom_link_details.push(BrokenLinkDetail { + raw: link.original.clone(), + target, + kind, + }); + } + } + if atom_broken { + affected_atoms += 1; + if broken_items.len() < 50 { + broken_items.push(BrokenLinkItem { + atom_id: atom_id.clone(), + atom_title: title_preview(content), + links: atom_link_details, + }); + } + } + } + + let total_atoms = core + .count_atoms() + .await + .unwrap_or(candidates.len() as i32); + let clean_atoms = (total_atoms - affected_atoms).max(0); + let score = if total_atoms == 0 { + 100 + } else { + (clean_atoms as f64 / total_atoms as f64 * 100.0) as u32 + }; + let status = if broken_count == 0 { "ok" } else { "warning" }; + let truncated = affected_atoms > 50; + + Ok(HealthCheckResult { + status: status.to_string(), + score, + auto_fixable: broken_count > 0, + requires_review: broken_count > 0, + informational: false, + fix_action: Some("resolve_internal_links".to_string()), + data: serde_json::json!({ + "broken_count": broken_count, + "affected_atoms": affected_atoms, + "broken_link_list": broken_items, + "broken_link_list_truncated": truncated, + }), + }) +} + +/// Filter a check result's JSON data to exclude dismissed entries. +pub(crate) fn apply_dismissals( + check_name: &str, + result: &mut HealthCheckResult, + dismissed_keys: &std::collections::HashSet, +) { + if dismissed_keys.is_empty() { + return; + } + + use serde_json::Value; + let data = &mut result.data; + + match check_name { + "content_overlap" => { + if let Some(pairs) = data.get_mut("pairs").and_then(Value::as_array_mut) { + pairs.retain(|p| { + let a = p.get("atom_a").and_then(|o| o.get("id")).and_then(Value::as_str).unwrap_or(""); + let b = p.get("atom_b").and_then(|o| o.get("id")).and_then(Value::as_str).unwrap_or(""); + !dismissed_keys.contains(&pair_key(a, b)) + }); + let new_count = pairs.len(); + if let Some(c) = data.get_mut("count") { + *c = Value::from(new_count); + } + if let Some(c) = data.get_mut("cross_source_overlaps") { + *c = Value::from(new_count); + } + // Keep score in sync with the post-dismissal pair count. Mirrors + // `checks::content_overlap`: `100 - overlaps * 8`, floored at 0. + result.score = (100i32 - (new_count as i32) * 8).max(0) as u32; + if new_count == 0 { + result.status = "ok".to_string(); + result.requires_review = false; + } + } + } + "contradiction_detection" => { + if let Some(pairs) = data.get_mut("pairs").and_then(Value::as_array_mut) { + pairs.retain(|p| { + let a = p.get("atom_a").and_then(|o| o.get("id")).and_then(Value::as_str).unwrap_or(""); + let b = p.get("atom_b").and_then(|o| o.get("id")).and_then(Value::as_str).unwrap_or(""); + !dismissed_keys.contains(&pair_key(a, b)) + }); + let new_count = pairs.len(); + if let Some(c) = data.get_mut("potential_contradictions") { + *c = Value::from(new_count); + } + // Keep score in sync with the post-dismissal pair count. Mirrors + // `checks::contradiction_detection`: `100 - pair_count * 8`, + // floored at 0. Without this, dismissing the last contradiction + // leaves the row showing score=4 (from the frozen 12-pair + // baseline) next to "0 atom pairs" — a UI contradiction worse + // than the one we claim to have caught. + result.score = (100i32 - (new_count as i32) * 8).max(0) as u32; + if new_count == 0 { + result.status = "ok".to_string(); + result.requires_review = false; + } + } + } + "boilerplate_pollution" => { + if let Some(arr) = data.get_mut("affected_atoms").and_then(Value::as_array_mut) { + arr.retain(|entry| { + let id = entry.get("id").and_then(Value::as_str).unwrap_or(""); + !dismissed_keys.contains(id) + }); + let new_count = arr.len() as u32; + if let Some(c) = data.get_mut("count") { + *c = Value::from(new_count); + } + // Mirror `checks::boilerplate_pollution`: 100 - 3*n, floored at 50. + result.score = 100u32 + .saturating_sub(new_count.saturating_mul(3).min(50)) + .max(50); + if new_count == 0 { + result.status = "ok".to_string(); + result.requires_review = false; + } + } + } + "content_quality" => { + if let Some(ns) = data + .pointer_mut("/issues/no_source/atoms") + .and_then(Value::as_array_mut) + { + ns.retain(|entry| { + let id = entry.get("id").and_then(Value::as_str).unwrap_or(""); + !dismissed_keys.contains(id) + }); + let new_count = ns.len(); + if let Some(c) = data.pointer_mut("/issues/no_source/count") { + *c = Value::from(new_count); + } + if new_count == 0 { + result.requires_review = false; + } + } + } + "tag_health" => { + if let Some(arr) = data.get_mut("rootless_tag_list").and_then(Value::as_array_mut) { + arr.retain(|t| { + let id = t.get("id").and_then(Value::as_str).unwrap_or(""); + !dismissed_keys.contains(id) + }); + let new_count = arr.len(); + if let Some(c) = data.get_mut("rootless_tags") { + *c = Value::from(new_count); + } + if new_count == 0 { + result.requires_review = false; + } + } + if let Some(arr) = data.get_mut("similar_name_pair_list").and_then(Value::as_array_mut) { + arr.retain(|p| { + let pair_id = p.get("pair_id").and_then(Value::as_str).unwrap_or(""); + !dismissed_keys.contains(pair_id) + }); + let new_similar = arr.len(); + if let Some(c) = data.get_mut("similar_name_pairs") { + *c = Value::from(new_similar); + } + } + if let Some(arr) = data.get_mut("single_atom_tag_list").and_then(Value::as_array_mut) { + arr.retain(|t| { + let id = t.get("id").and_then(Value::as_str).unwrap_or(""); + !dismissed_keys.contains(id) + }); + let new_count = arr.len() as i32; + if let Some(c) = data.get_mut("single_atom_tags") { + *c = Value::from(new_count); + } + } + } + "broken_internal_links" => { + if let Some(arr) = data.get_mut("broken_link_list").and_then(Value::as_array_mut) { + arr.retain(|entry| { + let id = entry.get("atom_id").and_then(Value::as_str).unwrap_or(""); + !dismissed_keys.contains(id) + }); + let new_count = arr.len() as i64; + // Recompute broken_count as sum of remaining link counts + let new_broken: i64 = arr.iter().map(|entry| { + entry.get("links").and_then(|l| l.as_array()).map_or(0, |v| v.len() as i64) + }).sum(); + if let Some(c) = data.get_mut("affected_atoms") { + *c = Value::from(new_count); + } + if let Some(c) = data.get_mut("broken_count") { + *c = Value::from(new_broken); + } + if new_count == 0 { + result.requires_review = false; + result.auto_fixable = false; + } + } + } + _ => {} + } +} diff --git a/crates/atomic-core/src/health/custom/helpers.rs b/crates/atomic-core/src/health/custom/helpers.rs new file mode 100644 index 00000000..1e56dac7 --- /dev/null +++ b/crates/atomic-core/src/health/custom/helpers.rs @@ -0,0 +1,158 @@ +//! Shared helpers used by every rule evaluator. +//! +//! Kept separate so new rules can lift any of these without reaching into +//! sibling evaluators. Nothing here is public outside `health::custom`. + +use super::types::FlaggedAtom; +use crate::error::AtomicCoreError; +use rusqlite::params; + +/// Cap flagged-atom lists so a malformed rule can't blow up the report. +pub(super) const MAX_FLAGGED: usize = 500; + +/// Cap on sample atoms returned in the preview. UI doesn't need more. +pub(super) const PREVIEW_SAMPLE: usize = 10; + +/// Preview the first non-empty line of `content` for UI display. +pub(super) fn preview(content: &str) -> String { + let trimmed = content + .lines() + .map(|l| l.trim()) + .find(|l| !l.is_empty()) + .unwrap_or(""); + const MAX: usize = 80; + if trimmed.chars().count() > MAX { + let truncated: String = trimmed.chars().take(MAX).collect(); + format!("{truncated}\u{2026}") + } else { + trimmed.to_string() + } +} + +/// Fast, allocation-free word count. Treats any run of ASCII whitespace as a +/// separator — matches `str::split_whitespace` but avoids allocating an +/// intermediate iterator collection. +pub(super) fn word_count(s: &str) -> u32 { + s.split_whitespace().count() as u32 +} + +/// Count markdown links `[text](url)` plus wikilinks `[[...]]`. +pub(super) fn count_citations(content: &str) -> u32 { + let mut n = 0u32; + let bytes = content.as_bytes(); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'[' { + // wikilink: `[[` + if i + 1 < bytes.len() && bytes[i + 1] == b'[' { + n += 1; + i += 2; + continue; + } + // markdown link: `](` following `[...]` + if let Some(close) = content[i..].find("](") { + // must not contain a newline between [ and ]( + if !content[i..i + close].contains('\n') { + n += 1; + i += close + 2; + continue; + } + } + } + i += 1; + } + n +} + +/// Parse a URL's host. Accepts http/https/ftp/etc. Returns None for empty or +/// malformed URLs — those atoms are skipped (not flagged) since policing the +/// presence of a URL is `RequireSource`'s job. +pub(super) fn host_of(url: &str) -> Option { + let rest = url.split_once("://").map(|(_, r)| r).unwrap_or(url); + let host = rest.split('/').next()?.split('?').next()?.split('#').next()?; + if host.is_empty() { None } else { Some(host.to_lowercase()) } +} + +pub(super) fn host_matches(host: &str, domains: &[String]) -> bool { + domains.iter().any(|d| { + let d = d.trim().trim_start_matches("https://").trim_start_matches("http://").to_lowercase(); + if d.is_empty() { return false; } + host == d || host.ends_with(&format!(".{d}")) + }) +} + +/// Load candidate atoms. When `tag_filter` is `Some`, restricts to atoms +/// tagged with that tag id; otherwise all atoms. Standardizes the select- +/// and-iterate boilerplate every evaluator below shares. +pub(super) fn for_each_atom( + conn: &rusqlite::Connection, + tag_filter: Option<&str>, + cols: &str, + mut consume: F, +) -> Result +where + F: FnMut(&rusqlite::Row<'_>) -> rusqlite::Result, +{ + let mut total = 0i32; + match tag_filter { + Some(tag) => { + let sql = format!( + "SELECT DISTINCT {cols} FROM atoms a \ + JOIN atom_tags at ON at.atom_id = a.id \ + WHERE at.tag_id = ?1" + ); + let mut stmt = conn.prepare(&sql)?; + let mut rows = stmt.query(params![tag])?; + while let Some(row) = rows.next()? { + if consume(row)? { total += 1; } + } + } + None => { + let sql = format!("SELECT {cols} FROM atoms a"); + let mut stmt = conn.prepare(&sql)?; + let mut rows = stmt.query([])?; + while let Some(row) = rows.next()? { + if consume(row)? { total += 1; } + } + } + } + Ok(total) +} + +pub(super) fn push_flag(flagged: &mut Vec, id: String, content: &str) { + if flagged.len() < MAX_FLAGGED { + flagged.push(FlaggedAtom { + title_preview: preview(content), + id, + }); + } +} + +/// Load `(id, content)` pairs for the candidate atom set. Wrapped as a helper +/// so evaluators that need the full candidate list (not just a streaming +/// callback) don't have to juggle statement lifetimes across match arms. +pub(super) fn load_candidates_id_content( + conn: &rusqlite::Connection, + tag_filter: Option<&str>, +) -> Result, AtomicCoreError> { + match tag_filter { + Some(tag) => { + let mut stmt = conn.prepare( + "SELECT DISTINCT a.id, a.content FROM atoms a \ + JOIN atom_tags at ON at.atom_id = a.id \ + WHERE at.tag_id = ?1", + )?; + let rows = stmt.query_map(params![tag], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) + })?; + Ok(rows.collect::>()?) + } + None => { + let mut stmt = conn.prepare("SELECT id, content FROM atoms")?; + let rows = stmt.query_map([], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) + })?; + Ok(rows.collect::>()?) + } + } +} diff --git a/crates/atomic-core/src/health/custom/mod.rs b/crates/atomic-core/src/health/custom/mod.rs new file mode 100644 index 00000000..5c43e977 --- /dev/null +++ b/crates/atomic-core/src/health/custom/mod.rs @@ -0,0 +1,183 @@ +//! User-defined custom health checks. +//! +//! Atomic ships with opinionated built-in checks, but different knowledge-base +//! workflows have different conventions. Custom checks let users declare rules +//! that reflect *their* conventions ("every atom tagged `paper` must have a +//! source URL", "flag atoms containing TODO markers", etc.) without shipping +//! arbitrary SQL or JavaScript from the UI. +//! +//! # Safety +//! +//! Rules are structured, not free-form: each [`CustomRule`] variant is a +//! hard-coded predicate the Rust evaluator applies. The UI only controls +//! parameters (tag ids, regex patterns) — never the query shape. This avoids +//! the SQL-injection and resource-exhaustion risks that come with arbitrary +//! user-defined SQL, while still covering the workflows users actually ask +//! for. +//! +//! # Storage +//! +//! The full list is persisted per-DB as JSON under the `custom_health_checks` +//! setting key (NOT in registry — see AGENTS.md § Multi-DB Gotchas). Each DB +//! has its own independent rule set. +//! +//! # Layout +//! - [`types`] — `CustomRule`, `DomainMatchMode`, `CustomCheck`, `PreviewResult` +//! - [`helpers`] — shared preview/word-count/URL-host/for-each-atom helpers +//! - [`rules`] — one `eval_*` fn per `CustomRule` variant + +mod helpers; +mod rules; +pub mod types; + +use super::HealthCheckResult; +use crate::error::AtomicCoreError; +use crate::storage::sqlite::SqliteStorage; +use helpers::PREVIEW_SAMPLE; +use serde_json::json; + +pub use types::{CustomCheck, CustomRule, DomainMatchMode, PreviewResult}; +use types::RawOutcome; + +/// Pre-fixed key used to identify custom checks inside the `checks` map of a +/// `HealthReport`. Prevents collisions with built-in check names. +pub const CUSTOM_CHECK_PREFIX: &str = "custom."; + +/// Compose the map key a custom check appears under in the report. +pub fn result_key(check_id: &str) -> String { + format!("{CUSTOM_CHECK_PREFIX}{check_id}") +} + +/// Evaluate all enabled custom checks against the database and return a +/// `(map_key, HealthCheckResult)` entry per check. +/// +/// Runs on the caller's tokio runtime. Each rule executes one or two bounded +/// queries — no N+1, no full-table scans beyond what the built-in checks +/// already do. +pub fn run_all( + storage: &SqliteStorage, + checks: &[CustomCheck], +) -> Result, AtomicCoreError> { + let conn = storage + .db + .conn + .lock() + .map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let mut out = Vec::with_capacity(checks.len()); + for check in checks.iter().filter(|c| c.enabled) { + let result = evaluate(&conn, &check.rule)?; + out.push((result_key(&check.id), finalize(check, result), check.clone())); + } + Ok(out) +} + +/// Evaluate a single rule against the database WITHOUT persisting it or +/// touching the report. Fails when the rule itself is malformed (e.g. +/// invalid regex); the caller surfaces the error so the user can fix it. +pub fn preview_rule( + storage: &SqliteStorage, + rule: &CustomRule, +) -> Result { + let conn = storage + .db + .conn + .lock() + .map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let raw = evaluate(&conn, rule)?; + let sample = raw + .flagged_atoms + .iter() + .take(PREVIEW_SAMPLE) + .map(|f| json!({ "id": f.id, "title_preview": f.title_preview })) + .collect(); + Ok(PreviewResult { + total_considered: raw.total_considered, + flagged_count: raw.flagged_atoms.len() as i32, + sample, + }) +} + +/// Dispatch a rule variant to its evaluator. +fn evaluate( + conn: &rusqlite::Connection, + rule: &CustomRule, +) -> Result { + match rule { + CustomRule::TagRequires { any_of, required } => { + rules::eval_tag_requires(conn, any_of, required) + } + CustomRule::RequireSource { tag_filter } => { + rules::eval_require_source(conn, tag_filter.as_deref()) + } + CustomRule::ContentRegex { pattern, invert } => { + rules::eval_content_regex(conn, pattern, *invert) + } + CustomRule::RequireTag { any_of, tag_filter } => { + rules::eval_require_tag(conn, any_of, tag_filter.as_deref()) + } + CustomRule::ContentLength { min_words, max_words, tag_filter } => { + rules::eval_content_length(conn, *min_words, *max_words, tag_filter.as_deref()) + } + CustomRule::CitationCount { min_citations, tag_filter } => { + rules::eval_citation_count(conn, *min_citations, tag_filter.as_deref()) + } + CustomRule::SourceDomainMatches { domains, mode, tag_filter } => { + rules::eval_source_domain(conn, domains, *mode, tag_filter.as_deref()) + } + CustomRule::StaleAtom { tag, max_age_days } => { + rules::eval_stale_atom(conn, tag, *max_age_days) + } + CustomRule::ForbiddenTagCombo { all_of } => { + rules::eval_forbidden_combo(conn, all_of) + } + CustomRule::MissingHeading { min_length_chars, tag_filter } => { + rules::eval_missing_heading(conn, *min_length_chars, tag_filter.as_deref()) + } + CustomRule::TagCardinality { min, max, tag_filter } => { + rules::eval_tag_cardinality(conn, *min, *max, tag_filter.as_deref()) + } + } +} + +/// Wrap the raw predicate outcome as a `HealthCheckResult` honoring the check's +/// weight semantics (0 → informational, > 0 → contributes to overall score). +fn finalize(check: &CustomCheck, raw: RawOutcome) -> HealthCheckResult { + let flagged = raw.flagged_atoms.len() as i32; + let score = if raw.total_considered == 0 { + 100 + } else { + let bad = flagged.min(raw.total_considered); + let ratio = 1.0 - (bad as f64 / raw.total_considered as f64); + (ratio * 100.0).round().clamp(0.0, 100.0) as u32 + }; + let status = if flagged == 0 { + "ok" + } else if score >= 80 { + "warning" + } else { + "error" + } + .to_string(); + let informational = check.weight <= 0.0; + + HealthCheckResult { + status, + score, + auto_fixable: false, + requires_review: flagged > 0, + informational, + fix_action: None, + data: json!({ + "custom": true, + "label": check.label, + "description": check.description, + "rule": &check.rule, + "total_considered": raw.total_considered, + "flagged_count": flagged, + "flagged": raw.flagged_atoms, + }), + } +} + +#[cfg(test)] +mod tests; diff --git a/crates/atomic-core/src/health/custom/rules.rs b/crates/atomic-core/src/health/custom/rules.rs new file mode 100644 index 00000000..dda20634 --- /dev/null +++ b/crates/atomic-core/src/health/custom/rules.rs @@ -0,0 +1,472 @@ +//! Individual rule evaluators. +//! +//! Each `eval_*` function maps to exactly one `CustomRule` variant and returns +//! a `RawOutcome` (total_considered + flagged_atoms). The dispatcher lives in +//! `mod::evaluate`. Shared helpers are in [`super::helpers`]. + +use super::helpers::{ + for_each_atom, host_matches, host_of, load_candidates_id_content, preview, push_flag, + word_count, count_citations, MAX_FLAGGED, +}; +use super::types::{DomainMatchMode, FlaggedAtom, RawOutcome}; +use crate::error::AtomicCoreError; +use rusqlite::params; +use std::collections::HashMap; + +// ==================== Tier 0: core ==================== + +pub(super) fn eval_tag_requires( + conn: &rusqlite::Connection, + any_of: &[String], + required: &[String], +) -> Result { + if any_of.is_empty() { + // Empty filter = nothing to check; treat as passing. + return Ok(RawOutcome { + total_considered: 0, + flagged_atoms: Vec::new(), + }); + } + + // Candidate atoms: those carrying at least one of `any_of`. + let placeholders_any: String = std::iter::repeat_n("?", any_of.len()) + .collect::>() + .join(","); + let candidate_sql = format!( + "SELECT DISTINCT a.id, a.content FROM atoms a \ + JOIN atom_tags at ON at.atom_id = a.id \ + WHERE at.tag_id IN ({placeholders_any})" + ); + let mut stmt = conn.prepare(&candidate_sql)?; + let rows = stmt.query_map( + rusqlite::params_from_iter(any_of.iter()), + |row| { + let id: String = row.get(0)?; + let content: String = row.get(1)?; + Ok((id, content)) + }, + )?; + let candidates: Vec<(String, String)> = rows.collect::>()?; + let total_considered = candidates.len() as i32; + + if required.is_empty() { + // No required tags to check — everything is fine by definition. + return Ok(RawOutcome { + total_considered, + flagged_atoms: Vec::new(), + }); + } + + // For each candidate, fetch the set of tag ids; flag if any `required` + // tag is missing. We do a single query to load (atom_id, tag_id) pairs + // for the candidate set, then bucket in memory — O(N) over result rows + // rather than one query per atom. + let ids: Vec<&str> = candidates.iter().map(|(id, _)| id.as_str()).collect(); + let mut tags_by_atom: HashMap> = HashMap::new(); + if !ids.is_empty() { + let placeholders: String = std::iter::repeat_n("?", ids.len()) + .collect::>() + .join(","); + let sql = format!( + "SELECT atom_id, tag_id FROM atom_tags WHERE atom_id IN ({placeholders})" + ); + let mut stmt = conn.prepare(&sql)?; + let rows = stmt.query_map(rusqlite::params_from_iter(ids.iter().copied()), |row| { + let aid: String = row.get(0)?; + let tid: String = row.get(1)?; + Ok((aid, tid)) + })?; + for row in rows { + let (aid, tid) = row?; + tags_by_atom.entry(aid).or_default().insert(tid); + } + } + + let mut flagged = Vec::new(); + for (id, content) in candidates { + let tags = tags_by_atom.get(&id); + let missing_any = required.iter().any(|r| match tags { + Some(set) => !set.contains(r), + None => true, + }); + if missing_any { + flagged.push(FlaggedAtom { + title_preview: preview(&content), + id, + }); + if flagged.len() >= MAX_FLAGGED { + break; + } + } + } + + Ok(RawOutcome { + total_considered, + flagged_atoms: flagged, + }) +} + +pub(super) fn eval_require_source( + conn: &rusqlite::Connection, + tag_filter: Option<&str>, +) -> Result { + let mut total = 0i32; + let mut flagged = Vec::new(); + + let mut consume = |id: String, content: String, source: Option| { + total += 1; + let missing = match source { + Some(s) => s.trim().is_empty(), + None => true, + }; + if missing && flagged.len() < MAX_FLAGGED { + flagged.push(FlaggedAtom { + title_preview: preview(&content), + id, + }); + } + }; + + match tag_filter { + Some(tag) => { + let mut stmt = conn.prepare( + "SELECT DISTINCT a.id, a.content, a.source_url FROM atoms a \ + JOIN atom_tags at ON at.atom_id = a.id \ + WHERE at.tag_id = ?1", + )?; + let rows = stmt.query_map(params![tag], |row| { + let id: String = row.get(0)?; + let content: String = row.get(1)?; + let source: Option = row.get(2)?; + Ok((id, content, source)) + })?; + for row in rows { + let (id, content, source) = row?; + consume(id, content, source); + } + } + None => { + let mut stmt = conn.prepare("SELECT id, content, source_url FROM atoms")?; + let rows = stmt.query_map([], |row| { + let id: String = row.get(0)?; + let content: String = row.get(1)?; + let source: Option = row.get(2)?; + Ok((id, content, source)) + })?; + for row in rows { + let (id, content, source) = row?; + consume(id, content, source); + } + } + } + + Ok(RawOutcome { + total_considered: total, + flagged_atoms: flagged, + }) +} + +pub(super) fn eval_content_regex( + conn: &rusqlite::Connection, + pattern: &str, + invert: bool, +) -> Result { + // Bound pattern size — compiled regex state grows with the input. + if pattern.len() > 512 { + return Err(AtomicCoreError::Validation( + "regex pattern too long (max 512 chars)".to_string(), + )); + } + let re = regex::RegexBuilder::new(pattern) + .size_limit(1 << 20) + .dfa_size_limit(1 << 20) + .build() + .map_err(|e| AtomicCoreError::Validation(format!("invalid regex: {e}")))?; + + let mut stmt = conn.prepare("SELECT id, content FROM atoms")?; + let rows = stmt.query_map([], |row| { + let id: String = row.get(0)?; + let content: String = row.get(1)?; + Ok((id, content)) + })?; + + let mut total = 0i32; + let mut flagged = Vec::new(); + for row in rows { + let (id, content) = row?; + total += 1; + let matches = re.is_match(&content); + let flag = if invert { !matches } else { matches }; + if flag && flagged.len() < MAX_FLAGGED { + flagged.push(FlaggedAtom { + title_preview: preview(&content), + id, + }); + } + } + Ok(RawOutcome { + total_considered: total, + flagged_atoms: flagged, + }) +} + +// ==================== Tier 1 ==================== + +pub(super) fn eval_require_tag( + conn: &rusqlite::Connection, + any_of: &[String], + tag_filter: Option<&str>, +) -> Result { + if any_of.is_empty() { + return Ok(RawOutcome { total_considered: 0, flagged_atoms: Vec::new() }); + } + + // Candidate atoms (scoped by tag_filter if present), each with their + // full tag-id set. One bulk query, bucket in memory. + let mut flagged = Vec::new(); + let required: std::collections::HashSet<&str> = any_of.iter().map(|s| s.as_str()).collect(); + + let candidates: Vec<(String, String)> = load_candidates_id_content(conn, tag_filter)?; + let total = candidates.len() as i32; + + if candidates.is_empty() { + return Ok(RawOutcome { total_considered: 0, flagged_atoms: Vec::new() }); + } + let ids: Vec<&str> = candidates.iter().map(|(id, _)| id.as_str()).collect(); + let placeholders: String = std::iter::repeat_n("?", ids.len()).collect::>().join(","); + let sql = format!("SELECT atom_id, tag_id FROM atom_tags WHERE atom_id IN ({placeholders})"); + let mut stmt = conn.prepare(&sql)?; + let rows = stmt.query_map(rusqlite::params_from_iter(ids.iter().copied()), |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) + })?; + let mut by_atom: HashMap> = HashMap::new(); + for row in rows { + let (aid, tid) = row?; + by_atom.entry(aid).or_default().push(tid); + } + + for (id, content) in candidates { + let has_any = by_atom + .get(&id) + .map(|tags| tags.iter().any(|t| required.contains(t.as_str()))) + .unwrap_or(false); + if !has_any { + push_flag(&mut flagged, id, &content); + } + } + Ok(RawOutcome { total_considered: total, flagged_atoms: flagged }) +} + +pub(super) fn eval_content_length( + conn: &rusqlite::Connection, + min_words: u32, + max_words: u32, + tag_filter: Option<&str>, +) -> Result { + let mut flagged = Vec::new(); + let total = for_each_atom(conn, tag_filter, "a.id, a.content", |row| { + let id: String = row.get(0)?; + let content: String = row.get(1)?; + let n = word_count(&content); + let too_short = min_words > 0 && n < min_words; + let too_long = max_words > 0 && n > max_words; + if too_short || too_long { + push_flag(&mut flagged, id, &content); + } + Ok(true) + })?; + Ok(RawOutcome { total_considered: total, flagged_atoms: flagged }) +} + +pub(super) fn eval_citation_count( + conn: &rusqlite::Connection, + min_citations: u32, + tag_filter: Option<&str>, +) -> Result { + let mut flagged = Vec::new(); + let total = for_each_atom(conn, tag_filter, "a.id, a.content", |row| { + let id: String = row.get(0)?; + let content: String = row.get(1)?; + if count_citations(&content) < min_citations { + push_flag(&mut flagged, id, &content); + } + Ok(true) + })?; + Ok(RawOutcome { total_considered: total, flagged_atoms: flagged }) +} + +pub(super) fn eval_source_domain( + conn: &rusqlite::Connection, + domains: &[String], + mode: DomainMatchMode, + tag_filter: Option<&str>, +) -> Result { + if domains.is_empty() { + return Ok(RawOutcome { total_considered: 0, flagged_atoms: Vec::new() }); + } + let mut flagged = Vec::new(); + let mut total = 0i32; + for_each_atom(conn, tag_filter, "a.id, a.content, a.source_url", |row| { + let id: String = row.get(0)?; + let content: String = row.get(1)?; + let source: Option = row.get(2)?; + let Some(url) = source.filter(|s| !s.trim().is_empty()) else { + return Ok(false); // skip — not in the pool this rule polices + }; + total += 1; + let host = match host_of(&url) { Some(h) => h, None => return Ok(false) }; + let on_list = host_matches(&host, domains); + let flag = match mode { + DomainMatchMode::Allowlist => !on_list, + DomainMatchMode::Blocklist => on_list, + }; + if flag { push_flag(&mut flagged, id, &content); } + Ok(false) // already counted manually + })?; + Ok(RawOutcome { total_considered: total, flagged_atoms: flagged }) +} + +pub(super) fn eval_stale_atom( + conn: &rusqlite::Connection, + tag: &str, + max_age_days: u32, +) -> Result { + // Compute the RFC3339 cutoff on the Rust side so SQLite string comparison + // (lexicographic over RFC3339) gives us the right answer. + let cutoff = chrono::Utc::now() - chrono::Duration::days(max_age_days as i64); + let cutoff_str = cutoff.to_rfc3339(); + let mut stmt = conn.prepare( + "SELECT a.id, a.content FROM atoms a \ + JOIN atom_tags at ON at.atom_id = a.id \ + WHERE at.tag_id = ?1", + )?; + let mut flagged = Vec::new(); + let mut total = 0i32; + let rows = stmt.query_map(params![tag], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) + })?; + // We fetch content + id, then check updated_at via a second single-stmt + // per atom. SQLite handles this trivially for the expected O(tagged-atoms) + // sizes; could be inlined with a JOIN on atoms for perf if needed. + let mut stale_stmt = conn.prepare( + "SELECT COALESCE(updated_at, created_at) FROM atoms WHERE id = ?1", + )?; + for row in rows { + let (id, content) = row?; + total += 1; + let ts: String = stale_stmt.query_row(params![&id], |r| r.get(0))?; + if ts < cutoff_str { + push_flag(&mut flagged, id, &content); + } + } + Ok(RawOutcome { total_considered: total, flagged_atoms: flagged }) +} + +// ==================== Tier 2 ==================== + +pub(super) fn eval_forbidden_combo( + conn: &rusqlite::Connection, + all_of: &[String], +) -> Result { + if all_of.len() < 2 { + return Ok(RawOutcome { total_considered: 0, flagged_atoms: Vec::new() }); + } + // Every atom is a candidate. Count atoms that carry every required tag. + let placeholders: String = std::iter::repeat_n("?", all_of.len()).collect::>().join(","); + let sql = format!( + "SELECT atom_id, COUNT(DISTINCT tag_id) as matched \ + FROM atom_tags \ + WHERE tag_id IN ({placeholders}) \ + GROUP BY atom_id \ + HAVING matched = ?\ + ORDER BY atom_id" + ); + // Collect atom ids that have ALL required tags. + let mut params_vec: Vec<&dyn rusqlite::ToSql> = all_of.iter().map(|s| s as &dyn rusqlite::ToSql).collect(); + let n = all_of.len() as i64; + params_vec.push(&n); + let mut stmt = conn.prepare(&sql)?; + let rows = stmt.query_map(params_vec.as_slice(), |row| row.get::<_, String>(0))?; + let flagged_ids: Vec = rows.collect::>()?; + + // Total considered = atoms that carry any of the tags (the superset we're policing). + let total = { + let sql = format!( + "SELECT COUNT(DISTINCT atom_id) FROM atom_tags WHERE tag_id IN ({placeholders})" + ); + let mut stmt = conn.prepare(&sql)?; + stmt.query_row(rusqlite::params_from_iter(all_of.iter()), |row| row.get::<_, i64>(0)) + .unwrap_or(0) as i32 + }; + + let mut flagged = Vec::new(); + if !flagged_ids.is_empty() { + let placeholders: String = std::iter::repeat_n("?", flagged_ids.len()).collect::>().join(","); + let sql = format!("SELECT id, content FROM atoms WHERE id IN ({placeholders})"); + let mut stmt = conn.prepare(&sql)?; + let rows = stmt.query_map(rusqlite::params_from_iter(flagged_ids.iter()), |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) + })?; + for row in rows { + let (id, content) = row?; + push_flag(&mut flagged, id, &content); + } + } + Ok(RawOutcome { total_considered: total, flagged_atoms: flagged }) +} + +pub(super) fn eval_missing_heading( + conn: &rusqlite::Connection, + min_length_chars: u32, + tag_filter: Option<&str>, +) -> Result { + let mut flagged = Vec::new(); + let total = for_each_atom(conn, tag_filter, "a.id, a.content", |row| { + let id: String = row.get(0)?; + let content: String = row.get(1)?; + if (content.chars().count() as u32) < min_length_chars { + return Ok(true); // count toward total, but not flagged + } + let has_heading = content.lines().any(|l| l.trim_start().starts_with('#')); + if !has_heading { + push_flag(&mut flagged, id, &content); + } + Ok(true) + })?; + Ok(RawOutcome { total_considered: total, flagged_atoms: flagged }) +} + +pub(super) fn eval_tag_cardinality( + conn: &rusqlite::Connection, + min: u32, + max: u32, + tag_filter: Option<&str>, +) -> Result { + let candidates: Vec<(String, String)> = load_candidates_id_content(conn, tag_filter)?; + let total = candidates.len() as i32; + if candidates.is_empty() { + return Ok(RawOutcome { total_considered: 0, flagged_atoms: Vec::new() }); + } + let ids: Vec<&str> = candidates.iter().map(|(id, _)| id.as_str()).collect(); + let placeholders: String = std::iter::repeat_n("?", ids.len()).collect::>().join(","); + let sql = format!("SELECT atom_id, COUNT(*) FROM atom_tags WHERE atom_id IN ({placeholders}) GROUP BY atom_id"); + let mut stmt = conn.prepare(&sql)?; + let rows = stmt.query_map(rusqlite::params_from_iter(ids.iter().copied()), |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)? as u32)) + })?; + let mut count_by: HashMap = HashMap::new(); + for row in rows { + let (id, n) = row?; + count_by.insert(id, n); + } + let mut flagged = Vec::new(); + for (id, content) in candidates { + let n = count_by.get(&id).copied().unwrap_or(0); + let too_few = min > 0 && n < min; + let too_many = max > 0 && n > max; + if too_few || too_many { + push_flag(&mut flagged, id, &content); + } + } + Ok(RawOutcome { total_considered: total, flagged_atoms: flagged }) +} diff --git a/crates/atomic-core/src/health/custom/tests.rs b/crates/atomic-core/src/health/custom/tests.rs new file mode 100644 index 00000000..d6027442 --- /dev/null +++ b/crates/atomic-core/src/health/custom/tests.rs @@ -0,0 +1,412 @@ +//! Custom-rule evaluator tests. + +use super::*; +use crate::db::Database; +use crate::storage::sqlite::SqliteStorage; +use rusqlite::params; +use std::sync::Arc; + +fn make_storage() -> (SqliteStorage, tempfile::TempDir) { + let tmp = tempfile::tempdir().unwrap(); + let path = tmp.path().join("test.db"); + let db = Arc::new(Database::open(&path).unwrap()); + (SqliteStorage::new(db), tmp) +} + +fn insert_atom(conn: &rusqlite::Connection, id: &str, content: &str, source: Option<&str>) { + conn.execute( + "INSERT INTO atoms (id, content, source_url, embedding_status, tagging_status, created_at, updated_at) \ + VALUES (?1, ?2, ?3, 'complete', 'complete', datetime('now'), datetime('now'))", + params![id, content, source], + ) + .unwrap(); +} + +fn insert_tag(conn: &rusqlite::Connection, id: &str, name: &str) { + conn.execute( + "INSERT INTO tags (id, name, parent_id, created_at, is_autotag_target) VALUES (?1, ?2, NULL, datetime('now'), 0)", + params![id, name], + ) + .unwrap(); +} + +fn link(conn: &rusqlite::Connection, atom: &str, tag: &str) { + conn.execute( + "INSERT INTO atom_tags (atom_id, tag_id) VALUES (?1, ?2)", + params![atom, tag], + ) + .unwrap(); +} + +#[test] +fn require_source_flags_atoms_without_url() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a1", "has source", Some("https://x.com/a")); + insert_atom(&conn, "a2", "no source", None); + insert_atom(&conn, "a3", "blank source", Some("")); + } + + let check = CustomCheck { + id: "c1".into(), + label: "needs source".into(), + description: String::new(), + enabled: true, + weight: 0.0, + rule: CustomRule::RequireSource { tag_filter: None }, + }; + let out = run_all(&storage, &[check.clone()]).unwrap(); + assert_eq!(out.len(), 1); + let (_, result, _) = &out[0]; + let data = &result.data; + assert_eq!(data["total_considered"], 3); + assert_eq!(data["flagged_count"], 2); + assert_eq!(result.status, "error"); +} + +#[test] +fn tag_requires_flags_atoms_missing_required_tag() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a1", "paper with source", Some("https://x")); + insert_atom(&conn, "a2", "paper no source", None); + insert_tag(&conn, "t_paper", "paper"); + insert_tag(&conn, "t_sourced", "sourced"); + link(&conn, "a1", "t_paper"); + link(&conn, "a1", "t_sourced"); + link(&conn, "a2", "t_paper"); + } + + let check = CustomCheck { + id: "c1".into(), + label: "papers need sourced".into(), + description: String::new(), + enabled: true, + weight: 0.0, + rule: CustomRule::TagRequires { + any_of: vec!["t_paper".into()], + required: vec!["t_sourced".into()], + }, + }; + let out = run_all(&storage, &[check.clone()]).unwrap(); + assert_eq!(out.len(), 1); + let (_, result, _) = &out[0]; + assert_eq!(result.data["total_considered"], 2); + assert_eq!(result.data["flagged_count"], 1); + // Only a2 is flagged. + let flagged = result.data["flagged"].as_array().unwrap(); + assert_eq!(flagged.len(), 1); + assert_eq!(flagged[0]["id"], "a2"); +} + +#[test] +fn content_regex_with_invert_flags_atoms_not_matching() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a1", "has TODO inside", None); + insert_atom(&conn, "a2", "no markers here", None); + } + + let check = CustomCheck { + id: "c1".into(), + label: "no TODO in notes".into(), + description: String::new(), + enabled: true, + weight: 0.0, + rule: CustomRule::ContentRegex { + pattern: r"TODO".into(), + invert: false, + }, + }; + let out = run_all(&storage, &[check.clone()]).unwrap(); + let (_, result, _) = &out[0]; + assert_eq!(result.data["flagged_count"], 1); + assert_eq!(result.data["flagged"][0]["id"], "a1"); + + let inverted = CustomCheck { + rule: CustomRule::ContentRegex { + pattern: r"TODO".into(), + invert: true, + }, + ..check.clone() + }; + let out = run_all(&storage, &[inverted]).unwrap(); + let (_, result, _) = &out[0]; + assert_eq!(result.data["flagged_count"], 1); + assert_eq!(result.data["flagged"][0]["id"], "a2"); +} + +#[test] +fn disabled_checks_are_skipped() { + let (storage, _tmp) = make_storage(); + let check = CustomCheck { + id: "c1".into(), + label: "anything".into(), + description: String::new(), + enabled: false, + weight: 0.0, + rule: CustomRule::RequireSource { tag_filter: None }, + }; + let out = run_all(&storage, &[check.clone()]).unwrap(); + assert!(out.is_empty()); +} + +#[test] +fn zero_weight_produces_informational_result() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a1", "x", None); + } + let check = CustomCheck { + id: "c1".into(), + label: "l".into(), + description: String::new(), + enabled: true, + weight: 0.0, + rule: CustomRule::RequireSource { tag_filter: None }, + }; + let out = run_all(&storage, &[check.clone()]).unwrap(); + assert!(out[0].1.informational); + + let scored = CustomCheck { + weight: 0.2, + ..check + }; + let out = run_all(&storage, &[scored]).unwrap(); + assert!(!out[0].1.informational); +} + +// ---- Tier 1 ---- + +fn check_with(rule: CustomRule) -> CustomCheck { + CustomCheck { + id: "c1".into(), + label: "l".into(), + description: String::new(), + enabled: true, + weight: 0.0, + rule, + } +} + +#[test] +fn require_tag_flags_untagged_atoms() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a1", "tagged", None); + insert_atom(&conn, "a2", "bare", None); + insert_tag(&conn, "t_topic", "topic"); + link(&conn, "a1", "t_topic"); + } + let check = check_with(CustomRule::RequireTag { + any_of: vec!["t_topic".into()], + tag_filter: None, + }); + let out = run_all(&storage, &[check]).unwrap(); + let (_, r, _) = &out[0]; + assert_eq!(r.data["flagged_count"], 1); + assert_eq!(r.data["flagged"][0]["id"], "a2"); +} + +#[test] +fn content_length_flags_too_short_and_too_long() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a1", "one two three four five six", None); // 6 words, OK + insert_atom(&conn, "a2", "tiny", None); // 1 word + insert_atom(&conn, "a3", &"w ".repeat(50), None); // 50 words + } + let check = check_with(CustomRule::ContentLength { + min_words: 5, + max_words: 30, + tag_filter: None, + }); + let out = run_all(&storage, &[check]).unwrap(); + let (_, r, _) = &out[0]; + assert_eq!(r.data["flagged_count"], 2); + let flagged: Vec<&str> = r.data["flagged"] + .as_array() + .unwrap() + .iter() + .map(|v| v["id"].as_str().unwrap()) + .collect(); + assert!(flagged.contains(&"a2")); + assert!(flagged.contains(&"a3")); +} + +#[test] +fn citation_count_flags_atoms_with_too_few_links() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a1", "one [link](http://a) and [[wiki]]", None); + insert_atom(&conn, "a2", "no citations here at all", None); + insert_atom(&conn, "a3", "only [one](http://x) here", None); + } + let check = check_with(CustomRule::CitationCount { + min_citations: 2, + tag_filter: None, + }); + let out = run_all(&storage, &[check]).unwrap(); + let (_, r, _) = &out[0]; + assert_eq!(r.data["flagged_count"], 2); + let flagged: Vec<&str> = r.data["flagged"] + .as_array().unwrap().iter().map(|v| v["id"].as_str().unwrap()).collect(); + assert!(flagged.contains(&"a2")); + assert!(flagged.contains(&"a3")); +} + +#[test] +fn source_domain_allowlist_flags_off_list_domains() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a1", "paper", Some("https://arxiv.org/abs/1")); + insert_atom(&conn, "a2", "blog", Some("https://random.example/post")); + insert_atom(&conn, "a3", "no source skipped", None); + } + let check = check_with(CustomRule::SourceDomainMatches { + domains: vec!["arxiv.org".into()], + mode: DomainMatchMode::Allowlist, + tag_filter: None, + }); + let out = run_all(&storage, &[check]).unwrap(); + let (_, r, _) = &out[0]; + // a3 is skipped (no source); a1 on allowlist; a2 off. + assert_eq!(r.data["total_considered"], 2); + assert_eq!(r.data["flagged_count"], 1); + assert_eq!(r.data["flagged"][0]["id"], "a2"); +} + +#[test] +fn source_domain_blocklist_flags_on_list_domains() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a1", "reddit", Some("https://old.reddit.com/r/x")); + insert_atom(&conn, "a2", "arxiv", Some("https://arxiv.org/abs/1")); + } + let check = check_with(CustomRule::SourceDomainMatches { + domains: vec!["reddit.com".into()], + mode: DomainMatchMode::Blocklist, + tag_filter: None, + }); + let out = run_all(&storage, &[check]).unwrap(); + let (_, r, _) = &out[0]; + assert_eq!(r.data["flagged_count"], 1); + assert_eq!(r.data["flagged"][0]["id"], "a1"); +} + +#[test] +fn stale_atom_flags_old_tagged_atoms() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_tag(&conn, "t_draft", "draft"); + // Old: 30 days ago + let old = (chrono::Utc::now() - chrono::Duration::days(30)).to_rfc3339(); + conn.execute( + "INSERT INTO atoms (id, content, source_url, embedding_status, tagging_status, created_at, updated_at) \ + VALUES ('a1', 'stale', NULL, 'complete', 'complete', ?1, ?1)", + params![old], + ).unwrap(); + // Fresh: now + insert_atom(&conn, "a2", "fresh", None); + link(&conn, "a1", "t_draft"); + link(&conn, "a2", "t_draft"); + } + let check = check_with(CustomRule::StaleAtom { + tag: "t_draft".into(), + max_age_days: 14, + }); + let out = run_all(&storage, &[check]).unwrap(); + let (_, r, _) = &out[0]; + assert_eq!(r.data["total_considered"], 2); + assert_eq!(r.data["flagged_count"], 1); + assert_eq!(r.data["flagged"][0]["id"], "a1"); +} + +// ---- Tier 2 ---- + +#[test] +fn forbidden_combo_flags_atoms_carrying_all_forbidden_tags() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a1", "both", None); + insert_atom(&conn, "a2", "only draft", None); + insert_tag(&conn, "t_draft", "draft"); + insert_tag(&conn, "t_published", "published"); + link(&conn, "a1", "t_draft"); + link(&conn, "a1", "t_published"); + link(&conn, "a2", "t_draft"); + } + let check = check_with(CustomRule::ForbiddenTagCombo { + all_of: vec!["t_draft".into(), "t_published".into()], + }); + let out = run_all(&storage, &[check]).unwrap(); + let (_, r, _) = &out[0]; + assert_eq!(r.data["flagged_count"], 1); + assert_eq!(r.data["flagged"][0]["id"], "a1"); +} + +#[test] +fn missing_heading_flags_long_atoms_without_heading() { + let (storage, _tmp) = make_storage(); + let long = "x".repeat(200); + let with_h = format!("# Title\n{}", "y".repeat(200)); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "short", "too short to flag", None); + insert_atom(&conn, "no_h", &long, None); + insert_atom(&conn, "has_h", &with_h, None); + } + let check = check_with(CustomRule::MissingHeading { + min_length_chars: 120, + tag_filter: None, + }); + let out = run_all(&storage, &[check]).unwrap(); + let (_, r, _) = &out[0]; + assert_eq!(r.data["flagged_count"], 1); + assert_eq!(r.data["flagged"][0]["id"], "no_h"); +} + +#[test] +fn tag_cardinality_flags_over_and_under_tagged() { + let (storage, _tmp) = make_storage(); + { + let conn = storage.db.conn.lock().unwrap(); + insert_atom(&conn, "a0", "no tags", None); + insert_atom(&conn, "a1", "one tag", None); + insert_atom(&conn, "a2", "two tags", None); + insert_atom(&conn, "a5", "five tags", None); + for i in 0..5 { + insert_tag(&conn, &format!("t{i}"), &format!("t{i}")); + } + link(&conn, "a1", "t0"); + link(&conn, "a2", "t0"); + link(&conn, "a2", "t1"); + for i in 0..5 { + link(&conn, "a5", &format!("t{i}")); + } + } + let check = check_with(CustomRule::TagCardinality { + min: 1, + max: 3, + tag_filter: None, + }); + let out = run_all(&storage, &[check]).unwrap(); + let (_, r, _) = &out[0]; + let flagged: Vec<&str> = r.data["flagged"] + .as_array().unwrap().iter().map(|v| v["id"].as_str().unwrap()).collect(); + assert!(flagged.contains(&"a0")); // under min + assert!(flagged.contains(&"a5")); // over max + assert!(!flagged.contains(&"a1")); + assert!(!flagged.contains(&"a2")); +} diff --git a/crates/atomic-core/src/health/custom/types.rs b/crates/atomic-core/src/health/custom/types.rs new file mode 100644 index 00000000..9586c8d2 --- /dev/null +++ b/crates/atomic-core/src/health/custom/types.rs @@ -0,0 +1,160 @@ +//! Types shared across the custom-check implementation. +//! +//! The public types (`CustomRule`, `DomainMatchMode`, `CustomCheck`, +//! `PreviewResult`) are re-exported from `custom::mod`. `RawOutcome` and +//! `FlaggedAtom` are evaluator plumbing and stay crate-private. + +use serde::{Deserialize, Serialize}; + +/// Structural rule a custom check evaluates. One variant per supported +/// predicate shape — keeping this enum small is deliberate: every variant the +/// UI needs requires a Rust implementation, and that pressure keeps the +/// feature safe and predictable. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum CustomRule { + /// Atoms tagged with any of `any_of` must also be tagged with every tag + /// in `required`. Flags atoms that violate the invariant. + TagRequires { + any_of: Vec, + required: Vec, + }, + /// Atoms carrying any of the `tag_filter` ids (or all atoms when None) + /// must have a non-empty `source_url`. Flags atoms missing a source. + RequireSource { + #[serde(default)] + tag_filter: Option, + }, + /// Atoms whose content matches (or doesn't match, when `invert`) the + /// given regex. Bounded regex size/DFA. + ContentRegex { + pattern: String, + #[serde(default)] + invert: bool, + }, + /// Atoms (optionally scoped to `tag_filter`) must carry at least one of + /// the `any_of` tags. Flags those that don't. + RequireTag { + any_of: Vec, + #[serde(default)] + tag_filter: Option, + }, + /// Flags atoms whose word count is outside `[min_words, max_words]`. + /// Either bound at 0 disables that side of the check. + ContentLength { + #[serde(default)] + min_words: u32, + #[serde(default)] + max_words: u32, + #[serde(default)] + tag_filter: Option, + }, + /// Flags atoms whose citation count (markdown + wiki links) is below + /// `min_citations`. + CitationCount { + min_citations: u32, + #[serde(default)] + tag_filter: Option, + }, + /// Flags atoms whose `source_url` host matches the `domains` list + /// according to `mode`. + SourceDomainMatches { + domains: Vec, + #[serde(default)] + mode: DomainMatchMode, + #[serde(default)] + tag_filter: Option, + }, + /// Flags atoms tagged with `tag` whose last update is older than + /// `max_age_days`. + StaleAtom { + tag: String, + max_age_days: u32, + }, + /// Flags atoms that carry every tag in `all_of` at once. Used to enforce + /// mutual-exclusion between tag sets (e.g. "draft" + "published"). + ForbiddenTagCombo { + all_of: Vec, + }, + /// Flags atoms longer than `min_length_chars` whose content has no + /// markdown heading. + MissingHeading { + #[serde(default = "default_min_heading_len")] + min_length_chars: u32, + #[serde(default)] + tag_filter: Option, + }, + /// Flags atoms whose tag count is outside `[min, max]`. Either bound at + /// 0 disables that side. + TagCardinality { + #[serde(default)] + min: u32, + #[serde(default)] + max: u32, + #[serde(default)] + tag_filter: Option, + }, +} + +/// How `SourceDomainMatches` interprets the `domains` list. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum DomainMatchMode { + /// Flag atoms whose source_url domain is NOT in the list. + #[default] + Allowlist, + /// Flag atoms whose source_url domain IS in the list. + Blocklist, +} + +fn default_min_heading_len() -> u32 { + 120 +} + +/// User-defined health check. `id` is a stable uuid so UI edits don't change +/// the score identity across saves. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct CustomCheck { + pub id: String, + pub label: String, + #[serde(default)] + pub description: String, + #[serde(default = "default_enabled")] + pub enabled: bool, + /// 0 = informational (not scored). > 0 contributes at that weight, + /// normalized alongside built-in checks. + #[serde(default)] + pub weight: f64, + pub rule: CustomRule, +} + +fn default_enabled() -> bool { + true +} + +/// Preview output for a single (unsaved) rule. Used by the UI to show +/// users "this would flag N atoms" as they tune rule parameters, before +/// persisting the rule. +#[derive(Serialize, Debug)] +pub struct PreviewResult { + pub total_considered: i32, + pub flagged_count: i32, + /// First few flagged atoms (capped at `PREVIEW_SAMPLE`). Each entry + /// has `id` and `title_preview`. + pub sample: Vec, +} + +/// Raw per-rule evaluation output before we wrap it as a `HealthCheckResult`. +pub(super) struct RawOutcome { + pub(super) total_considered: i32, + pub(super) flagged_atoms: Vec, +} + +#[derive(Serialize)] +pub(super) struct FlaggedAtom { + pub(super) id: String, + pub(super) title_preview: String, +} diff --git a/crates/atomic-core/src/health/fixes.rs b/crates/atomic-core/src/health/fixes.rs new file mode 100644 index 00000000..ae096527 --- /dev/null +++ b/crates/atomic-core/src/health/fixes.rs @@ -0,0 +1,809 @@ +//! Deterministic (non-LLM) auto-fix implementations. +//! +//! Each function either executes the fix immediately (if `dry_run = false`) or +//! describes what it would do (if `dry_run = true`). Every executed fix logs +//! a `HealthFixLog` row for undo support. + +use super::{audit, FixAction}; +use crate::error::AtomicCoreError; +use crate::storage::sqlite::health::HealthRawData; +use crate::AtomicCore; +use serde_json::json; + + +/// Retry failed embeddings and process pending ones. Safe tier. +pub async fn fix_embedding_coverage( + core: &AtomicCore, + dry_run: bool, +) -> Result, AtomicCoreError> { + let status = core.get_pipeline_status().await?; + let pending = status.pending; + let failed = status.failed_count; + let count = pending + failed; + + if count == 0 { + return Ok(None); + } + + let id = if dry_run { + "dry_run".to_string() + } else { + let retried = core.retry_failed_embeddings(|_| {}).await.unwrap_or(0); + let processed = core.process_pending_embeddings(|_| {}).await.unwrap_or(0); + tracing::info!(retried, processed, "embedding_coverage fix applied"); + + audit::log_fix( + core, + "embedding_coverage", + "retry_failed_and_process_pending", + "safe", + None, + None, + json!({"failed": failed, "pending": pending}), + json!({"retried": retried, "processed": processed}), + None, + None, + ) + .await? + }; + + Ok(Some(FixAction { + id, + check: "embedding_coverage".to_string(), + action: "retry_failed_and_process_pending".to_string(), + count, + details: vec![ + format!("{} failed retried", failed), + format!("{} pending processed", pending), + ], + })) +} + +/// Queue a semantic edge graph rebuild. Safe tier. +pub async fn fix_graph_freshness( + core: &AtomicCore, + dry_run: bool, +) -> Result, AtomicCoreError> { + let id = if dry_run { + "dry_run".to_string() + } else { + let edges = core.rebuild_semantic_edges().await.unwrap_or(0); + tracing::info!(edges, "semantic_graph_freshness fix: edges rebuilt"); + + audit::log_fix( + core, + "semantic_graph_freshness", + "queued_rebuild", + "safe", + None, + None, + json!({}), + json!({"edges_rebuilt": edges}), + None, + None, + ) + .await? + }; + + Ok(Some(FixAction { + id, + check: "semantic_graph_freshness".to_string(), + action: "queued_rebuild".to_string(), + count: 1, + details: vec!["Semantic edge graph rebuild queued".to_string()], + })) +} + +/// Reset skipped-with-no-tags atoms to pending and run the tagging pipeline. Safe tier. +/// +/// These are atoms whose `tagging_status = 'skipped'` AND have zero tags assigned. +/// They were typically imported before auto-tagging was configured and never retried. +pub async fn fix_tagging_coverage( + core: &AtomicCore, + skipped_untagged_count: i32, + dry_run: bool, +) -> Result, AtomicCoreError> { + if skipped_untagged_count == 0 { + return Ok(None); + } + + let id = if dry_run { + "dry_run".to_string() + } else { + let reset = core + .storage() + .reset_skipped_untagged_to_pending_sync() + .await + .unwrap_or(0); + let processed = core.process_pending_tagging(|_| {}).await.unwrap_or(0); + tracing::info!(reset, processed, "tagging_coverage fix: skipped atoms re-queued"); + + audit::log_fix( + core, + "tagging_coverage", + "reset_skipped_untagged_to_pending", + "safe", + None, + None, + json!({"skipped_untagged": skipped_untagged_count}), + json!({"reset": reset, "processed": processed}), + None, + None, + ) + .await? + }; + + Ok(Some(FixAction { + id, + check: "tagging_coverage".to_string(), + action: "reset_skipped_untagged_to_pending".to_string(), + count: skipped_untagged_count, + details: vec![format!("{} atoms reset to pending for re-tagging", skipped_untagged_count)], + })) +} + +/// Delete orphan tags (tags with 0 atoms and no children). Low tier. +pub async fn fix_orphan_tags( + core: &AtomicCore, + raw: &HealthRawData, + dry_run: bool, +) -> Result, AtomicCoreError> { + if raw.orphan_tags.is_empty() { + return Ok(None); + } + + let count = raw.orphan_tags.len() as i32; + let names: Vec = raw.orphan_tags.iter().map(|(_, n)| n.clone()).collect(); + let ids: Vec = raw.orphan_tags.iter().map(|(id, _)| id.clone()).collect(); + + let before_state = json!(raw + .orphan_tags + .iter() + .map(|(id, name)| json!({"id": id, "name": name, "parent_id": null})) + .collect::>()); + + let id = if dry_run { + "dry_run".to_string() + } else { + for tag_id in &ids { + if let Err(e) = core.delete_tag(tag_id, false).await { + tracing::warn!(tag_id, error = %e, "failed to delete orphan tag"); + } + } + tracing::info!(count, "orphan_tags fix: deleted tags"); + + audit::log_fix( + core, + "orphan_tags", + "deleted_tags", + "low", + None, + Some(&ids), + before_state, + json!({"deleted": count}), + None, + None, + ) + .await? + }; + + Ok(Some(FixAction { + id, + check: "orphan_tags".to_string(), + action: "deleted_tags".to_string(), + count, + details: names, + })) +} + +/// Generate missing wiki articles for eligible tags. Low tier. +/// Rate-limited to 3 generations per fix run to avoid long waits. +pub async fn fix_wiki_coverage( + core: &AtomicCore, + raw: &HealthRawData, + dry_run: bool, +) -> Result, AtomicCoreError> { + let gaps = &raw.wiki_gaps; + let stale = &raw.wiki_stale; + + if gaps.is_empty() && stale.is_empty() { + return Ok(None); + } + + // Prioritise by atom count (highest first), max 3 total + let mut to_generate: Vec<(String, String)> = gaps + .iter() + .map(|g| (g.tag_id.clone(), g.tag_name.clone())) + .collect(); + // Then stale wikis + for s in stale { + to_generate.push((s.tag_id.clone(), s.tag_name.clone())); + } + to_generate.truncate(3); + + let count = to_generate.len() as i32; + let detail_names: Vec = to_generate.iter().map(|(_, n)| n.clone()).collect(); + + let id = if dry_run { + "dry_run".to_string() + } else { + for (tag_id, tag_name) in &to_generate { + match core.generate_wiki(tag_id, tag_name).await { + Ok(_) => tracing::info!(tag_id, "wiki generated"), + Err(e) => tracing::warn!(tag_id, error = %e, "wiki generation failed"), + } + } + + audit::log_fix( + core, + "wiki_coverage", + "generated_wikis", + "low", + None, + None, + json!({"gaps": gaps.len(), "stale": stale.len()}), + json!({"generated": count}), + None, + None, + ) + .await? + }; + + Ok(Some(FixAction { + id, + check: "wiki_coverage".to_string(), + action: "generated_wikis".to_string(), + count, + details: detail_names, + })) +} + +/// Deduplicate atoms with the exact same source_url. Medium tier. +/// Keeps newest; merges tags from all duplicates; deletes older copies. +pub async fn fix_source_uniqueness( + core: &AtomicCore, + raw: &HealthRawData, + dry_run: bool, +) -> Result, AtomicCoreError> { + if raw.duplicate_sources.is_empty() { + return Ok(None); + } + + let mut deleted_ids: Vec = Vec::new(); + let mut before_atoms: Vec = Vec::new(); + + for (source_url, atom_ids) in &raw.duplicate_sources { + if atom_ids.len() < 2 { + continue; + } + + // Fetch all atoms in this group to find newest + let mut atoms_with_dates: Vec<(String, String)> = Vec::new(); // (id, updated_at) + for id in atom_ids { + if let Ok(Some(a)) = core.get_atom(id).await { + atoms_with_dates.push((a.atom.id.clone(), a.atom.updated_at.clone())); + + // Capture before state + let tag_ids: Vec = a.tags.iter().map(|t| t.id.clone()).collect(); + before_atoms.push(json!({ + "id": a.atom.id, + "content": a.atom.content, + "source_url": a.atom.source_url, + "tag_ids": tag_ids, + })); + } + } + + // Sort by updated_at desc — newest first + atoms_with_dates.sort_by(|a, b| b.1.cmp(&a.1)); + let keep_id = atoms_with_dates[0].0.clone(); + let to_delete: Vec = atoms_with_dates[1..].iter().map(|(id, _)| id.clone()).collect(); + + if dry_run { + tracing::info!( + source_url, + keep = %keep_id, + delete = ?to_delete, + "dry_run: would merge source duplicates" + ); + } else { + // Collect all tags from duplicates into the keeper + let mut all_tag_ids: std::collections::HashSet = std::collections::HashSet::new(); + for id in &to_delete { + if let Ok(tag_ids) = core.storage().get_atom_tag_ids_impl(id).await { + all_tag_ids.extend(tag_ids); + } + } + // Merge tags onto keeper + if !all_tag_ids.is_empty() { + let tag_list: Vec = all_tag_ids.into_iter().collect(); + let _ = core + .storage() + .link_tags_to_atom_impl(&keep_id, &tag_list) + .await; + } + // Delete duplicates — but skip locked atoms so source-of-truth + // material never gets automatically merged away. + for id in &to_delete { + if core.is_atom_locked(id).await.unwrap_or(false) { + tracing::info!(id, "skipping locked atom in source-duplicate merge"); + continue; + } + if let Err(e) = core.delete_atom(id).await { + tracing::warn!(id, error = %e, "failed to delete source duplicate atom"); + } else { + deleted_ids.push(id.clone()); + } + } + } + } + + if deleted_ids.is_empty() && !dry_run { + return Ok(None); + } + + let count = if dry_run { + raw.duplicate_sources + .iter() + .map(|(_, ids)| (ids.len() as i32 - 1).max(0)) + .sum::() + } else { + deleted_ids.len() as i32 + }; + + let id = if dry_run { + "dry_run".to_string() + } else { + audit::log_fix( + core, + "source_uniqueness", + "deleted_atoms", + "medium", + Some(&deleted_ids), + None, + serde_json::Value::Array(before_atoms), + json!({"deleted": count}), + None, + None, + ) + .await? + }; + + Ok(Some(FixAction { + id, + check: "source_uniqueness".to_string(), + action: "deleted_atoms".to_string(), + count, + details: if dry_run { + raw.duplicate_sources + .iter() + .map(|(url, _)| url.clone()) + .collect() + } else { + deleted_ids + }, + })) +} + + +/// Delete single-atom tags where `is_autotag_target = true`. Low tier. +/// +/// Only removes tags that were produced by the auto-tagger (is_autotag_target = 1) +/// AND have exactly 1 atom attached. User-created single-atom tags (is_autotag_target = 0) +/// are left alone; those require human review. +pub async fn fix_tag_health_single_atom( + core: &AtomicCore, + raw: &HealthRawData, + dry_run: bool, +) -> Result, AtomicCoreError> { + // Use the pre-fetched list; filter to autotag-only entries. + let targets: Vec<_> = raw + .single_atom_tag_list + .iter() + .filter(|t| t.is_autotag) + .collect(); + + if targets.is_empty() { + return Ok(None); + } + + let count = targets.len() as i32; + let ids: Vec = targets.iter().map(|t| t.id.clone()).collect(); + let names: Vec = targets.iter().map(|t| t.name.clone()).collect(); + + let before_state = json!(targets + .iter() + .map(|t| json!({"id": t.id, "name": t.name, "is_autotag": t.is_autotag})) + .collect::>()); + + let id = if dry_run { + "dry_run".to_string() + } else { + for tag_id in &ids { + if let Err(e) = core.delete_tag(tag_id, false).await { + tracing::warn!(tag_id, error = %e, "failed to delete single-atom autotag"); + } + } + tracing::info!(count, "tag_health single-atom fix: deleted autotag-only tags"); + + audit::log_fix( + core, + "tag_health", + "deleted_single_atom_autotags", + "low", + None, + Some(&ids), + before_state, + json!({"deleted": count}), + None, + None, + ) + .await? + }; + + Ok(Some(FixAction { + id, + check: "tag_health".to_string(), + action: "deleted_single_atom_autotags".to_string(), + count, + details: names, + })) +} + +/// Resolve broken internal links in all atoms to `atom://id` URIs. Medium tier. +/// +/// For each atom with relative markdown links or `[[wikilinks]]`: +/// 1. Resolve the href to a candidate source URL using the atom's vault prefix. +/// 2. Look up the target atom by source URL. +/// 3. Replace the original href with `atom://target_id`. +/// +/// Unresolvable links are left untouched and reported in `details`. +pub async fn fix_broken_internal_links( + core: &AtomicCore, + dry_run: bool, +) -> Result, AtomicCoreError> { + use crate::health::link_resolution::{ + apply_link_replacements, extract_internal_links, markdown_stem_fallback, vault_root, + ResolvedLink, + }; + + let candidates = core.storage().get_link_candidate_atoms_sync().await?; + if candidates.is_empty() { + return Ok(None); + } + + let mut fixed_total = 0i32; + let mut unresolvable: Vec = Vec::new(); + let mut before_state: Vec = Vec::new(); + let mut atom_ids_changed: Vec = Vec::new(); + + for (atom_id, content, source_url) in &candidates { + let links = extract_internal_links(content, source_url.as_deref()); + if links.is_empty() { + continue; + } + + let candidate_urls: Vec = links + .iter() + .flat_map(|l| l.candidate_source_urls.iter().cloned()) + .collect(); + + let url_map = core + .storage() + .find_atoms_by_source_urls_sync(candidate_urls) + .await + .unwrap_or_default(); + + let vault_pfx = source_url + .as_deref() + .and_then(vault_root) + .map(|s| s.to_string()); + + let mut resolved: Vec = Vec::new(); + + for link in &links { + // Try exact source URL match first + let target_id = link + .candidate_source_urls + .iter() + .find_map(|u| url_map.get(u).cloned()); + + let target_id = if target_id.is_none() { + // Fall back to vault-wide name search. + // - wikilinks: exact wikilink name (already computed). + // - markdown [text](href.md): filename stem as implicit wikilink + // name. Mirrors the fix in `compute.rs` so the link resolver + // finds the same subdirectory matches the picker does. + if let Some(pfx) = &vault_pfx { + let name = link + .wikilink_name + .clone() + .or_else(|| markdown_stem_fallback(&link.href)); + if let Some(name) = name { + core.storage() + .find_atom_by_wikilink_name_sync(name, pfx.clone()) + .await + .unwrap_or(None) + .map(|(id, _)| id) + } else { + None + } + } else { + None + } + } else { + target_id + }; + + match target_id { + Some(id) => { + resolved.push(ResolvedLink { + original: link.original.clone(), + target_atom_id: id.clone(), + replacement: format!("atom://{}", id), + }); + fixed_total += 1; + } + None => { + unresolvable.push(format!("{} (in {})", link.href, atom_id)); + } + } + } + + if resolved.is_empty() { + continue; + } + + before_state.push(json!({ + "id": atom_id, + "content": content, + "source_url": source_url, + })); + + if !dry_run { + let new_content = apply_link_replacements(content, &resolved); + core.update_atom_content_only(atom_id, crate::UpdateAtomRequest { + content: new_content, + source_url: source_url.clone(), + published_at: None, + tag_ids: None, + }) + .await + .map_err(|e| { + tracing::warn!(atom_id, error = %e, "failed to update atom with resolved links"); + e + })?; + atom_ids_changed.push(atom_id.clone()); + } + } + + if fixed_total == 0 { + return Ok(None); + } + + let id = if dry_run { + "dry_run".to_string() + } else { + audit::log_fix( + core, + "broken_internal_links", + "resolve_internal_links", + "medium", + Some(&atom_ids_changed), + None, + serde_json::Value::Array(before_state), + json!({ + "resolved": fixed_total, + "unresolvable": unresolvable.len(), + }), + None, + None, + ) + .await? + }; + + tracing::info!( + fixed = fixed_total, + unresolvable = unresolvable.len(), + dry_run, + "broken_internal_links fix completed" + ); + + let mut details: Vec = atom_ids_changed + .iter() + .map(|id| format!("Updated: {}", id)) + .collect(); + details.extend( + unresolvable.iter().take(10).map(|s| format!("Unresolvable: {}", s)), + ); + + Ok(Some(FixAction { + id, + check: "broken_internal_links".to_string(), + action: "resolve_internal_links".to_string(), + count: fixed_total, + details, + })) +} + +/// Strip one unresolved link from an atom's content, replacing it with its +/// display text (for markdown links) or the name (for wikilinks). +/// +/// `link_raw` must exactly match the text as it appears in the atom content. +pub async fn remove_broken_link( + core: &AtomicCore, + atom_id: &str, + link_raw: &str, +) -> Result { + let atom = core + .get_atom(atom_id) + .await? + .ok_or_else(|| AtomicCoreError::NotFound(format!("atom {} not found", atom_id)))?; + + let content = &atom.atom.content; + + // Determine replacement text. + let replacement = if let Some(inner) = parse_markdown_link_text(link_raw) { + inner + } else if let Some(name) = parse_wikilink_name(link_raw) { + name + } else { + tracing::warn!(link_raw = %link_raw, "remove_broken_link: unrecognised link format, replacing with empty string"); + String::new() + }; + + let new_content = content.replacen(link_raw, &replacement, 1); + + // Record before state for undo. + let before_state = serde_json::json!([{ + "id": atom_id, + "content": content, + "source_url": atom.atom.source_url, + }]); + let after_state = serde_json::json!([{ + "id": atom_id, + "content": new_content, + "source_url": atom.atom.source_url, + }]); + + let tag_ids: Vec = atom.tags.iter().map(|t| t.id.clone()).collect(); + let upd = crate::UpdateAtomRequest { + content: new_content, + source_url: atom.atom.source_url.clone(), + published_at: atom.atom.published_at.clone(), + tag_ids: Some(tag_ids), + }; + core.update_atom(atom_id, upd, |_| {}).await?; + + let id = audit::log_fix( + core, + "broken_internal_links", + "remove_link", + "medium", + Some(&[atom_id.to_string()]), + None, + before_state, + after_state, + None, + None, + ) + .await + .unwrap_or_else(|_| uuid::Uuid::new_v4().to_string()); + + Ok(FixAction { + id, + check: "broken_internal_links".to_string(), + action: "remove_link".to_string(), + count: 1, + details: vec![format!("Removed link '{}' from atom {}", link_raw, atom_id)], + }) +} + +/// Relink a broken link in an atom to a target atom via `atom://` URI. +pub async fn relink_broken_link( + core: &AtomicCore, + atom_id: &str, + link_raw: &str, + target_atom_id: &str, +) -> Result { + tracing::debug!(atom_id, target_atom_id, link_raw, "relink_broken_link: begin"); + let atom = core + .get_atom(atom_id) + .await? + .ok_or_else(|| AtomicCoreError::NotFound(format!("atom {} not found", atom_id)))?; + + let target = core + .get_atom(target_atom_id) + .await? + .ok_or_else(|| AtomicCoreError::NotFound(format!("target atom {} not found", target_atom_id)))?; + + let content = &atom.atom.content; + + // Guard: link_raw must be present in the content. + if !content.contains(link_raw) { + return Err(AtomicCoreError::Validation(format!( + "Link '{link_raw}' not found in atom content; may have been already edited" + ))); + } + + // Build replacement: markdown form [display_text](atom://). + let display_text = if let Some(text) = parse_markdown_link_text(link_raw) { + text + } else if let Some(name) = parse_wikilink_name(link_raw) { + name + } else { + link_raw.to_string() + }; + let new_link = format!("[{}](atom://{})", display_text, target_atom_id); + let new_content = content.replacen(link_raw, &new_link, 1); + + if new_content == *content { + return Err(AtomicCoreError::Validation(format!( + "Link '{link_raw}' not found in atom content; may have been already edited" + ))); + } + + let before_state = serde_json::json!([{ + "id": atom_id, + "content": content, + "source_url": atom.atom.source_url, + }]); + let after_state = serde_json::json!([{ + "id": atom_id, + "content": new_content, + "source_url": atom.atom.source_url, + }]); + + let tag_ids: Vec = atom.tags.iter().map(|t| t.id.clone()).collect(); + let upd = crate::UpdateAtomRequest { + content: new_content.clone(), + source_url: atom.atom.source_url.clone(), + published_at: atom.atom.published_at.clone(), + tag_ids: Some(tag_ids), + }; + core.update_atom(atom_id, upd, |_| {}).await?; + tracing::info!(atom_id, target_atom_id, link_raw, new_link = %new_link, "relink_broken_link: success"); + + let target_title = crate::health::title_preview(&target.atom.content); + let id = audit::log_fix( + core, + "broken_internal_links", + "relink", + "medium", + Some(&[atom_id.to_string()]), + None, + before_state, + after_state, + None, + None, + ) + .await + .unwrap_or_else(|_| uuid::Uuid::new_v4().to_string()); + + Ok(FixAction { + id, + check: "broken_internal_links".to_string(), + action: "relink".to_string(), + count: 1, + details: vec![format!( + "Relinked '{}' in atom {} → atom://{} ('{}')", + link_raw, atom_id, target_atom_id, target_title + )], + }) +} + +/// Extract display text from `[text](url)` markdown link. +fn parse_markdown_link_text(s: &str) -> Option { + if !s.starts_with('[') { + return None; + } + let close_bracket = s.find("](")?; + Some(s[1..close_bracket].to_string()) +} + +/// Extract name from `[[name]]` or `[[name|alias]]` wikilink. +fn parse_wikilink_name(s: &str) -> Option { + let inner = s.strip_prefix("[[")?.strip_suffix("]]")? ; + let name = inner.split('|').next().unwrap_or(inner).trim(); + Some(name.to_string()) +} \ No newline at end of file diff --git a/crates/atomic-core/src/health/gc_task.rs b/crates/atomic-core/src/health/gc_task.rs new file mode 100644 index 00000000..4c17b017 --- /dev/null +++ b/crates/atomic-core/src/health/gc_task.rs @@ -0,0 +1,75 @@ +//! Weekly GC of `health_dismissals` entries. +//! +//! Deletes rows whose `expires_at` has passed, and rows that reference +//! atoms/tags that no longer exist. Safe to re-run; idempotent. + +use crate::scheduler::{state as task_state, ScheduledTask, TaskContext, TaskError, TaskEvent}; +use crate::AtomicCore; +use async_trait::async_trait; +use std::time::Duration; + +pub struct DismissalGcTask; + +const TASK_ID: &str = "health_dismissal_gc"; +const DEFAULT_INTERVAL: Duration = Duration::from_secs(7 * 24 * 60 * 60); // 7 days +const DEFAULT_ENABLED: bool = true; + +#[async_trait] +impl ScheduledTask for DismissalGcTask { + fn id(&self) -> &'static str { + TASK_ID + } + + fn display_name(&self) -> &'static str { + "Health dismissal GC" + } + + fn default_interval(&self) -> Duration { + DEFAULT_INTERVAL + } + + async fn run(&self, core: &AtomicCore, ctx: &TaskContext) -> Result<(), TaskError> { + if !task_state::is_enabled(core, TASK_ID, DEFAULT_ENABLED).await { + return Err(TaskError::Disabled); + } + if !task_state::is_due(core, TASK_ID, DEFAULT_INTERVAL, DEFAULT_ENABLED).await { + return Err(TaskError::NotDue); + } + + let db_id = core + .db_path() + .file_stem() + .and_then(|s| s.to_str()) + .map(String::from) + .unwrap_or_else(|| "default".to_string()); + + (ctx.event_cb)(TaskEvent::Started { + task_id: TASK_ID.to_string(), + db_id: db_id.clone(), + }); + + match core.storage().gc_dismissals_sync().await { + Ok(removed) => { + tracing::info!(removed, db_id = %db_id, "[dismissal_gc] cleanup complete"); + task_state::set_last_run(core, TASK_ID, chrono::Utc::now()) + .await + .ok(); + (ctx.event_cb)(TaskEvent::Completed { + task_id: TASK_ID.to_string(), + db_id, + result_id: None, + }); + Ok(()) + } + Err(e) => { + let msg = e.to_string(); + (ctx.event_cb)(TaskEvent::Failed { + task_id: TASK_ID.to_string(), + db_id, + error: msg.clone(), + }); + Err(TaskError::Other(msg)) + } + } + } +} diff --git a/crates/atomic-core/src/health/link_resolution.rs b/crates/atomic-core/src/health/link_resolution.rs new file mode 100644 index 00000000..8106d3d4 --- /dev/null +++ b/crates/atomic-core/src/health/link_resolution.rs @@ -0,0 +1,649 @@ +//! Generic internal-link extraction and resolution. +//! +//! Handles two link formats found in Obsidian-imported atoms: +//! +//! 1. **Markdown links** — `[text](relative/path.md)` or `[text](../other.md)` +//! 2. **Wikilinks** — `[[File Name]]` or `[[File Name|Display Text]]` +//! +//! A link is "internal" when its href contains no URI scheme (`://`) and is +//! not a bare fragment (`#anchor`). Absolute paths starting with `/` are +//! also excluded — those are server-rooted URLs, not vault-relative paths. +//! +//! Resolution maps each link to a candidate `source_url` (or a set of LIKE +//! patterns for wikilinks) so callers can look the target up in the atom +//! table. + +/// A single internal link found inside an atom's content. +#[derive(Debug, Clone)] +pub struct InternalLink { + /// The original text in the content that needs to be replaced. + /// For markdown: `[text](href)`. For wikilinks: `[[target]]`. + pub original: String, + /// The raw href or wikilink target extracted from the original. + pub href: String, + /// Candidate absolute source URLs to try (exact lookup). + /// Built from the current atom's source_url + relative path. + pub candidate_source_urls: Vec, + /// For wikilinks: search the atoms table with + /// `source_url LIKE '%/' || name || '.md'` across the vault. + pub wikilink_name: Option, +} + +/// A resolved match: an `InternalLink` with its target atom identified. +#[derive(Debug, Clone)] +pub struct ResolvedLink { + pub original: String, + pub target_atom_id: String, + /// Used as the replacement href: `atom://target_atom_id` + pub replacement: String, +} + +// ==================== Extraction ==================== + +/// Extract all internal links from `content`. +/// +/// `source_url` is the current atom's source URL; it is used to resolve +/// relative paths. Pass `None` for atoms without a known source. +pub fn extract_internal_links( + content: &str, + source_url: Option<&str>, +) -> Vec { + let mut links = Vec::new(); + links.extend(extract_markdown_links(content, source_url)); + links.extend(extract_wikilinks(content, source_url)); + links +} + +fn extract_markdown_links(content: &str, source_url: Option<&str>) -> Vec { + let mut links = Vec::new(); + let bytes = content.as_bytes(); + let mut i = 0; + + while i + 1 < bytes.len() { + // Scan for `](` + if bytes[i] != b']' || bytes[i + 1] != b'(' { + i += 1; + continue; + } + + // Find the matching `)` + let href_start = i + 2; + let mut j = href_start; + let mut depth = 1i32; + + while j < bytes.len() && depth > 0 { + match bytes[j] { + b'(' => depth += 1, + b')' => depth -= 1, + _ => {} + } + if depth > 0 { + j += 1; + } + } + + if depth != 0 { + i += 1; + continue; + } + + let raw_href = match std::str::from_utf8(&bytes[href_start..j]) { + Ok(s) => s, + Err(_) => { + i = j + 1; + continue; + } + }; + + // Strip optional inline title: `path.md "Title"` → `path.md` + let href = raw_href + .trim() + .split('"') + .next() + .unwrap_or("") + .split('\'') + .next() + .unwrap_or("") + .trim() + .to_string(); + + if is_internal_href(&href) && looks_like_document(&href) { + // Find the opening `[` to capture display text + full match + let (original, display) = scan_back_for_display_text(content, i, j); + + let candidate_source_urls = match source_url { + Some(su) => build_href_candidates(&href, su), + None => vec![], + }; + + links.push(InternalLink { + original, + href: href.clone(), + candidate_source_urls, + wikilink_name: None, + }); + + // Skip `[display](href)` — display text already consumed above + let _ = display; + } + + i = j + 1; + } + + links +} + +fn extract_wikilinks(content: &str, source_url: Option<&str>) -> Vec { + let mut links = Vec::new(); + let bytes = content.as_bytes(); + let mut i = 0; + + while i + 1 < bytes.len() { + if bytes[i] != b'[' || bytes[i + 1] != b'[' { + i += 1; + continue; + } + + let start = i + 2; + let mut j = start; + + while j + 1 < bytes.len() && !(bytes[j] == b']' && bytes[j + 1] == b']') { + j += 1; + } + + if j + 1 >= bytes.len() { + i += 1; + continue; + } + + let inner = match std::str::from_utf8(&bytes[start..j]) { + Ok(s) => s, + Err(_) => { + i = j + 2; + continue; + } + }; + + // `[[target|display text]]` — keep only the target + let target = inner.split('|').next().unwrap_or("").trim().to_string(); + + if !target.is_empty() { + let original = format!("[[{}]]", inner); + + let candidate_source_urls = match source_url { + Some(su) => build_wikilink_exact_candidates(&target, su), + None => vec![], + }; + + links.push(InternalLink { + original, + href: target.clone(), + candidate_source_urls, + wikilink_name: Some(target), + }); + } + + i = j + 2; + } + + links +} + +// ==================== Predicates ==================== + +/// A link is internal when it has no URI scheme, is not a bare fragment, +/// and does not start with `/` (server-root absolute path). +fn is_internal_href(href: &str) -> bool { + let h = href.trim(); + !h.is_empty() + && !h.starts_with('#') + && !h.starts_with('/') + && !h.contains("://") + && !h.starts_with("mailto:") + && !h.starts_with("tel:") +} + +/// The link looks like a document reference (not an image, anchor, etc.). +fn looks_like_document(href: &str) -> bool { + let h = href.trim().to_lowercase(); + // Explicit markdown/text extensions + if h.ends_with(".md") || h.ends_with(".txt") || h.ends_with(".org") { + return true; + } + // Relative path operators + if h.starts_with("./") || h.starts_with("../") { + return true; + } + // No extension + contains path separator → likely a document path + if !h.contains('.') && h.contains('/') { + return true; + } + false +} + +// ==================== URL resolution ==================== + +/// Extract the vault root from a source URL. +/// +/// `obsidian://ar-playbook/some/path.md` → `obsidian://ar-playbook/` +pub fn vault_root(source_url: &str) -> Option<&str> { + let scheme_end = source_url.find("://")?; + let after_scheme = &source_url[scheme_end + 3..]; + let vault_sep = after_scheme.find('/')?; + Some(&source_url[..scheme_end + 3 + vault_sep + 1]) +} + +/// Directory portion of a source URL (everything up to and including the +/// last `/`). +fn source_dir(source_url: &str) -> &str { + if let Some(pos) = source_url.rfind('/') { + &source_url[..pos + 1] + } else { + source_url + } +} + +/// Resolve a relative href against the current atom's source URL, returning +/// candidate source URL strings (with and without `.md`) to try. +fn build_href_candidates(href: &str, current_source_url: &str) -> Vec { + let href = href.trim(); + let Some(root) = vault_root(current_source_url) else { + return vec![]; + }; + let dir = source_dir(current_source_url); + + // Strip any #fragment or ?query — the atom lookup matches on source_url, + // which stores the file path only, never the anchor. Without this, a + // markdown link like `[x](./foo.md#section)` would try to resolve to a + // non-existent atom with the fragment baked into its URL. + let href = href.split(['#', '?']).next().unwrap_or(href); + + let mut out: Vec = Vec::new(); + if let Some(rest) = href.strip_prefix("./") { + // Explicit current-dir: resolve relative to current dir only. + out.extend(candidates_with_and_without_extension(&format!("{}{}", dir, rest))); + } else if let Some(rest) = href.strip_prefix("../") { + // Parent-relative: resolve via the parent-walk helper. + out.extend(candidates_with_and_without_extension( + &resolve_parent(dir, rest, root), + )); + } else { + // Bare relative path (no leading `./` or `../`). Obsidian's default + // resolution tries the *current directory first*, falling back to + // the vault root. Generate both so a naked `glossary.md` written + // next to `onboarding.md` resolves to the sibling file, not to + // `/glossary.md`. Without this, valid same-folder links get + // false-positive-flagged as broken. + out.extend(candidates_with_and_without_extension(&format!("{}{}", dir, href))); + out.extend(candidates_with_and_without_extension(&format!("{}{}", root, href))); + } + out.dedup(); + out +} + +fn resolve_parent(current_dir: &str, rest: &str, vault_root: &str) -> String { + let dir = current_dir.trim_end_matches('/'); + let parent = dir + .rfind('/') + .map(|p| &dir[..p + 1]) + .unwrap_or(vault_root); + if let Some(rest) = rest.strip_prefix("../") { + resolve_parent(parent, rest, vault_root) + } else { + format!("{}{}", parent, rest) + } +} + +/// For a wikilink `[[Name]]`, build exact-URL candidates to try first. +/// Wikilinks resolve by filename anywhere in the vault, so we generate: +/// - `vault_root/Name.md` +/// - `vault_root/name.md` (lower-case stem) +/// - `vault_root/name-with-dashes.md` (slug variant) +/// +/// The `find_atoms_by_wikilink_name` SQL fallback handles subdirectory +/// resolution when none of these exact hits land. +fn build_wikilink_exact_candidates(name: &str, current_source_url: &str) -> Vec { + let Some(root) = vault_root(current_source_url) else { + return vec![]; + }; + let slug = name.to_lowercase().replace(' ', "-"); + let mut candidates = vec![ + format!("{}{}.md", root, name), + format!("{}{}.md", root, name.to_lowercase()), + format!("{}{}.md", root, slug), + ]; + candidates.dedup(); + candidates +} + +/// Extract a filename stem from a markdown href so callers can perform +/// a vault-wide fallback lookup when exact candidates miss. +/// +/// Example: `../processes/onboarding.md#anchor` → `onboarding`. +/// +/// Returns `None` for hrefs that don't look like bare filename references +/// (e.g. fragments, external URLs — these should not hit the fallback). +/// The stem is safe to hand to `find_atom_by_wikilink_name_sync`, which +/// does a SQL `LIKE '%/stem%.md'` search across the vault. +pub fn markdown_stem_fallback(href: &str) -> Option { + // Strip fragment + query + let cleaned = href + .split('#').next().unwrap_or("") + .split('?').next().unwrap_or("") + .trim(); + if cleaned.is_empty() { + return None; + } + // Last path segment + let last = cleaned.rsplit('/').next().unwrap_or(cleaned); + if last.is_empty() { + return None; + } + // Drop markdown extensions; only return a stem for things that look like + // markdown files (mirrors `looks_like_document`'s accepted list). + let lower = last.to_lowercase(); + let stem = if let Some(s) = lower.strip_suffix(".md") { + s.to_string() + } else if let Some(s) = lower.strip_suffix(".markdown") { + s.to_string() + } else if let Some(s) = lower.strip_suffix(".mdx") { + s.to_string() + } else { + // No extension — treat the whole segment as a stem. + // `looks_like_document` permits extensionless targets. + lower + }; + if stem.is_empty() { None } else { Some(stem) } +} + +/// Return the URL itself plus a variant without the `.md` extension (and +/// vice-versa), so callers can match atoms stored either way. +fn candidates_with_and_without_extension(url: &str) -> Vec { + if let Some(stem) = url.strip_suffix(".md") { + vec![url.to_string(), stem.to_string()] + } else { + vec![url.to_string(), format!("{}.md", url)] + } +} + +// ==================== Display-text extraction ==================== + +/// Scan backwards from the `]` at byte index `bracket_pos` to find the +/// Walk backwards from `bracket_pos` (the `]` before `(`) to find the opening +/// `[`, returning `(full_original_text, display_text)`. +/// +/// `end_pos` is the position of the closing `)` in `content`, so the full +/// original span `[display](href…)` can be reconstructed. +fn scan_back_for_display_text(content: &str, bracket_pos: usize, end_pos: usize) -> (String, String) { + let bytes = content.as_bytes(); + if bracket_pos == 0 { + // `]` is at position 0 — no room for `[display]`, reconstruct from end_pos. + let original = std::str::from_utf8(&bytes[..end_pos + 1]) + .unwrap_or("") + .to_string(); + return (original, String::new()); + } + + // Walk backwards through the content to find the opening `[` + let mut depth = 1usize; + let mut k = bracket_pos.saturating_sub(1); + loop { + match bytes[k] { + b']' => depth += 1, + b'[' => { + depth -= 1; + if depth == 0 { + break; + } + } + _ => {} + } + if k == 0 { + break; + } + k -= 1; + } + + // Full match spans from `[` at position `k` to the `)` at `end_pos`. + let display = std::str::from_utf8(&bytes[k + 1..bracket_pos]) + .unwrap_or("") + .to_string(); + let original = std::str::from_utf8(&bytes[k..end_pos + 1]) + .unwrap_or("") + .to_string(); + (original, display) +} + +// ==================== Replacement ==================== + +/// Apply resolved link replacements to `content`, returning the updated string. +/// +/// Each replacement: `(original_text, new_href)` — the display text is +/// preserved; only the href portion is changed. +pub fn apply_link_replacements(content: &str, replacements: &[ResolvedLink]) -> String { + let mut result = content.to_string(); + + for resolved in replacements { + // For markdown links: [text](old_href) → [text](atom://id) + // For wikilinks: [[Name]] → [Name](atom://id) + let original = &resolved.original; + let new_href = &resolved.replacement; + + if original.starts_with("[[") { + // Wikilink → markdown link with atom:// href + let inner = &original[2..original.len() - 2]; + let display = inner.split('|').next().unwrap_or(inner).trim(); + let replacement = format!("[{}]({})", display, new_href); + result = result.replacen(original.as_str(), &replacement, 1); + } else if let (Some(open), Some(_close)) = (original.find("]("), original.rfind(')')) { + // Markdown link → update only the href part + let display = &original[1..open]; + let replacement = format!("[{}]({})", display, new_href); + result = result.replacen(original.as_str(), &replacement, 1); + } + } + + result +} + +/// Reconstruct the full original markdown link text `[display](href)` for a +/// given href and its position in `content`, so we can build `InternalLink.original`. +/// +/// Called after extraction to fill in the `original` field that +/// `scan_back_for_display_text` could not complete. +pub fn build_original_text(display: &str, href: &str) -> String { + format!("[{}]({})", display, href) +} + +// ==================== Tests ==================== + +#[cfg(test)] +mod tests { + use super::*; + + const VAULT: &str = "obsidian://ar-playbook/"; + const SOURCE: &str = "obsidian://ar-playbook/processes/deployment.md"; + + #[test] + fn test_bare_href_tries_current_dir_first() { + // Bare `glossary.md` next to `onboarding.md` must resolve to the + // sibling file, not to `/glossary.md`. + let source = "obsidian://ar-playbook/references/onboarding.md"; + let candidates = build_href_candidates("glossary.md", source); + assert!( + candidates.contains(&"obsidian://ar-playbook/references/glossary.md".to_string()), + "expected dir-relative candidate, got {candidates:?}" + ); + // Keep the vault-root candidate as a fallback for cases where the + // source note uses Obsidian's root-relative convention. + assert!( + candidates.contains(&"obsidian://ar-playbook/glossary.md".to_string()), + "expected vault-root fallback candidate, got {candidates:?}" + ); + } + + #[test] + fn test_href_fragment_is_stripped() { + let candidates = build_href_candidates( + "../processes/work-tracking.md#estimation-approach", + SOURCE, + ); + assert!( + candidates.contains(&"obsidian://ar-playbook/processes/work-tracking.md".to_string()), + "fragment should be stripped before resolution, got {candidates:?}" + ); + } + + #[test] + fn test_relative_href_resolves_to_vault_root() { + let candidates = build_href_candidates("processes/work-tracking.md", SOURCE); + assert!(candidates.contains(&"obsidian://ar-playbook/processes/work-tracking.md".to_string())); + } + + #[test] + fn test_dotslash_href_resolves_relative_to_current_dir() { + let candidates = build_href_candidates("./capacity-planning.md", SOURCE); + assert!(candidates.contains( + &"obsidian://ar-playbook/processes/capacity-planning.md".to_string() + )); + } + + #[test] + fn test_parent_href_resolves_correctly() { + let candidates = build_href_candidates("../docs/overview.md", SOURCE); + assert!(candidates.contains(&"obsidian://ar-playbook/docs/overview.md".to_string())); + } + + #[test] + fn test_absolute_url_not_internal() { + assert!(!is_internal_href("https://example.com/file.md")); + assert!(!is_internal_href("http://example.com")); + assert!(!is_internal_href("obsidian://vault/path.md")); + assert!(!is_internal_href("atom://some-id")); + } + + #[test] + fn test_relative_path_is_internal() { + assert!(is_internal_href("processes/work-tracking.md")); + assert!(is_internal_href("./capacity.md")); + assert!(is_internal_href("../docs/overview.md")); + } + + #[test] + fn test_fragment_not_internal() { + assert!(!is_internal_href("#section-heading")); + assert!(!is_internal_href("")); + } + + #[test] + fn test_extract_markdown_links() { + let content = "See [Work Tracking](processes/work-tracking.md) and [Metrics](../docs/metrics.md)."; + let links = extract_internal_links(content, Some(SOURCE)); + assert_eq!(links.len(), 2); + let hrefs: Vec<&str> = links.iter().map(|l| l.href.as_str()).collect(); + assert!(hrefs.contains(&"processes/work-tracking.md")); + assert!(hrefs.contains(&"../docs/metrics.md")); + } + + #[test] + fn test_extract_wikilinks() { + let content = "See [[Work Tracking]] and [[Metrics|Metrics Docs]]."; + let links = extract_internal_links(content, Some(SOURCE)); + assert_eq!(links.len(), 2); + assert_eq!(links[0].href, "Work Tracking"); + assert_eq!(links[1].href, "Metrics"); + } + + #[test] + fn test_no_links_in_plain_text() { + let content = "No links here. Just text."; + let links = extract_internal_links(content, Some(SOURCE)); + assert!(links.is_empty()); + } + + #[test] + fn test_absolute_links_ignored() { + let content = "See [Confluence](https://atlassian.net/wiki/page) and [Source](obsidian://vault/file.md)."; + let links = extract_internal_links(content, Some(SOURCE)); + assert!(links.is_empty()); + } + + #[test] + fn test_vault_root_extraction() { + assert_eq!( + vault_root("obsidian://ar-playbook/processes/deployment.md"), + Some("obsidian://ar-playbook/") + ); + } + + #[test] + fn test_apply_markdown_replacement() { + let content = "See [Work Tracking](processes/work-tracking.md)."; + let resolved = vec![ResolvedLink { + original: "[Work Tracking](processes/work-tracking.md)".to_string(), + target_atom_id: "abc123".to_string(), + replacement: "atom://abc123".to_string(), + }]; + let result = apply_link_replacements(content, &resolved); + assert_eq!(result, "See [Work Tracking](atom://abc123)."); + } + + #[test] + fn test_apply_wikilink_replacement() { + let content = "See [[Work Tracking]] for details."; + let resolved = vec![ResolvedLink { + original: "[[Work Tracking]]".to_string(), + target_atom_id: "abc123".to_string(), + replacement: "atom://abc123".to_string(), + }]; + let result = apply_link_replacements(content, &resolved); + assert_eq!(result, "See [Work Tracking](atom://abc123) for details."); + } + #[test] + fn test_markdown_link_original_is_populated() { + // `original` must be the full `[text](href)` span so callers can use it + // for replace operations. Previously scan_back_for_display_text always + // returned String::new() for `original`, causing `content: ""` to be + // sent to the server and failing the relink validation. + let content = "See [broken link](./missing.md) for details."; + let links = extract_internal_links(content, Some("obsidian://vault/index.md")); + assert_eq!(links.len(), 1, "one broken link extracted"); + assert_eq!(links[0].original, "[broken link](./missing.md)"); + } + + #[test] + fn test_markdown_link_at_position_zero_has_original() { + // Edge case: link at the very start of content. + let content = "[start link](./page.md) is here."; + let links = extract_internal_links(content, Some("obsidian://vault/index.md")); + assert_eq!(links.len(), 1); + assert_eq!(links[0].original, "[start link](./page.md)"); + } + + #[test] + fn test_markdown_stem_fallback_basic() { + assert_eq!(markdown_stem_fallback("glossary.md"), Some("glossary".to_string())); + assert_eq!( + markdown_stem_fallback("../processes/onboarding.md"), + Some("onboarding".to_string()), + ); + assert_eq!( + markdown_stem_fallback("shared/notes.md#section"), + Some("notes".to_string()), + ); + } + + #[test] + fn test_markdown_stem_fallback_empty() { + assert_eq!(markdown_stem_fallback(""), None); + assert_eq!(markdown_stem_fallback("#only-fragment"), None); + } + + #[test] + fn test_markdown_stem_fallback_extensionless() { + // Obsidian permits bare names; treat the last segment as stem. + assert_eq!(markdown_stem_fallback("glossary"), Some("glossary".to_string())); + assert_eq!(markdown_stem_fallback("shared/glossary"), Some("glossary".to_string())); + } + +} diff --git a/crates/atomic-core/src/health/llm_fixes.rs b/crates/atomic-core/src/health/llm_fixes.rs new file mode 100644 index 00000000..b29691c1 --- /dev/null +++ b/crates/atomic-core/src/health/llm_fixes.rs @@ -0,0 +1,1350 @@ +//! LLM-powered fix implementations. +//! +//! These fixes call the configured LLM provider to make judgment calls that +//! deterministic SQL cannot. All are logged for undo. +//! +//! Currently implemented: +//! - `fix_untagged_atoms` — re-run tagging pipeline on zero-tag complete atoms. +//! - `merge_duplicate_pair` — synthesise two high-similarity atoms into one. +//! - `verify_overlap_pair` — ask LLM if a flagged pair is a true duplicate. +//! - `verify_contradiction_pair` — ask LLM if a flagged pair truly contradicts. +//! - `merge_contradicting_pair` — LLM-reconcile two contradicting atoms into one. +use crate::error::AtomicCoreError; +use crate::health::{audit, FixAction}; +use crate::providers::{create_llm_provider, ProviderConfig}; + +use crate::providers::{LlmConfig}; +use crate::providers::types::Message; +use crate::AtomicCore; +use serde_json::json; + +// ==================== User-tunable prompt instructions ==================== +// +// Each health LLM fix sends a message with two pieces: an *instruction* +// (what to do, how to format) and a *data block* (the atom content under +// analysis). Only the instruction is user-tunable — the data block is +// assembled in code so placeholders can't be mis-spelled or elided. +// +// Overrides are read from the per-DB `settings` table under the keys +// below. An empty or missing value falls back to the builtin default. + +const MERGE_DUPLICATES_SETTING_KEY: &str = "health.merge_duplicates_prompt"; +const CONTRADICTION_DETECTION_SETTING_KEY: &str = "health.contradiction_detection_prompt"; +const STRIP_BOILERPLATE_SETTING_KEY: &str = "health.strip_boilerplate_prompt"; + +const DEFAULT_MERGE_DUPLICATES_INSTRUCTION: &str = "You are merging two duplicate knowledge base atoms into one definitive version.\n\n\ + Rules:\n\ + - Combine all unique information from both atoms into one coherent document\n\ + - If they contradict each other, prefer the more recent source\n\ + - Preserve all actionable details (URLs, commands, config values)\n\ + - Use clean markdown with proper headings\n\ + - Add a '## Sources' section at the bottom listing both original source URLs\n\ + - Do not add commentary — just produce the merged document\n\n\ + Output the merged markdown only."; + +const DEFAULT_CONTRADICTION_DETECTION_INSTRUCTION: &str = "Two knowledge base atoms may contradict each other. Write ONE sentence \ + (<= 25 words) describing what they disagree about. If they don't disagree, \ + reply exactly: NO_CONFLICT."; + +const DEFAULT_STRIP_BOILERPLATE_INSTRUCTION: &str = "You are editing a knowledge base note. The note may contain boilerplate template \ + sections (headers, field labels, empty placeholders) that are not unique to this topic. \ + Remove all boilerplate; keep only the content that is specific to this note's subject. \ + Preserve all factual information. If the whole note is boilerplate, reply exactly: EMPTY. \ + Do not add commentary."; + +/// Resolve a per-DB prompt override from the settings map, falling back to +/// the builtin default. Empty strings are treated as "not set" so a user +/// who clears the setting gets the default back. +fn resolve_prompt<'a>(settings: &'a std::collections::HashMap, key: &str, default: &'a str) -> &'a str { + settings + .get(key) + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + .unwrap_or(default) +} + + +/// Strip common LLM-response wrappers (markdown code fences, leading/trailing +/// whitespace) and return the JSON-candidate substring. +/// +/// Handles: ```json\n{...}\n```, ```\n{...}\n```, plain {...}, and responses +/// with leading/trailing prose by extracting the outermost {...} or [...] block. +pub(crate) fn strip_llm_json_fences(raw: &str) -> &str { + let s = raw.trim(); + // Strip ```json or ``` fences. + let s = s + .strip_prefix("```json") + .or_else(|| s.strip_prefix("```JSON")) + .or_else(|| s.strip_prefix("```")) + .map(|x| x.trim_start()) + .unwrap_or(s); + let s = s.strip_suffix("```").map(|x| x.trim_end()).unwrap_or(s); + let s = s.trim(); + // If there is still leading/trailing prose, extract the outermost JSON + // object or array. + if s.starts_with('{') || s.starts_with('[') { + return s; + } + let start_obj = s.find('{'); + let start_arr = s.find('['); + let start = match (start_obj, start_arr) { + (Some(o), Some(a)) => Some(o.min(a)), + (Some(o), None) => Some(o), + (None, Some(a)) => Some(a), + (None, None) => None, + }; + if let Some(start) = start { + let end_obj = s.rfind('}'); + let end_arr = s.rfind(']'); + let end = match (end_obj, end_arr) { + (Some(o), Some(a)) => Some(o.max(a)), + (Some(o), None) => Some(o), + (None, Some(a)) => Some(a), + (None, None) => None, + }; + if let Some(end) = end { + if end >= start { + return &s[start..=end]; + } + } + } + s +} + +/// Re-run the tagging pipeline on atoms that completed tagging but got 0 tags. +pub async fn fix_untagged_complete_atoms( + core: &AtomicCore, + untagged_ids: &[String], + dry_run: bool, +) -> Result, AtomicCoreError> { + if untagged_ids.is_empty() { + return Ok(None); + } + + let count = untagged_ids.len() as i32; + + let id = if dry_run { + "dry_run".to_string() + } else { + // Reset tagging status to pending so the pipeline picks them up + for atom_id in untagged_ids { + let _ = core + .storage() + .set_tagging_status_sync(atom_id, "pending", None) + .await; + } + + // Trigger the tagging pipeline + let processed = core.process_pending_tagging(|_| {}).await.unwrap_or(0); + tracing::info!(count, processed, "llm_fixes: re-queued untagged atoms for tagging"); + + audit::log_fix( + core, + "tagging_coverage", + "requeued_untagged_for_tagging", + "low", + Some(untagged_ids), + None, + json!({"atom_ids": untagged_ids}), + json!({"requeued": count, "processed": processed}), + None, + None, + ) + .await? + }; + + Ok(Some(FixAction { + id, + check: "tagging_coverage".to_string(), + action: "requeued_untagged_for_tagging".to_string(), + count, + details: untagged_ids.iter().take(10).cloned().collect(), + })) +} + +/// Merge two highly-similar atoms using the LLM. +/// +/// The LLM synthesises both atoms into one coherent document, then: +/// 1. Updates the newer atom with the merged content. +/// 2. Deletes the older atom. +/// 3. Re-queues the merged atom for embedding + tagging. +/// +/// This is a High-tier action and must be explicitly requested via +/// `POST /api/health/fix/{check}/{item_id}`. +pub async fn merge_duplicate_pair( + core: &AtomicCore, + atom_a_id: &str, + atom_b_id: &str, + dry_run: bool, +) -> Result, AtomicCoreError> { + let Some(atom_a) = core.get_atom(atom_a_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_a_id} not found"))); + }; + let Some(atom_b) = core.get_atom(atom_b_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_b_id} not found"))); + }; + + // Determine which is newer (keep) and which is older (delete) + let (keep, delete) = if atom_a.atom.updated_at >= atom_b.atom.updated_at { + (atom_a, atom_b) + } else { + (atom_b, atom_a) + }; + + let settings = core.get_settings_map().await.unwrap_or_default(); + let instruction = resolve_prompt( + &settings, + MERGE_DUPLICATES_SETTING_KEY, + DEFAULT_MERGE_DUPLICATES_INSTRUCTION, + ); + let merge_prompt = format!( + "{instruction}\n\n\ + ATOM A (source: {source_a}, created: {date_a}):\n{content_a}\n\n\ + ATOM B (source: {source_b}, created: {date_b}):\n{content_b}", + instruction = instruction, + source_a = keep.atom.source_url.as_deref().unwrap_or("manual"), + date_a = keep.atom.created_at, + content_a = &keep.atom.content, + source_b = delete.atom.source_url.as_deref().unwrap_or("manual"), + date_b = delete.atom.created_at, + content_b = &delete.atom.content, + ); + + if dry_run { + return Ok(Some(FixAction { + id: "dry_run".to_string(), + check: "duplicate_detection".to_string(), + action: "merge_with_llm".to_string(), + count: 1, + details: vec![format!("Would merge {} into {}", delete.atom.id, keep.atom.id)], + })); + } + + // Get LLM provider + let provider_config = ProviderConfig::from_settings(&settings); + let llm = create_llm_provider(&provider_config).map_err(|e| { + AtomicCoreError::Configuration(format!("LLM provider unavailable for merge: {e}")) + })?; + + let model = settings + .get("wiki_model") + .cloned() + .unwrap_or_else(|| "anthropic/claude-sonnet-4.6".to_string()); + + let messages = vec![Message::user(merge_prompt.clone())]; + let config = LlmConfig::new(model).with_params( + crate::providers::types::GenerationParams::new().with_max_tokens(4096), + ); + + let response = llm.complete(&messages, &config).await?; + let merged_content = response.content.clone(); + + if merged_content.is_empty() { + return Err(AtomicCoreError::Validation( + "LLM returned empty merged content".to_string(), + )); + } + + // Capture before state + let before_state = json!([ + { + "id": keep.atom.id, + "content": keep.atom.content, + "source_url": keep.atom.source_url, + "tag_ids": keep.tags.iter().map(|t| t.id.clone()).collect::>() + }, + { + "id": delete.atom.id, + "content": delete.atom.content, + "source_url": delete.atom.source_url, + "tag_ids": delete.tags.iter().map(|t| t.id.clone()).collect::>() + } + ]); + + // Merge tags from deleted atom into keeper + let delete_tag_ids: Vec = delete.tags.iter().map(|t| t.id.clone()).collect(); + if !delete_tag_ids.is_empty() { + let _ = core + .storage() + .link_tags_to_atom_impl(&keep.atom.id, &delete_tag_ids) + .await; + } + + // Update the keeper with merged content + let upd = crate::UpdateAtomRequest { + content: merged_content.clone(), + source_url: keep.atom.source_url.clone(), + published_at: None, + tag_ids: None, + }; + core.update_atom(&keep.atom.id, upd, |_| {}).await?; + + // Delete the older atom + core.delete_atom(&delete.atom.id).await?; + + let fix_id = audit::log_fix( + core, + "duplicate_detection", + "merge_with_llm", + "high", + Some(&[keep.atom.id.clone(), delete.atom.id.clone()]), + None, + before_state, + json!({ + "kept_id": keep.atom.id, + "deleted_id": delete.atom.id, + "merged_content_length": merged_content.len() + }), + Some(&merge_prompt), + Some(&merged_content), + ) + .await?; + + tracing::info!( + kept = %keep.atom.id, + deleted = %delete.atom.id, + "duplicate pair merged with LLM" + ); + + Ok(Some(FixAction { + id: fix_id, + check: "duplicate_detection".to_string(), + action: "merge_with_llm".to_string(), + count: 1, + details: vec![ + format!("Kept: {}", keep.atom.id), + format!("Deleted: {}", delete.atom.id), + ], + })) +} + + +/// Apply a user-edited merge. Caller provides final content; no LLM call. +/// Deletes the loser atom, merges tags into winner, updates winner content. +pub async fn apply_edited_merge( + core: &AtomicCore, + winner_id: &str, + loser_id: &str, + content: &str, +) -> Result { + let Some(winner) = core.get_atom(winner_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {winner_id} not found"))); + }; + let Some(loser) = core.get_atom(loser_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {loser_id} not found"))); + }; + if content.trim().is_empty() { + return Err(AtomicCoreError::Validation("edited content empty".into())); + } + + let before_state = json!([ + { "id": winner.atom.id, "content": winner.atom.content, "source_url": winner.atom.source_url, "tag_ids": winner.tags.iter().map(|t| t.id.clone()).collect::>() }, + { "id": loser.atom.id, "content": loser.atom.content, "source_url": loser.atom.source_url, "tag_ids": loser.tags.iter().map(|t| t.id.clone()).collect::>() }, + ]); + + let loser_tag_ids: Vec = loser.tags.iter().map(|t| t.id.clone()).collect(); + if !loser_tag_ids.is_empty() { + let _ = core.storage().link_tags_to_atom_impl(&winner.atom.id, &loser_tag_ids).await; + } + + let upd = crate::UpdateAtomRequest { + content: content.to_string(), + source_url: winner.atom.source_url.clone(), + published_at: None, + tag_ids: None, + }; + core.update_atom(&winner.atom.id, upd, |_| {}).await?; + core.delete_atom(&loser.atom.id).await?; + + let fix_id = audit::log_fix( + core, + "content_overlap", + "merge_with_edited_content", + "high", + Some(&[winner.atom.id.clone(), loser.atom.id.clone()]), + None, + before_state, + json!({ "kept_id": winner.atom.id, "deleted_id": loser.atom.id, "content_length": content.len() }), + None, + None, + ).await?; + + Ok(FixAction { + id: fix_id, + check: "content_overlap".to_string(), + action: "merge_with_edited_content".to_string(), + count: 1, + details: vec![format!("Kept: {}", winner.atom.id), format!("Deleted: {}", loser.atom.id)], + }) +} + +/// Ask the LLM to summarise the conflict between two atoms in one sentence. +pub async fn contradiction_summary( + core: &AtomicCore, + atom_a_id: &str, + atom_b_id: &str, +) -> Result { + let Some(a) = core.get_atom(atom_a_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_a_id} not found"))); + }; + let Some(b) = core.get_atom(atom_b_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_b_id} not found"))); + }; + let settings = core.get_settings_map().await.unwrap_or_default(); + let instruction = resolve_prompt( + &settings, + CONTRADICTION_DETECTION_SETTING_KEY, + DEFAULT_CONTRADICTION_DETECTION_INSTRUCTION, + ); + let prompt = format!( + "{instruction}\n\n\ + ATOM A:\n{}\n\n\ + ATOM B:\n{}\n\n\ + One-sentence summary:", + a.atom.content, b.atom.content, + instruction = instruction, + ); + let provider_config = ProviderConfig::from_settings(&settings); + let llm = create_llm_provider(&provider_config).map_err(|e| { + AtomicCoreError::Configuration(format!("LLM provider unavailable: {e}")) + })?; + let model = settings.get("chat_model").cloned() + .or_else(|| settings.get("wiki_model").cloned()) + .unwrap_or_else(|| "anthropic/claude-sonnet-4.6".to_string()); + let messages = vec![Message::user(prompt)]; + let config = LlmConfig::new(model).with_params( + crate::providers::types::GenerationParams::new().with_max_tokens(128), + ); + let response = llm.complete(&messages, &config).await?; + Ok(response.content.trim().to_string()) +} + +/// Ask the LLM to strip template boilerplate from an atom, keeping only unique content. +/// Returns the rewritten content. When dry_run=true, no writes happen. +pub async fn strip_boilerplate_atom( + core: &AtomicCore, + atom_id: &str, + dry_run: bool, +) -> Result<(String, Option), AtomicCoreError> { + let Some(atom) = core.get_atom(atom_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_id} not found"))); + }; + if atom.atom.is_locked { + return Err(AtomicCoreError::Validation(format!( + "atom {atom_id} is locked — unlock it before running automated fixes" + ))); + } + let settings = core.get_settings_map().await.unwrap_or_default(); + let instruction = resolve_prompt( + &settings, + STRIP_BOILERPLATE_SETTING_KEY, + DEFAULT_STRIP_BOILERPLATE_INSTRUCTION, + ); + let prompt = format!( + "{instruction}\n\n\ + NOTE:\n{content}\n\n\ + Rewritten note:", + instruction = instruction, + content = atom.atom.content, + ); + let provider_config = ProviderConfig::from_settings(&settings); + let llm = create_llm_provider(&provider_config).map_err(|e| { + AtomicCoreError::Configuration(format!("LLM provider unavailable: {e}")) + })?; + let model = settings + .get("wiki_model") + .cloned() + .unwrap_or_else(|| "anthropic/claude-sonnet-4.6".to_string()); + let messages = vec![Message::user(prompt.clone())]; + let config = LlmConfig::new(model).with_params( + crate::providers::types::GenerationParams::new().with_max_tokens(4096), + ); + let response = llm.complete(&messages, &config).await?; + let new_content = response.content.trim().to_string(); + + if new_content == "EMPTY" { + return Err(AtomicCoreError::Validation( + "LLM reports atom is entirely boilerplate; refusing to clear it".into(), + )); + } + if new_content.is_empty() { + return Err(AtomicCoreError::Validation("LLM returned empty content".into())); + } + + if dry_run { + return Ok((new_content, None)); + } + + let before_state = json!({ + "id": atom.atom.id, + "content": atom.atom.content, + "source_url": atom.atom.source_url, + }); + let upd = crate::UpdateAtomRequest { + content: new_content.clone(), + source_url: atom.atom.source_url.clone(), + published_at: None, + tag_ids: None, + }; + core.update_atom(&atom.atom.id, upd, |_| {}).await?; + + let fix_id = audit::log_fix( + core, + "boilerplate_pollution", + "strip_boilerplate", + "medium", + Some(std::slice::from_ref(&atom.atom.id)), + None, + before_state, + json!({"new_length": new_content.len()}), + Some(&prompt), + Some(&new_content), + ) + .await?; + + Ok(( + new_content.clone(), + Some(FixAction { + id: fix_id, + check: "boilerplate_pollution".to_string(), + action: "strip_boilerplate".to_string(), + count: 1, + details: vec![format!("Stripped boilerplate from {}", atom.atom.id)], + }), + )) +} + +// ==================== Broken-link auto-resolution ==================== + +/// The outcome of an LLM-powered broken-link resolution attempt. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(tag = "outcome", rename_all = "snake_case")] +pub enum AutoResolveOutcome { + /// The link was rewritten to point at an existing atom. + Relinked { + target_atom_id: String, + title: String, + confidence: f32, + }, + /// The link was stripped because no suitable target was found. + Removed { reason: String }, + /// The LLM was uncertain — link left unchanged. + Skipped { reason: String }, +} + +/// Batch result returned by `auto_resolve_all_broken_links`. +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct AutoResolveBatchResult { + pub checked: u32, + pub relinked: u32, + pub removed: u32, + pub skipped: u32, + pub per_atom: Vec<(String, String, AutoResolveOutcome)>, +} + +/// Fetch up to `limit` candidate atoms for a broken link query. +/// Extract the display text from a markdown link `[text](href)` or wikilink `[[name]]`. +/// Returns an empty string when the format is unrecognised. +fn extract_link_display_text(original: &str) -> String { + if let Some(close) = original.find("](") +{ + if original.starts_with('[') { + return original[1..close].to_string(); + } + } + if let Some(inner) = original.strip_prefix("[[").and_then(|s| s.strip_suffix("]]")) { + return inner.split('|').next().unwrap_or(inner).trim().to_string(); + } + String::new() +} + +/// Extract the *target* (href or wikilink name) from a raw link string. +/// +/// Used as the fuzzy-search query for broken-link auto-resolution: passing the +/// whole raw string (`"[Glossary](glossary.md)"`) to a LIKE search never +/// matches; passing the extracted target (`"glossary.md"` or `"Glossary"`) +/// does. For markdown links the anchor/fragment/query is stripped too. +/// Returns an empty string when the input isn't a recognised link form. +fn extract_link_target(original: &str) -> String { + // Markdown: [text](href) + if let (Some(open), Some(close)) = (original.find("]("), original.rfind(')')) { + if original.starts_with('[') && close > open { + let href = &original[open + 2..close]; + return href.split(['#', '?']).next().unwrap_or(href).trim().to_string(); + } + } + // Wikilink: [[name|alias]] — name comes before the pipe. + if let Some(inner) = original.strip_prefix("[[").and_then(|s| s.strip_suffix("]]")) { + return inner.split('|').next().unwrap_or(inner).trim().to_string(); + } + String::new() +} + +/// Returns `(atom_id, title, source_url, score)`. +pub(crate) async fn suggest_link_targets( + core: &AtomicCore, + q: &str, + limit: i32, +) -> Result, f32)>, AtomicCoreError> { + core.storage() + .suggest_atoms_by_query_sync(q.to_string(), limit) + .await +} + +/// Ask the LLM which (if any) candidate is the true target for a broken link, +/// then relink or remove the link accordingly. +pub async fn auto_resolve_broken_link( + core: &AtomicCore, + atom_id: &str, + link_raw: &str, + link_text: &str, +) -> Result { + // Locked atoms are not auto-rewritten. Skip without error so batch flows + // can continue past them. + if core.is_atom_locked(atom_id).await.unwrap_or(false) { + return Ok(AutoResolveOutcome::Skipped { + reason: "atom is locked".to_string(), + }); + } + + // Build candidate list by searching on the extracted link target (href or + // wikilink name) rather than the raw link string. Passing the whole raw + // string (`"[Glossary](glossary.md)"`) to the LIKE-based suggest never + // matches — that was the source of the "no candidates → link removed" + // regression. If searching by target yields nothing, fall back to the + // display text (`"Glossary"`), which often matches the H1 of the intended + // atom even when the path is stale. + let target = extract_link_target(link_raw); + let display = extract_link_display_text(link_raw); + let mut candidates = if !target.is_empty() { + suggest_link_targets(core, &target, 8).await? + } else { + Vec::new() + }; + if candidates.is_empty() && !display.is_empty() && display != target { + candidates = suggest_link_targets(core, &display, 8).await?; + } + if candidates.is_empty() { + // No candidates found by fuzzy search. Previously this branch quietly + // stripped the link — destructive, and dangerous when the cause is a + // *missing atom* (the link's true target was deleted/renamed). Users + // saw "auto-fix concluded with no matches" and found links had been + // silently removed. Now we skip and surface the reason so the user can + // manually repair or dismiss. The explicit "Remove" button remains + // the escape hatch for the destructive path. + let reason = format!( + "no candidates matched target {:?} or display {:?}", + target, display, + ); + let outcome = AutoResolveOutcome::Skipped { reason: reason.clone() }; + audit::log_fix( + core, + "broken_internal_links", + "auto_resolve_skipped", + "low", + Some(&[atom_id.to_string()]), + None, + json!({"atom_id": atom_id, "link_raw": link_raw}), + json!({"outcome": "skipped", "reason": reason}), + None, + None, + ) + .await?; + return Ok(outcome); + } + + // Build candidate list for the prompt. + let candidate_lines: String = candidates + .iter() + .enumerate() + .map(|(i, (id, title, source_url, _score))| { + let src = source_url.as_deref().unwrap_or(""); + format!("{n}. id={id} title={title} source={src}", n = i + 1, id = id, title = title, src = src) + }) + .collect::>() + .join("\n"); + + let prompt = format!( + "You are resolving a broken markdown link. The source note references a target that no \ +longer resolves. Choose the best candidate from the list below OR say NONE if none match.\n\n\ +Link text: `{link_text}`\n\ +Link href: `{link_raw}`\n\n\ +Candidates:\n{candidates}\n\n\ +Output JSON only (no markdown fences): {{\"target_atom_id\": \"\" or null, \"confidence\": 0..1, \"reason\": \"...\"}}.", + link_text = link_text, + link_raw = link_raw, + candidates = candidate_lines, + ); + + let settings = core.get_settings_map().await.unwrap_or_default(); + let provider_config = ProviderConfig::from_settings(&settings); + let llm = create_llm_provider(&provider_config).map_err(|e| { + AtomicCoreError::Configuration(format!("LLM provider unavailable for auto_resolve: {e}")) + })?; + let model = settings + .get("wiki_model") + .cloned() + .unwrap_or_else(|| "anthropic/claude-sonnet-4.6".to_string()); + let messages = vec![Message::user(prompt.clone())]; + let config = LlmConfig::new(model).with_params( + crate::providers::types::GenerationParams::new().with_max_tokens(512), + ); + + let response = llm.complete(&messages, &config).await?; + let raw = response.content.trim().to_string(); + + let json_str = strip_llm_json_fences(&raw); + + #[derive(serde::Deserialize)] + struct LlmAnswer { + target_atom_id: Option, + confidence: f32, + reason: String, + } + + let answer: LlmAnswer = serde_json::from_str(json_str).map_err(|e| { + AtomicCoreError::Validation(format!("auto_resolve_broken_link: failed to parse LLM JSON: {e} — got: {json_str}")) + })?; + + let outcome = if let Some(ref target_id) = answer.target_atom_id { + if answer.confidence >= 0.6 { + // Relink. + let _ = super::fixes::relink_broken_link(core, atom_id, link_raw, target_id).await; + let title = candidates + .iter() + .find(|(id, _, _, _)| id == target_id) + .map(|(_, t, _, _)| t.clone()) + .unwrap_or_else(|| target_id.clone()); + AutoResolveOutcome::Relinked { + target_atom_id: target_id.clone(), + title, + confidence: answer.confidence, + } + } else { + AutoResolveOutcome::Skipped { + reason: format!("low confidence {:.2}: {}", answer.confidence, answer.reason), + } + } + } else { + AutoResolveOutcome::Skipped { + reason: format!("LLM returned null target: {}", answer.reason), + } + }; + + audit::log_fix( + core, + "broken_internal_links", + "auto_resolve", + "medium", + Some(&[atom_id.to_string()]), + None, + json!({"atom_id": atom_id, "link_raw": link_raw}), + serde_json::to_value(&outcome).unwrap_or_default(), + Some(&prompt), + Some(&raw), + ) + .await?; + + Ok(outcome) +} + +/// Resolve up to `max` broken (atom, link) pairs using the LLM. +pub async fn auto_resolve_all_broken_links( + core: &AtomicCore, + max: usize, +) -> Result { + use crate::health::link_resolution::{extract_internal_links}; + + let candidates = core.storage().get_link_candidate_atoms_sync().await?; + + // Build a flat list of (atom_id, content, source_url, link_raw, link_text) pairs. + let mut pairs: Vec<(String, String, String)> = Vec::new(); // (atom_id, link_raw, link_text) + + 'outer: for (atom_id, content, source_url) in &candidates { + let links = extract_internal_links(content, source_url.as_deref()); + for link in &links { + if pairs.len() >= max { + break 'outer; + } + // Only include unresolved links. + let candidate_urls: Vec = link.candidate_source_urls.to_vec(); + let url_map = core + .storage() + .find_atoms_by_source_urls_sync(candidate_urls) + .await + .unwrap_or_default(); + if url_map.is_empty() { + let link_text = extract_link_display_text(&link.original); + pairs.push((atom_id.clone(), link.original.clone(), link_text)); + } + } + } + + let checked = pairs.len() as u32; + let mut relinked = 0u32; + let mut removed = 0u32; + let mut skipped = 0u32; + let mut per_atom: Vec<(String, String, AutoResolveOutcome)> = Vec::new(); + + for (atom_id, link_raw, link_text) in pairs { + let outcome = auto_resolve_broken_link(core, &atom_id, &link_raw, &link_text).await?; + match &outcome { + AutoResolveOutcome::Relinked { .. } => relinked += 1, + AutoResolveOutcome::Removed { .. } => removed += 1, + AutoResolveOutcome::Skipped { .. } => skipped += 1, + } + per_atom.push((atom_id, link_raw, outcome)); + } + + Ok(AutoResolveBatchResult { + checked, + relinked, + removed, + skipped, + per_atom, + }) +} + +/// Ask the LLM whether two atoms flagged as duplicates are a true semantic duplicate +/// or a false positive. Returns `(is_duplicate, reason)`. +/// +/// On false-positive the pair is dismissed under `content_overlap` and the decision +/// is logged via `audit::log_fix`. +pub async fn verify_overlap_pair( + core: &AtomicCore, + atom_a_id: &str, + atom_b_id: &str, +) -> Result<(bool, String), AtomicCoreError> { + let Some(a) = core.get_atom(atom_a_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_a_id} not found"))); + }; + let Some(b) = core.get_atom(atom_b_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_b_id} not found"))); + }; + + let prompt = format!( + "Below are two knowledge-base notes flagged as possible duplicates. \ +Determine whether they cover substantially the same subject or are coincidentally \ +similar (different topics, similar vocabulary). Reply with STRICT JSON: \ +{{\"duplicate\": true|false, \"reason\": \"one short sentence\"}}. Nothing else.\n\n\ +ATOM A (source: {source_a}, created: {date_a}):\n{content_a}\n\n\ +ATOM B (source: {source_b}, created: {date_b}):\n{content_b}", + source_a = a.atom.source_url.as_deref().unwrap_or("manual"), + date_a = a.atom.created_at, + content_a = a.atom.content, + source_b = b.atom.source_url.as_deref().unwrap_or("manual"), + date_b = b.atom.created_at, + content_b = b.atom.content, + ); + + let settings = core.get_settings_map().await.unwrap_or_default(); + let provider_config = ProviderConfig::from_settings(&settings); + let llm = create_llm_provider(&provider_config).map_err(|e| { + AtomicCoreError::Configuration(format!("LLM provider unavailable: {e}")) + })?; + let model = settings + .get("wiki_model") + .cloned() + .unwrap_or_else(|| "anthropic/claude-sonnet-4.6".to_string()); + let messages = vec![Message::user(prompt.clone())]; + let config = LlmConfig::new(model).with_params( + crate::providers::types::GenerationParams::new().with_max_tokens(256), + ); + let response = llm.complete(&messages, &config).await?; + let raw = response.content.trim(); + + #[derive(serde::Deserialize)] + struct VerifyOverlapResp { duplicate: bool, reason: String } + let parsed: VerifyOverlapResp = serde_json::from_str(strip_llm_json_fences(raw)).map_err(|e| { + AtomicCoreError::Validation(format!("LLM response parse error: {e} — raw: {raw}")) + })?; + + if !parsed.duplicate { + let key = crate::health::pair_key(atom_a_id, atom_b_id); + let _ = core + .dismiss_health_item("content_overlap", &key, "llm_false_positive", None) + .await; + let _ = audit::log_fix( + core, + "content_overlap", + "verify_with_llm", + "low", + Some(&[atom_a_id.to_string(), atom_b_id.to_string()]), + None, + json!({"atom_a": atom_a_id, "atom_b": atom_b_id}), + json!({"is_duplicate": false, "reason": parsed.reason.clone()}), + Some(&prompt), + Some(raw), + ) + .await; + } + + Ok((parsed.duplicate, parsed.reason)) +} + +/// Ask the LLM whether two atoms flagged as contradicting actually assert conflicting +/// facts. Returns `(is_real, reason)`. +/// +/// On not-real: dismisses under `contradiction_detection` with reason `llm_false_positive`. +pub async fn verify_contradiction_pair( + core: &AtomicCore, + atom_a_id: &str, + atom_b_id: &str, +) -> Result<(bool, String), AtomicCoreError> { + let Some(a) = core.get_atom(atom_a_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_a_id} not found"))); + }; + let Some(b) = core.get_atom(atom_b_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_b_id} not found"))); + }; + + let prompt = format!( + "Two knowledge base atoms have been flagged as possibly contradicting each other. \ +Reply with STRICT JSON: {{\"contradiction\": true|false, \"reason\": \"one short sentence\"}}. \ +Contradiction means the two atoms assert directly conflicting facts about the same subject. \ +Nothing else.\n\n\ +ATOM A (source: {source_a}):\n{content_a}\n\n\ +ATOM B (source: {source_b}):\n{content_b}", + source_a = a.atom.source_url.as_deref().unwrap_or("manual"), + content_a = a.atom.content, + source_b = b.atom.source_url.as_deref().unwrap_or("manual"), + content_b = b.atom.content, + ); + + let settings = core.get_settings_map().await.unwrap_or_default(); + let provider_config = ProviderConfig::from_settings(&settings); + let llm = create_llm_provider(&provider_config).map_err(|e| { + AtomicCoreError::Configuration(format!("LLM provider unavailable: {e}")) + })?; + let model = settings + .get("wiki_model") + .cloned() + .unwrap_or_else(|| "anthropic/claude-sonnet-4.6".to_string()); + let messages = vec![Message::user(prompt.clone())]; + let config = LlmConfig::new(model).with_params( + crate::providers::types::GenerationParams::new().with_max_tokens(256), + ); + let response = llm.complete(&messages, &config).await?; + let raw = response.content.trim(); + + #[derive(serde::Deserialize)] + struct VerifyContradictionResp { contradiction: bool, reason: String } + let parsed: VerifyContradictionResp = serde_json::from_str(strip_llm_json_fences(raw)).map_err(|e| { + AtomicCoreError::Validation(format!("LLM response parse error: {e} — raw: {raw}")) + })?; + + if !parsed.contradiction { + let key = crate::health::pair_key(atom_a_id, atom_b_id); + let _ = core + .dismiss_health_item("contradiction_detection", &key, "llm_false_positive", None) + .await; + let _ = audit::log_fix( + core, + "contradiction_detection", + "verify_with_llm", + "low", + Some(&[atom_a_id.to_string(), atom_b_id.to_string()]), + None, + json!({"atom_a": atom_a_id, "atom_b": atom_b_id}), + json!({"is_contradiction": false, "reason": parsed.reason.clone()}), + Some(&prompt), + Some(raw), + ) + .await; + } + + Ok((parsed.contradiction, parsed.reason)) +} + +/// LLM-reconcile two contradicting atoms into one document that acknowledges the +/// disagreement, records both positions, and prefers the more recent/authoritative +/// source where clear. +/// +/// Writes merged content to the newer atom, deletes the older atom, and dismisses +/// the pair under `contradiction_detection`. +pub async fn merge_contradicting_pair( + core: &AtomicCore, + atom_a_id: &str, + atom_b_id: &str, + dry_run: bool, +) -> Result, AtomicCoreError> { + let Some(atom_a) = core.get_atom(atom_a_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_a_id} not found"))); + }; + let Some(atom_b) = core.get_atom(atom_b_id).await? else { + return Err(AtomicCoreError::NotFound(format!("atom {atom_b_id} not found"))); + }; + if atom_a.atom.is_locked || atom_b.atom.is_locked { + return Err(AtomicCoreError::Validation( + "one or both atoms are locked — unlock before auto-merging. Contradictions in locked source material should stay recorded as-is.".to_string() + )); + } + + // Newer atom is the keeper + let (keep, delete) = if atom_a.atom.updated_at >= atom_b.atom.updated_at { + (atom_a, atom_b) + } else { + (atom_b, atom_a) + }; + + let merge_prompt = format!( + "The two atoms contradict each other. Produce ONE reconciled document that \ +acknowledges the disagreement and records both positions with attribution (date + source). \ +Prefer the more recent/authoritative source where clear. Use clean markdown. \ +Do not add commentary beyond the reconciled document.\n\n\ +ATOM A (source: {source_a}, created: {date_a}):\n{content_a}\n\n\ +ATOM B (source: {source_b}, created: {date_b}):\n{content_b}\n\n\ +Reconciled document:", + source_a = keep.atom.source_url.as_deref().unwrap_or("manual"), + date_a = keep.atom.created_at, + content_a = keep.atom.content, + source_b = delete.atom.source_url.as_deref().unwrap_or("manual"), + date_b = delete.atom.created_at, + content_b = delete.atom.content, + ); + + if dry_run { + return Ok(Some(FixAction { + id: "dry_run".to_string(), + check: "contradiction_detection".to_string(), + action: "merge_with_llm".to_string(), + count: 1, + details: vec![format!("Would merge {} into {}", delete.atom.id, keep.atom.id)], + })); + } + + let settings = core.get_settings_map().await.unwrap_or_default(); + let provider_config = ProviderConfig::from_settings(&settings); + let llm = create_llm_provider(&provider_config).map_err(|e| { + AtomicCoreError::Configuration(format!("LLM provider unavailable: {e}")) + })?; + let model = settings + .get("wiki_model") + .cloned() + .unwrap_or_else(|| "anthropic/claude-sonnet-4.6".to_string()); + let messages = vec![Message::user(merge_prompt.clone())]; + let config = LlmConfig::new(model).with_params( + crate::providers::types::GenerationParams::new().with_max_tokens(4096), + ); + let response = llm.complete(&messages, &config).await?; + let merged_content = response.content.trim().to_string(); + + if merged_content.is_empty() { + return Err(AtomicCoreError::Validation( + "LLM returned empty merged content".to_string(), + )); + } + + let before_state = json!([ + { + "id": keep.atom.id, + "content": keep.atom.content, + "source_url": keep.atom.source_url, + "tag_ids": keep.tags.iter().map(|t| t.id.clone()).collect::>() + }, + { + "id": delete.atom.id, + "content": delete.atom.content, + "source_url": delete.atom.source_url, + "tag_ids": delete.tags.iter().map(|t| t.id.clone()).collect::>() + } + ]); + + // Merge tags from deleted atom into keeper + let delete_tag_ids: Vec = delete.tags.iter().map(|t| t.id.clone()).collect(); + if !delete_tag_ids.is_empty() { + let _ = core + .storage() + .link_tags_to_atom_impl(&keep.atom.id, &delete_tag_ids) + .await; + } + + // Update keeper with reconciled content + let upd = crate::UpdateAtomRequest { + content: merged_content.clone(), + source_url: keep.atom.source_url.clone(), + published_at: None, + tag_ids: None, + }; + core.update_atom(&keep.atom.id, upd, |_| {}).await?; + core.delete_atom(&delete.atom.id).await?; + + // Dismiss the pair + let pair_key = crate::health::pair_key(atom_a_id, atom_b_id); + let _ = core + .dismiss_health_item("contradiction_detection", &pair_key, "merged", None) + .await; + + let fix_id = audit::log_fix( + core, + "contradiction_detection", + "merge_with_llm", + "high", + Some(&[keep.atom.id.clone(), delete.atom.id.clone()]), + None, + before_state, + json!({ + "kept_id": keep.atom.id, + "deleted_id": delete.atom.id, + "merged_content_length": merged_content.len() + }), + Some(&merge_prompt), + Some(&merged_content), + ) + .await?; + + tracing::info!( + kept = %keep.atom.id, + deleted = %delete.atom.id, + "contradiction pair merged with LLM" + ); + + Ok(Some(FixAction { + id: fix_id, + check: "contradiction_detection".to_string(), + action: "merge_with_llm".to_string(), + count: 1, + details: vec![ + format!("Kept: {}", keep.atom.id), + format!("Deleted: {}", delete.atom.id), + ], + })) +} + +// ==================== Tag structure proposal ==================== + +/// Raw LLM output shape — parsed before enriching with the DB-generated UUID. +#[derive(serde::Deserialize)] +struct RawTagProposalResponse { + summary: String, + actions: Vec, +} + +/// Ask the LLM to propose merges, renames, reparentings and deletions for the +/// current tag tree. The proposal is persisted in `tag_proposals` and returned. +pub async fn propose_tag_restructure( + core: &AtomicCore, +) -> Result { + // 1. Load flat tag list. + let tags = core.get_all_tags_filtered(0).await?; + + // 2. Build compact JSON capped at 500 tags. + let mut tag_rows: Vec = tags + .iter() + .take(500) + .map(|t| { + json!({ + "id": t.tag.id, + "name": t.tag.name, + "parent_id": t.tag.parent_id, + "atom_count": t.atom_count, + }) + }) + .collect(); + // Sort by atom_count desc so the most relevant tags appear first in the cap. + tag_rows.sort_by(|a, b| { + let ca = a["atom_count"].as_i64().unwrap_or(0); + let cb = b["atom_count"].as_i64().unwrap_or(0); + cb.cmp(&ca) + }); + let tag_tree_json = serde_json::to_string_pretty(&tag_rows) + .unwrap_or_else(|_| "[]".to_string()); + + // 3. Build prompt. + let prompt = format!( + "You are a knowledge-base curator. Analyse the tag tree below and propose a \ +better organisation.\n\n\ +Rules:\n\ +- Propose MERGES for near-duplicate tag names (same concept, different spelling or casing).\n\ +- Propose RENAMES for tags whose names are unclear or inconsistent.\n\ +- Propose REPARENTINGS for orphan tags or tags placed under the wrong parent.\n\ +- Propose DELETIONS only for single-use, irrelevant, or clearly erroneous tags.\n\ +- Limit total actions to 25.\n\ +- Every action must include a human-readable `reason`.\n\n\ +Output STRICT JSON only (no markdown fences, no commentary) matching this schema:\n\ +{{\"summary\": \"\", \"actions\": [, ...]}}\n\n\ +Each action is one of:\n\ + {{\"kind\":\"merge\", \"from_id\":\"\", \"into_id\":\"\", \"from_name\":\"\", \"into_name\":\"\", \"reason\":\"...\"}}\n\ + {{\"kind\":\"rename\", \"tag_id\":\"\", \"old_name\":\"\", \"new_name\":\"\", \"reason\":\"...\"}}\n\ + {{\"kind\":\"reparent\", \"tag_id\":\"\", \"tag_name\":\"\", \"new_parent_id\":null|\"\", \"new_parent_name\":null|\"\", \"reason\":\"...\"}}\n\ + {{\"kind\":\"delete\", \"tag_id\":\"\", \"tag_name\":\"\", \"reason\":\"...\"}}\n\n\ +Current tag tree ({count} tags):\n{tree}", + count = tag_rows.len(), + tree = tag_tree_json, + ); + + // 4. Call LLM. + let settings = core.get_settings_map().await.unwrap_or_default(); + let provider_config = ProviderConfig::from_settings(&settings); + let llm = create_llm_provider(&provider_config).map_err(|e| { + AtomicCoreError::Configuration(format!("LLM provider unavailable for tag proposal: {e}")) + })?; + let model = settings + .get("wiki_model") + .cloned() + .unwrap_or_else(|| "anthropic/claude-sonnet-4.6".to_string()); + let messages = vec![Message::user(prompt.clone())]; + let config = LlmConfig::new(model).with_params( + crate::providers::types::GenerationParams::new().with_max_tokens(4096), + ); + let response = llm.complete(&messages, &config).await?; + + // 5. Parse. + let raw: RawTagProposalResponse = serde_json::from_str(strip_llm_json_fences(&response.content)).map_err(|e| { + AtomicCoreError::Validation(format!( + "LLM returned unparseable proposal: {e}. Raw: {}", + &response.content[..response.content.len().min(200)] + )) + })?; + + let proposal = crate::health::TagProposal { + id: uuid::Uuid::new_v4().to_string(), + summary: raw.summary, + actions: raw.actions, + generated_at: chrono::Utc::now().to_rfc3339(), + }; + + // 6. Persist. + core.storage().save_tag_proposal_sync(proposal.clone()).await?; + + Ok(proposal) +} + +/// Apply accepted actions from a persisted proposal. +pub async fn apply_tag_proposal( + core: &AtomicCore, + proposal_id: &str, + accepted_indices: &[usize], +) -> Result, AtomicCoreError> { + let proposal = core + .storage() + .get_tag_proposal_sync(proposal_id) + .await? + .ok_or_else(|| AtomicCoreError::NotFound(format!("tag proposal {proposal_id} not found")))?; + + let mut fix_actions = Vec::new(); + + for &idx in accepted_indices { + let action = proposal.actions.get(idx).ok_or_else(|| { + AtomicCoreError::Validation(format!("accepted index {idx} out of range")) + })?; + + match action { + crate::health::TagProposalAction::Merge { + from_id, + into_id, + from_name, + into_name, + reason, + } => { + let merge = crate::compaction::TagMerge { + winner_name: into_name.clone(), + loser_name: from_name.clone(), + reason: reason.clone(), + }; + let result = core.apply_tag_merges(&[merge]).await; + let detail = match &result { + Ok(r) => format!("Merged '{}' into '{}': {} atoms retagged", from_name, into_name, r.atoms_retagged), + Err(e) => format!("Merge '{}' into '{}' failed: {e}", from_name, into_name), + }; + let fix_id = audit::log_fix( + core, "tag_health", "tag_proposal_merge", "medium", + None, + Some(&[from_id.clone(), into_id.clone()]), + json!({"from_id": from_id, "from_name": from_name}), + json!({"into_id": into_id, "into_name": into_name, "detail": detail}), + None, None, + ).await?; + fix_actions.push(FixAction { + id: fix_id, + check: "tag_health".to_string(), + action: "tag_proposal_merge".to_string(), + count: 1, + details: vec![detail], + }); + } + crate::health::TagProposalAction::Rename { + tag_id, + old_name, + new_name, + reason, + } => { + // Fetch current parent so we only change the name. + let current = core.storage().get_tag_by_id_sync(tag_id).await?; + let parent_id = current.as_ref().and_then(|(_, p)| p.as_deref().map(|s| s.to_string())); + let result = core.update_tag(tag_id, new_name, parent_id.as_deref()).await; + let detail = match &result { + Ok(_) => format!("Renamed '{}' → '{}'", old_name, new_name), + Err(e) => format!("Rename '{}' failed: {e}", old_name), + }; + let fix_id = audit::log_fix( + core, "tag_health", "tag_proposal_rename", "low", + None, Some(std::slice::from_ref(&tag_id)), + json!({"tag_id": tag_id, "old_name": old_name}), + json!({"new_name": new_name, "reason": reason, "detail": detail}), + None, None, + ).await?; + fix_actions.push(FixAction { + id: fix_id, + check: "tag_health".to_string(), + action: "tag_proposal_rename".to_string(), + count: 1, + details: vec![detail], + }); + } + crate::health::TagProposalAction::Reparent { + tag_id, + tag_name, + new_parent_id, + reason, + .. + } => { + let result = core.update_tag(tag_id, tag_name, new_parent_id.as_deref()).await; + let detail = match &result { + Ok(_) => format!("Reparented '{}' → parent {:?}", tag_name, new_parent_id), + Err(e) => format!("Reparent '{}' failed: {e}", tag_name), + }; + let fix_id = audit::log_fix( + core, "tag_health", "tag_proposal_reparent", "low", + None, Some(std::slice::from_ref(&tag_id)), + json!({"tag_id": tag_id, "tag_name": tag_name}), + json!({"new_parent_id": new_parent_id, "reason": reason, "detail": detail}), + None, None, + ).await?; + fix_actions.push(FixAction { + id: fix_id, + check: "tag_health".to_string(), + action: "tag_proposal_reparent".to_string(), + count: 1, + details: vec![detail], + }); + } + crate::health::TagProposalAction::Delete { tag_id, tag_name, reason } => { + let result = core.delete_tag(tag_id, false).await; + let detail = match &result { + Ok(_) => format!("Deleted tag '{}'", tag_name), + Err(e) => format!("Delete '{}' failed: {e}", tag_name), + }; + let fix_id = audit::log_fix( + core, "tag_health", "tag_proposal_delete", "medium", + None, Some(std::slice::from_ref(&tag_id)), + json!({"tag_id": tag_id, "tag_name": tag_name}), + json!({"reason": reason, "detail": detail}), + None, None, + ).await?; + fix_actions.push(FixAction { + id: fix_id, + check: "tag_health".to_string(), + action: "tag_proposal_delete".to_string(), + count: 1, + details: vec![detail], + }); + } + } + } + + // Mark proposal applied. + core.storage().mark_tag_proposal_applied_sync(proposal_id).await?; + + Ok(fix_actions) +} \ No newline at end of file diff --git a/crates/atomic-core/src/health/mod.rs b/crates/atomic-core/src/health/mod.rs new file mode 100644 index 00000000..be88cf94 --- /dev/null +++ b/crates/atomic-core/src/health/mod.rs @@ -0,0 +1,69 @@ +//! Knowledge-base health monitoring and auto-remediation. +//! +//! This module computes a scored health report across 10+ checks, each +//! targeting a distinct class of data-quality issue. Deterministic fixes +//! (orphan-tag deletion, retry pipelines, graph rebuild) run automatically +//! at "safe" or "low" tier. LLM-powered fixes (merge duplicates, enrich +//! stubs, structure content) are available but always logged and undoable. +//! +//! # Layout +//! | module | role | +//! |----------------------|---------------------------------------------------| +//! | [`compute`] | `compute_health`, `compute_single_check`, dismiss | +//! | [`run_fix`] | auto-fix orchestrator | +//! | [`checks`] | individual sync check implementations | +//! | [`fixes`] | deterministic fix implementations | +//! | [`llm_fixes`] | LLM-powered fixes (merge, enrich, reorg) | +//! | [`custom`] | user-defined custom rules | +//! | [`audit`] | health_fix_log read/write | +//! | [`score`] | weighted aggregation | +//! | [`types`] | public data types | +//! | [`link_resolution`] | wikilink + markdown link parsing | +//! | [`task`], [`gc_task`]| background scheduled jobs | +//! +//! # Flow +//! 1. `compute_health(core)` → runs all checks → returns `HealthReport` +//! 2. `run_fix(core, req)` → applies fixes by tier → returns `FixResponse` +//! 3. `undo_fix(core, fix_id)` → restores pre-fix state from audit log + +pub mod audit; +pub mod checks; +pub mod compute; +pub mod custom; +pub mod fixes; +pub mod gc_task; +pub mod link_resolution; +pub mod llm_fixes; +pub mod run_fix; +pub mod score; +pub mod task; +pub mod types; + +// Re-export the public surface so existing callers +// (`use crate::health::HealthReport`) keep working. +pub use compute::{compute_health, compute_single_check}; +pub use run_fix::run_fix; +pub use score::aggregate_score; +pub use types::{ + AtomPreview, BoilerplateAtomEntry, ContradictionAtom, ContradictionPairEntry, + DuplicatePair, FixAction, FixRequest, FixResponse, FixTier, HealthCheckOverride, + HealthCheckResult, HealthConfig, HealthReport, HealthStatus, HealthThresholds, + RootlessTagEntry, SingleAtomTagEntry, SkippedFix, TagProposal, TagProposalAction, + WikiGap, WikiStaleEntry, +}; + +// Internal cross-module references. +pub(crate) use compute::{apply_dismissals, title_preview}; + +/// Build a stable item key for a pair. Sorts atom IDs lexicographically so +/// key ordering is independent of which atom is A vs B. +pub fn pair_key(a: &str, b: &str) -> String { + if a <= b { + format!("{}__{}", a, b) + } else { + format!("{}__{}", b, a) + } +} + +#[cfg(test)] +mod tests; diff --git a/crates/atomic-core/src/health/run_fix.rs b/crates/atomic-core/src/health/run_fix.rs new file mode 100644 index 00000000..6685457c --- /dev/null +++ b/crates/atomic-core/src/health/run_fix.rs @@ -0,0 +1,176 @@ +//! Auto-fix orchestration. +//! +//! `run_fix` dispatches deterministic fixes by tier (Safe / Low / Medium). +//! High-tier fixes are surfaced as `SkippedFix` entries with a reason, never +//! executed automatically. LLM-powered fixes (merge, summary, reorg) live in +//! `llm_fixes.rs` and are invoked explicitly by the UI. + +use super::score::aggregate_score; +use super::types::{FixAction, FixRequest, FixResponse, FixTier, SkippedFix}; +use super::{checks, fixes}; +use crate::error::AtomicCoreError; +use crate::AtomicCore; + +/// Run auto-fixes up to the requested tier. +pub async fn run_fix( + core: &AtomicCore, + req: &FixRequest, +) -> Result { + let config = core.get_health_config().await.unwrap_or_default(); + let raw = core.storage().health_check_data_sync(config.thresholds.clone()).await?; + let checks = checks::run_all(&raw, &config.thresholds); + let max_tier = req.max_tier(); + let dry_run = req.is_dry_run(); + + let mut actions_taken: Vec = Vec::new(); + let mut skipped: Vec = Vec::new(); + + // Helper: should we run this check's fix? + let should_run = |check_name: &str| -> bool { + if let Some(filter) = &req.checks { + filter.iter().any(|c| c == check_name) + } else { + true + } + }; + + // --- Safe tier --- + + if should_run("embedding_coverage") { + if let Some(check) = checks.get("embedding_coverage") { + if check.auto_fixable && check.status != "ok" { + match fixes::fix_embedding_coverage(core, dry_run).await { + Ok(Some(action)) => actions_taken.push(action), + Ok(None) => {} + Err(e) => { + tracing::warn!(error = %e, "embedding_coverage fix failed"); + } + } + } + } + } + + if should_run("semantic_graph_freshness") { + if let Some(check) = checks.get("semantic_graph_freshness") { + if check.auto_fixable && check.status != "ok" { + match fixes::fix_graph_freshness(core, dry_run).await { + Ok(Some(action)) => actions_taken.push(action), + Ok(None) => {} + Err(e) => { + tracing::warn!(error = %e, "semantic_graph_freshness fix failed"); + } + } + } + } + } + + if should_run("tagging_coverage") { + if let Some(check) = checks.get("tagging_coverage") { + if check.auto_fixable && check.status != "ok" { + let skipped_untagged = raw.skipped_untagged; + match fixes::fix_tagging_coverage(core, skipped_untagged, dry_run).await { + Ok(Some(action)) => actions_taken.push(action), + Ok(None) => {} + Err(e) => { + tracing::warn!(error = %e, "tagging_coverage fix failed"); + } + } + } + } + } + + // --- Low tier --- + + if matches!(max_tier, FixTier::Low | FixTier::Medium | FixTier::High) { + if should_run("orphan_tags") { + if let Some(check) = checks.get("orphan_tags") { + if check.auto_fixable && check.status != "ok" { + match fixes::fix_orphan_tags(core, &raw, dry_run).await { + Ok(Some(action)) => actions_taken.push(action), + Ok(None) => {} + Err(e) => tracing::warn!(error = %e, "orphan_tags fix failed"), + } + } + } + } + + if should_run("tag_health") { + if let Some(check) = checks.get("tag_health") { + if check.auto_fixable && check.status != "ok" { + match fixes::fix_tag_health_single_atom(core, &raw, dry_run).await { + Ok(Some(action)) => actions_taken.push(action), + Ok(None) => {} + Err(e) => tracing::warn!(error = %e, "tag_health single-atom fix failed"), + } + } + } + } + + if should_run("wiki_coverage") { + if let Some(check) = checks.get("wiki_coverage") { + if check.auto_fixable && check.status != "ok" { + match fixes::fix_wiki_coverage(core, &raw, dry_run).await { + Ok(Some(action)) => actions_taken.push(action), + Ok(None) => {} + Err(e) => tracing::warn!(error = %e, "wiki_coverage fix failed"), + } + } + } + } + + if should_run("broken_internal_links") + && matches!(checks.get("broken_internal_links"), Some(c) if c.auto_fixable && c.status != "ok") { + match fixes::fix_broken_internal_links(core, dry_run).await { + Ok(Some(action)) => actions_taken.push(action), + Ok(None) => tracing::debug!("broken_internal_links: no links to fix"), + Err(e) => tracing::warn!(error = %e, "broken_internal_links fix failed"), + } + } + } + + // --- Medium tier --- + + if matches!(max_tier, FixTier::Medium | FixTier::High) + && should_run("source_uniqueness") { + if let Some(check) = checks.get("source_uniqueness") { + if check.auto_fixable && check.status != "ok" { + match fixes::fix_source_uniqueness(core, &raw, dry_run).await { + Ok(Some(action)) => actions_taken.push(action), + Ok(None) => {} + Err(e) => tracing::warn!(error = %e, "source_uniqueness fix failed"), + } + } + } + } + // Mark high-tier issues as skipped with reason + for (check_name, check) in &checks { + if check.requires_review && check.status != "ok" && !should_run(check_name) { + skipped.push(SkippedFix { + check: check_name.clone(), + reason: "requires_review".to_string(), + count: check.data.get("count").and_then(|v| v.as_i64()).unwrap_or(0) + as i32, + }); + } + } + + // Recompute score after fixes (if not dry run) — always weight with + // the caller DB's current HealthConfig so the number matches compute_health. + let new_score = if !dry_run && !actions_taken.is_empty() { + let new_raw = core + .storage() + .health_check_data_sync(config.thresholds.clone()) + .await?; + let new_checks = checks::run_all(&new_raw, &config.thresholds); + aggregate_score(&new_checks, Some(&config)) + } else { + aggregate_score(&checks, Some(&config)) + }; + + Ok(FixResponse { + mode: req.mode.clone(), + actions_taken, + skipped, + new_score, + }) +} diff --git a/crates/atomic-core/src/health/score.rs b/crates/atomic-core/src/health/score.rs new file mode 100644 index 00000000..17dea727 --- /dev/null +++ b/crates/atomic-core/src/health/score.rs @@ -0,0 +1,74 @@ +//! Health score aggregation. +//! +//! The overall health score is a weighted mean of individual check scores. +//! Weights can be overridden per database via `HealthConfig`. Informational +//! checks (e.g. `content_quality`) are excluded from the score by default +//! and can be opted in by setting an explicit weight. + +use super::types::{HealthCheckResult, HealthConfig}; +use std::collections::HashMap; + +/// Default check weights. Must sum to 1.0. +/// +/// Design: defaults include only checks that represent near-universal data +/// integrity problems (coverage of the pipeline, orphaned references, broken +/// links, accidental duplicates). Opinionated "completeness" checks +/// (wiki_coverage, content_quality, contradiction_detection, boilerplate_pollution) +/// are returned with `informational: true` and excluded from the overall score +/// by default. Users can opt-in to weighting them via per-DB HealthConfig. +pub(crate) const CHECK_WEIGHTS: &[(&str, f64)] = &[ + ("embedding_coverage", 0.20), + ("tagging_coverage", 0.20), + ("orphan_tags", 0.15), + ("source_uniqueness", 0.10), + ("semantic_graph_freshness", 0.10), + ("tag_health", 0.10), + ("broken_internal_links", 0.10), + ("content_overlap", 0.05), +]; + +/// Aggregate individual check scores into a single 0-100 overall score. +/// +/// Rules: +/// - Informational checks contribute **only** when the user supplied an +/// explicit `weight` override via `HealthConfig`. +/// - Disabled checks (`enabled: false`) contribute nothing. +/// - Default-weighted checks fall back to `CHECK_WEIGHTS` when the config is +/// empty, matching the no-config behaviour. +pub fn aggregate_score( + checks: &HashMap, + config: Option<&HealthConfig>, +) -> u32 { + let default_weights: HashMap<&str, f64> = CHECK_WEIGHTS.iter().copied().collect(); + let mut total = 0.0_f64; + let mut weight_sum = 0.0_f64; + for (name, check) in checks { + // Respect enabled flag. + let override_entry = config.and_then(|c| c.overrides.get(name)); + if let Some(o) = override_entry { + if !o.enabled { + continue; + } + } + // Effective weight: explicit override wins; else default (0 for informational). + let weight = match override_entry.and_then(|o| o.weight) { + Some(w) => w, + None => { + if check.informational { + 0.0 + } else { + default_weights.get(name.as_str()).copied().unwrap_or(0.0) + } + } + }; + if weight <= 0.0 { + continue; + } + total += (check.score as f64) * weight; + weight_sum += weight; + } + if weight_sum == 0.0 { + return 100; + } + ((total / weight_sum).round() as u32).min(100) +} diff --git a/crates/atomic-core/src/health/task.rs b/crates/atomic-core/src/health/task.rs new file mode 100644 index 00000000..aa6623d7 --- /dev/null +++ b/crates/atomic-core/src/health/task.rs @@ -0,0 +1,107 @@ +//! Nightly health maintenance scheduled task. +//! +//! Runs daily at ~3 AM (configurable). Automatically applies Safe + Low tier +//! fixes and records the health report for trending. If the score drops below +//! 70, the next briefing run will include a health summary. + +use crate::health::{self, FixRequest}; +use crate::scheduler::{state as task_state, ScheduledTask, TaskContext, TaskError, TaskEvent}; +use crate::AtomicCore; +use async_trait::async_trait; +use std::time::Duration; + +pub struct HealthMaintenanceTask; + +const TASK_ID: &str = "health_maintenance"; +const DEFAULT_INTERVAL: Duration = Duration::from_secs(24 * 60 * 60); +const DEFAULT_ENABLED: bool = true; + +#[async_trait] +impl ScheduledTask for HealthMaintenanceTask { + fn id(&self) -> &'static str { + TASK_ID + } + + fn display_name(&self) -> &'static str { + "Knowledge health maintenance" + } + + fn default_interval(&self) -> Duration { + DEFAULT_INTERVAL + } + + async fn run(&self, core: &AtomicCore, ctx: &TaskContext) -> Result<(), TaskError> { + if !task_state::is_enabled(core, TASK_ID, DEFAULT_ENABLED).await { + return Err(TaskError::Disabled); + } + if !task_state::is_due(core, TASK_ID, DEFAULT_INTERVAL, DEFAULT_ENABLED).await { + return Err(TaskError::NotDue); + } + + let db_id = core + .db_path() + .file_stem() + .and_then(|s| s.to_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| "default".to_string()); + + (ctx.event_cb)(TaskEvent::Started { + task_id: TASK_ID.to_string(), + db_id: db_id.clone(), + }); + + // Run health check + let report = match health::compute_health(core).await { + Ok(r) => r, + Err(e) => { + let msg = e.to_string(); + (ctx.event_cb)(TaskEvent::Failed { + task_id: TASK_ID.to_string(), + db_id, + error: msg.clone(), + }); + return Err(TaskError::Other(msg)); + } + }; + + let score_before = report.overall_score; + tracing::info!( + score = score_before, + status = %report.overall_status, + "[health_maintenance] initial score" + ); + + // Auto-fix Safe + Low tier issues + let fix_req = FixRequest { + checks: None, + mode: "auto".to_string(), + include_medium: false, + }; + + match health::run_fix(core, &fix_req).await { + Ok(fix_resp) => { + tracing::info!( + fixes = fix_resp.actions_taken.len(), + new_score = fix_resp.new_score, + "[health_maintenance] fixes applied" + ); + } + Err(e) => { + tracing::warn!(error = %e, "[health_maintenance] fix run failed"); + } + } + + // Persist last_run + task_state::set_last_run(core, TASK_ID, chrono::Utc::now()) + .await + .ok(); + + (ctx.event_cb)(TaskEvent::Completed { + task_id: TASK_ID.to_string(), + db_id, + result_id: None, + }); + + Ok(()) + } +} diff --git a/crates/atomic-core/src/health/tests.rs b/crates/atomic-core/src/health/tests.rs new file mode 100644 index 00000000..e229ce6e --- /dev/null +++ b/crates/atomic-core/src/health/tests.rs @@ -0,0 +1,1415 @@ +//! Unit tests for health check functions. +//! +//! Tests use manually constructed `HealthRawData` fixtures to validate +//! scoring, `requires_review` logic, and JSON data shapes — no database required. + +#[cfg(test)] +mod tests { + use super::super::checks; + use super::super::{ + AtomPreview, BoilerplateAtomEntry, ContradictionAtom, ContradictionPairEntry, + DuplicatePair, RootlessTagEntry, WikiGap, WikiStaleEntry, + }; + use crate::storage::sqlite::health::HealthRawData; + + fn base_raw() -> HealthRawData { + HealthRawData { + total_atoms: 50, + embedding_complete: 50, + tagging_complete: 50, + ..Default::default() + } + } + + // --- embedding_coverage --- + + #[test] + fn test_embedding_coverage_perfect() { + let mut raw = base_raw(); + raw.embedding_complete = 50; + let result = checks::embedding_coverage(&raw); + assert_eq!(result.status, "ok"); + assert_eq!(result.score, 100); + assert!(!result.requires_review); + assert!(!result.auto_fixable); + } + + #[test] + fn test_embedding_coverage_with_failures() { + let mut raw = base_raw(); + raw.embedding_failed = 5; + let result = checks::embedding_coverage(&raw); + assert_ne!(result.status, "ok"); + assert!(result.auto_fixable); + assert!(result.score < 100); + } + + #[test] + fn test_embedding_coverage_all_pending() { + let mut raw = base_raw(); + raw.embedding_pending = 50; + raw.embedding_complete = 0; + let result = checks::embedding_coverage(&raw); + assert!(result.score < 100); + assert!(result.auto_fixable); + } + + // --- tagging_coverage --- + + #[test] + fn test_tagging_coverage_perfect() { + let raw = base_raw(); + let result = checks::tagging_coverage(&raw); + assert_eq!(result.status, "ok"); + assert_eq!(result.score, 100); + assert!(!result.requires_review); + } + + #[test] + fn test_tagging_coverage_untagged_atoms() { + let mut raw = base_raw(); + raw.untagged_complete = 10; + let result = checks::tagging_coverage(&raw); + assert_ne!(result.status, "ok"); + assert!(result.auto_fixable); + } + + // --- content_overlap --- + + #[test] + fn test_content_overlap_no_pairs() { + let raw = base_raw(); + let result = checks::content_overlap(&raw); + assert_eq!(result.status, "ok"); + assert!(!result.requires_review); + } + + #[test] + fn test_content_overlap_with_pairs() { + let mut raw = base_raw(); + raw.duplicate_pairs.push(DuplicatePair { + pair_id: "p1".to_string(), + atom_a_id: "a1".to_string(), + atom_a_title: "Article A".to_string(), + atom_a_source: Some("https://source1.com/a".to_string()), + atom_b_id: "b1".to_string(), + atom_b_title: "Article B".to_string(), + atom_b_source: Some("https://source2.com/b".to_string()), + similarity: 0.72, + shared_tag_count: 3, + atom_a_created_at: None, + atom_b_created_at: None, + }); + let result = checks::content_overlap(&raw); + assert_ne!(result.status, "ok"); + assert!(result.requires_review); + assert!(!result.auto_fixable); + // Verify pairs appear in data + let pairs = result.data["pairs"].as_array().unwrap(); + assert_eq!(pairs.len(), 1); + assert_eq!(pairs[0]["atom_a"]["id"], "a1"); + assert_eq!(pairs[0]["atom_a"]["title"], "Article A"); + } + + #[test] + fn test_content_overlap_created_at_in_json() { + let mut raw = base_raw(); + raw.duplicate_pairs.push(DuplicatePair { + pair_id: "p2".to_string(), + atom_a_id: "a2".to_string(), + atom_a_title: "Article A".to_string(), + atom_a_source: None, + atom_b_id: "b2".to_string(), + atom_b_title: "Article B".to_string(), + atom_b_source: None, + similarity: 0.70, + shared_tag_count: 2, + atom_a_created_at: Some("2026-01-01T00:00:00Z".to_string()), + atom_b_created_at: Some("2026-02-01T00:00:00Z".to_string()), + }); + let result = checks::content_overlap(&raw); + let pairs = result.data["pairs"].as_array().unwrap(); + assert_eq!(pairs[0]["atom_a"]["created_at"], "2026-01-01T00:00:00Z"); + assert_eq!(pairs[0]["atom_b"]["created_at"], "2026-02-01T00:00:00Z"); + } + // --- content_quality --- + + #[test] + fn test_content_quality_perfect() { + let raw = base_raw(); + let result = checks::content_quality(&raw); + assert_eq!(result.status, "ok"); + assert!(!result.requires_review); + } + + #[test] + fn test_content_quality_no_source_atoms() { + let mut raw = base_raw(); + raw.no_source_atoms.push(AtomPreview { + id: "atom-1".to_string(), + title: "My Note".to_string(), + created_at: "2026-01-01T00:00:00Z".to_string(), + }); + raw.no_source_atoms.push(AtomPreview { + id: "atom-2".to_string(), + title: "Another Note".to_string(), + created_at: "2026-01-02T00:00:00Z".to_string(), + }); + let result = checks::content_quality(&raw); + assert!(result.requires_review); + // Check data shape + let atoms = &result.data["issues"]["no_source"]["atoms"]; + assert_eq!(atoms.as_array().unwrap().len(), 2); + assert_eq!(atoms[0]["id"], "atom-1"); + assert_eq!(atoms[0]["title"], "My Note"); + assert_eq!(atoms[0]["created_at"], "2026-01-01T00:00:00Z"); + // auto_fixable should be false for no_source + assert_eq!(result.data["issues"]["no_source"]["auto_fixable"], false); + } + + #[test] + fn test_content_quality_short_atoms() { + let mut raw = base_raw(); + raw.very_short_atoms.push("short-1".to_string()); + let result = checks::content_quality(&raw); + assert!(result.auto_fixable); + assert_eq!(result.data["issues"]["very_short"]["count"], 1); + } + + // --- boilerplate_pollution --- + + #[test] + fn test_boilerplate_no_pollution() { + let raw = base_raw(); + let result = checks::boilerplate_pollution(&raw, &crate::health::HealthThresholds::default()); + assert_eq!(result.status, "ok"); + assert!(!result.requires_review); + assert_eq!(result.data["count"], 0); + assert_eq!(result.score, 100, "clean state must show 100"); + } + + #[test] + fn test_boilerplate_with_affected_atoms() { + let mut raw = base_raw(); + raw.boilerplate_affected_atoms.push(BoilerplateAtomEntry { + id: "atom-bp-1".to_string(), + title: "Boilerplate Article".to_string(), + clone_count: 5, + }); + raw.boilerplate_affected_atoms.push(BoilerplateAtomEntry { + id: "atom-bp-2".to_string(), + title: "Template Note".to_string(), + clone_count: 3, + }); + let result = checks::boilerplate_pollution(&raw, &crate::health::HealthThresholds::default()); + assert_ne!(result.status, "ok"); + assert!(result.requires_review); + assert_eq!(result.data["count"], 2); + assert!( + result.score < 100, + "score must reflect issues exist (got {})", + result.score + ); + assert!( + result.score >= 50, + "score must stay above floor (got {})", + result.score + ); + let atoms = result.data["affected_atoms"].as_array().unwrap(); + assert_eq!(atoms.len(), 2); + assert_eq!(atoms[0]["id"], "atom-bp-1"); + assert_eq!(atoms[0]["title"], "Boilerplate Article"); + assert_eq!(atoms[0]["clone_count"], 5); + } + + // --- contradiction_detection --- + + #[test] + fn test_contradiction_no_pairs() { + let raw = base_raw(); + let result = checks::contradiction_detection(&raw); + assert_eq!(result.status, "ok"); + assert!(!result.requires_review); + assert_eq!(result.data["potential_contradictions"], 0); + assert!(result.data["pairs"].as_array().unwrap().is_empty()); + } + + #[test] + fn test_contradiction_with_pairs() { + let mut raw = base_raw(); + raw.contradiction_pairs.push(ContradictionPairEntry { + pair_id: "cp1".to_string(), + atom_a: ContradictionAtom { + id: "ca1".to_string(), + title: "Article on Topic X - Version 1".to_string(), + source: Some("https://site1.com/x".to_string()), + created_at: None, + }, + atom_b: ContradictionAtom { + id: "cb1".to_string(), + title: "Article on Topic X - Version 2".to_string(), + source: Some("https://site2.com/x".to_string()), + created_at: None, + }, + similarity: 0.85, + shared_tag_count: 2, + }); + raw.contradiction_candidate_count = 1; + let result = checks::contradiction_detection(&raw); + assert_ne!(result.status, "ok"); + assert!(result.requires_review); + let pairs = result.data["pairs"].as_array().unwrap(); + assert_eq!(pairs.len(), 1); + assert_eq!(pairs[0]["pair_id"], "cp1"); + assert_eq!(pairs[0]["atom_a"]["title"], "Article on Topic X - Version 1"); + // f32 serializes with limited precision; compare as f64 with tolerance + let sim = pairs[0]["similarity"].as_f64().unwrap(); + assert!((sim - 0.85).abs() < 0.001, "expected ~0.85, got {sim}"); + } + + + #[test] + fn test_contradiction_created_at_in_json() { + let mut raw = base_raw(); + raw.contradiction_pairs.push(ContradictionPairEntry { + pair_id: "cp2".to_string(), + atom_a: ContradictionAtom { + id: "ca2".to_string(), + title: "Topic A".to_string(), + source: None, + created_at: Some("2026-01-15T00:00:00Z".to_string()), + }, + atom_b: ContradictionAtom { + id: "cb2".to_string(), + title: "Topic B".to_string(), + source: None, + created_at: Some("2026-03-15T00:00:00Z".to_string()), + }, + similarity: 0.88, + shared_tag_count: 1, + }); + let result = checks::contradiction_detection(&raw); + let pairs = result.data["pairs"].as_array().unwrap(); + assert_eq!(pairs[0]["atom_a"]["created_at"], "2026-01-15T00:00:00Z"); + assert_eq!(pairs[0]["atom_b"]["created_at"], "2026-03-15T00:00:00Z"); + } + // --- tag_health --- + + #[test] + fn test_tag_health_perfect() { + let raw = base_raw(); + let result = checks::tag_health(&raw, &crate::health::HealthThresholds::default()); + assert_eq!(result.status, "ok"); + assert!(!result.requires_review); + let rootless_list = result.data["rootless_tag_list"].as_array().unwrap(); + assert!(rootless_list.is_empty()); + } + + #[test] + fn test_tag_health_rootless_tags() { + let mut raw = base_raw(); + raw.rootless_tag_list.push(RootlessTagEntry { + id: "tag-1".to_string(), + name: "Orphaned Category".to_string(), + atom_count: 7, + }); + raw.rootless_tag_list.push(RootlessTagEntry { + id: "tag-2".to_string(), + name: "Floating Topic".to_string(), + atom_count: 3, + }); + raw.rootless_tags = 2; + let result = checks::tag_health(&raw, &crate::health::HealthThresholds::default()); + assert!(result.requires_review); + let list = result.data["rootless_tag_list"].as_array().unwrap(); + assert_eq!(list.len(), 2); + assert_eq!(list[0]["id"], "tag-1"); + assert_eq!(list[0]["name"], "Orphaned Category"); + assert_eq!(list[0]["atom_count"], 7); + } + + + #[test] + fn test_tag_health_similar_name_pairs_list() { + let mut raw = base_raw(); + raw.similar_name_pairs_list = vec![ + ("id-a".to_string(), "Machine Learning".to_string(), "id-b".to_string(), "Learning".to_string()), + ]; + raw.similar_name_pair_count = 1; + let result = checks::tag_health(&raw, &crate::health::HealthThresholds::default()); + assert_eq!(result.status, "warning"); + let pair_list = result.data["similar_name_pair_list"].as_array().unwrap(); + assert_eq!(pair_list.len(), 1); + assert_eq!(pair_list[0]["a_name"], "Machine Learning"); + assert_eq!(pair_list[0]["b_name"], "Learning"); + assert_eq!(pair_list[0]["pair_id"], "id-a__id-b"); + } + // --- aggregate_score --- + + #[test] + fn test_aggregate_score_all_perfect() { + use std::collections::HashMap; + use crate::health::HealthCheckResult; + let mut checks_map = HashMap::new(); + for name in &["content_overlap", "embedding_coverage", "tagging_coverage", + "source_uniqueness", "wiki_coverage", "semantic_graph_freshness", + "content_quality", "orphan_tags", "tag_health", "broken_internal_links"] { + checks_map.insert(name.to_string(), HealthCheckResult { + status: "ok".to_string(), + score: 100, + auto_fixable: false, + requires_review: false, + informational: false, + fix_action: None, + data: serde_json::Value::Null, + }); + } + let score = crate::health::aggregate_score(&checks_map, None); + assert_eq!(score, 100); + } + + #[test] + fn test_aggregate_score_mixed() { + use std::collections::HashMap; + use crate::health::HealthCheckResult; + let mut checks_map = HashMap::new(); + // tagging_coverage at 0 (weight 0.20) → expected ~80 + for name in &["content_overlap", "embedding_coverage", "source_uniqueness", + "wiki_coverage", "semantic_graph_freshness", + "content_quality", "orphan_tags", "tag_health", "broken_internal_links"] { + checks_map.insert(name.to_string(), HealthCheckResult { + status: "ok".to_string(), + score: 100, + auto_fixable: false, + requires_review: false, + informational: false, + fix_action: None, + data: serde_json::Value::Null, + }); + } + checks_map.insert("tagging_coverage".to_string(), HealthCheckResult { + status: "error".to_string(), + score: 0, + auto_fixable: true, + requires_review: false, + informational: false, + fix_action: Some("retry_tagging_pipeline".to_string()), + data: serde_json::Value::Null, + }); + let score = crate::health::aggregate_score(&checks_map, None); + // tagging = 0.0 * 0.20 + others = 1.0 * 0.80 → 80 + assert_eq!(score, 80); + } + + #[test] + fn test_aggregate_score_excludes_informational_by_default() { + use std::collections::HashMap; + use crate::health::HealthCheckResult; + let mut checks_map = HashMap::new(); + // All default-weighted checks at 100. + for name in &["content_overlap", "embedding_coverage", "tagging_coverage", + "source_uniqueness", "semantic_graph_freshness", + "orphan_tags", "tag_health", "broken_internal_links"] { + checks_map.insert(name.to_string(), HealthCheckResult { + status: "ok".to_string(), + score: 100, + auto_fixable: false, + requires_review: false, + informational: false, + fix_action: None, + data: serde_json::Value::Null, + }); + } + // An informational check scoring 0 must NOT drag the overall score down + // when the user has not assigned it a weight. + checks_map.insert("wiki_coverage".to_string(), HealthCheckResult { + status: "warning".to_string(), + score: 0, + auto_fixable: false, + requires_review: false, + informational: true, + fix_action: None, + data: serde_json::Value::Null, + }); + let score = crate::health::aggregate_score(&checks_map, None); + assert_eq!(score, 100, "informational check at 0 should not affect default score"); + } + + #[test] + fn test_aggregate_score_config_lifts_informational_into_scoring() { + use std::collections::HashMap; + use crate::health::{HealthCheckResult, HealthConfig, HealthCheckOverride}; + let mut checks_map = HashMap::new(); + checks_map.insert("embedding_coverage".to_string(), HealthCheckResult { + status: "ok".to_string(), + score: 100, + auto_fixable: false, + requires_review: false, + informational: false, + fix_action: None, + data: serde_json::Value::Null, + }); + checks_map.insert("wiki_coverage".to_string(), HealthCheckResult { + status: "warning".to_string(), + score: 0, + auto_fixable: false, + requires_review: false, + informational: true, + fix_action: None, + data: serde_json::Value::Null, + }); + // User explicitly weights wiki_coverage at 0.20 (same as embedding_coverage default). + let mut overrides = HashMap::new(); + overrides.insert("wiki_coverage".to_string(), HealthCheckOverride { enabled: true, weight: Some(0.20) }); + let config = HealthConfig { overrides, thresholds: Default::default() }; + let score = crate::health::aggregate_score(&checks_map, Some(&config)); + // embedding_coverage (0.20 default) * 100 + wiki_coverage (0.20 override) * 0 → 50 + assert_eq!(score, 50); + } + + #[test] + fn test_aggregate_score_config_disabled_check_is_skipped() { + use std::collections::HashMap; + use crate::health::{HealthCheckResult, HealthConfig, HealthCheckOverride}; + let mut checks_map = HashMap::new(); + checks_map.insert("embedding_coverage".to_string(), HealthCheckResult { + status: "error".to_string(), + score: 0, + auto_fixable: false, + requires_review: false, + informational: false, + fix_action: None, + data: serde_json::Value::Null, + }); + checks_map.insert("tagging_coverage".to_string(), HealthCheckResult { + status: "ok".to_string(), + score: 100, + auto_fixable: false, + requires_review: false, + informational: false, + fix_action: None, + data: serde_json::Value::Null, + }); + let mut overrides = HashMap::new(); + // Disable embedding_coverage; its 0 score must not drag the overall down. + overrides.insert("embedding_coverage".to_string(), HealthCheckOverride { enabled: false, weight: None }); + let config = HealthConfig { overrides, thresholds: Default::default() }; + let score = crate::health::aggregate_score(&checks_map, Some(&config)); + assert_eq!(score, 100); + } + + // --- boilerplate_indices integration --- + + #[test] + fn test_boilerplate_filtering_preserves_unique_chunks() { + use crate::boilerplate::{boilerplate_indices, content_hash}; + use std::collections::HashMap; + let chunks = vec![ + "# Privacy Policy\n\nAll rights reserved.".to_string(), + "This atom is about machine learning and neural networks.".to_string(), + "# Privacy Policy\n\nAll rights reserved.".to_string(), + ]; + let mut counts = HashMap::new(); + let bp_hash = content_hash("# Privacy Policy\n\nAll rights reserved."); + counts.insert(bp_hash, 20i64); + let indices = boilerplate_indices(&chunks, &counts, 5); + assert!(indices.contains(&0)); + assert!(!indices.contains(&1)); + assert!(indices.contains(&2)); + } + + // --- pair_key and apply_dismissals --- + + #[test] + fn test_pair_key_sorted() { + use crate::health::pair_key; + assert_eq!(pair_key("a", "b"), "a__b"); + assert_eq!(pair_key("b", "a"), "a__b"); + assert_eq!(pair_key("z1", "z2"), "z1__z2"); + } + + + #[test] + fn test_apply_dismissals_recomputes_contradiction_score() { + // Regression: the health dashboard rendered "Contradictions 4 → red" + // next to "0 atom pairs" because dismissals updated the pair list + // and the potential_contradictions count but left `score` frozen at + // the pre-dismissal baseline. The UI row reads from both fields; a + // score that doesn't track the count is a self-contradicting row. + use crate::health::{apply_dismissals, pair_key, HealthCheckResult}; + use std::collections::HashSet; + let mut result = HealthCheckResult { + status: "warning".into(), + score: 4, // matches checks::contradiction_detection for 12 pairs. + auto_fixable: false, + requires_review: true, + informational: true, + fix_action: None, + data: serde_json::json!({ + "potential_contradictions": 12, + "pairs": [ + {"pair_id": "p1", "atom_a": {"id": "a1"}, "atom_b": {"id": "b1"}}, + {"pair_id": "p2", "atom_a": {"id": "a2"}, "atom_b": {"id": "b2"}}, + ] + }), + }; + let mut dismissed = HashSet::new(); + dismissed.insert(pair_key("a1", "b1")); + dismissed.insert(pair_key("a2", "b2")); + apply_dismissals("contradiction_detection", &mut result, &dismissed); + assert_eq!( + result.data["potential_contradictions"] + .as_u64() + .unwrap(), + 0 + ); + // With zero pairs the check score must be the healthy ceiling, + // not a stale 4. Otherwise the UI shows a red row next to "0 pairs". + assert_eq!(result.score, 100); + assert_eq!(result.status, "ok"); + assert!(!result.requires_review); + } + + fn test_apply_dismissals_filters_content_overlap_pairs() { + use crate::health::{apply_dismissals, pair_key, HealthCheckResult}; + use std::collections::HashSet; + let mut result = HealthCheckResult { + status: "warning".into(), + score: 60, + auto_fixable: false, + requires_review: true, + informational: false, + fix_action: None, + data: serde_json::json!({ + "count": 2, + "cross_source_overlaps": 2, + "pairs": [ + {"atom_a": {"id": "a1"}, "atom_b": {"id": "b1"}}, + {"atom_a": {"id": "a2"}, "atom_b": {"id": "b2"}}, + ] + }), + }; + let mut dismissed = HashSet::new(); + dismissed.insert(pair_key("a1", "b1")); + apply_dismissals("content_overlap", &mut result, &dismissed); + let pairs = result.data["pairs"].as_array().unwrap(); + assert_eq!(pairs.len(), 1); + assert_eq!(pairs[0]["atom_a"]["id"], "a2"); + assert_eq!(result.data["count"], 1); + } + + #[test] + fn test_apply_dismissals_filters_no_source() { + use crate::health::{apply_dismissals, HealthCheckResult}; + use std::collections::HashSet; + let mut result = HealthCheckResult { + status: "warning".into(), + score: 70, + auto_fixable: false, + requires_review: true, + informational: false, + fix_action: None, + data: serde_json::json!({ + "issues": { + "no_source": { + "count": 2, + "atoms": [ + {"id": "a1", "title": "A"}, + {"id": "a2", "title": "B"} + ] + } + } + }), + }; + let mut dismissed = HashSet::new(); + dismissed.insert("a1".to_string()); + apply_dismissals("content_quality", &mut result, &dismissed); + let atoms = result.data["issues"]["no_source"]["atoms"].as_array().unwrap(); + assert_eq!(atoms.len(), 1); + assert_eq!(atoms[0]["id"], "a2"); + assert_eq!(result.data["issues"]["no_source"]["count"], 1); + } + + #[test] + fn test_apply_dismissals_filters_rootless_tags() { + use crate::health::{apply_dismissals, HealthCheckResult}; + use std::collections::HashSet; + let mut result = HealthCheckResult { + status: "warning".into(), + score: 80, + auto_fixable: false, + requires_review: true, + informational: false, + fix_action: None, + data: serde_json::json!({ + "rootless_tags": 2, + "rootless_tag_list": [ + {"id": "t1", "name": "Foo", "atom_count": 3}, + {"id": "t2", "name": "Bar", "atom_count": 1} + ] + }), + }; + let mut dismissed = HashSet::new(); + dismissed.insert("t1".to_string()); + apply_dismissals("tag_health", &mut result, &dismissed); + let tags = result.data["rootless_tag_list"].as_array().unwrap(); + assert_eq!(tags.len(), 1); + assert_eq!(tags[0]["id"], "t2"); + assert_eq!(result.data["rootless_tags"], 1); + } + + #[test] + fn test_apply_dismissals_empty_set_noop() { + use crate::health::{apply_dismissals, HealthCheckResult}; + use std::collections::HashSet; + let mut result = HealthCheckResult { + status: "warning".into(), + score: 60, + auto_fixable: false, + requires_review: true, + informational: false, + fix_action: None, + data: serde_json::json!({"count": 1, "pairs": [{"atom_a": {"id": "a"}, "atom_b": {"id": "b"}}]}), + }; + apply_dismissals("content_overlap", &mut result, &HashSet::new()); + assert_eq!(result.data["pairs"].as_array().unwrap().len(), 1); + } + + #[test] + fn test_apply_dismissals_clears_requires_review_when_empty() { + use crate::health::{apply_dismissals, HealthCheckResult}; + use std::collections::HashSet; + let mut result = HealthCheckResult { + status: "warning".into(), + score: 60, + auto_fixable: false, + requires_review: true, + informational: false, + fix_action: None, + data: serde_json::json!({ + "count": 1, + "affected_atoms": [{"id": "a1", "title": "x", "clone_count": 3}] + }), + }; + let mut d = HashSet::new(); + d.insert("a1".to_string()); + apply_dismissals("boilerplate_pollution", &mut result, &d); + assert!(!result.requires_review); + assert_eq!(result.data["count"], 0); + } + + // --- tag_health: single_atom_tag_list --- + + #[test] + fn test_tag_health_single_atom_tag_list() { + use crate::health::SingleAtomTagEntry; + let mut raw = base_raw(); + // Tag A: 1 atom, autotag=true + raw.single_atom_tag_list.push(SingleAtomTagEntry { + id: "tag-a".to_string(), + name: "AutoTag".to_string(), + is_autotag: true, + }); + // Tag B: 1 atom, autotag=false (user-created) + raw.single_atom_tag_list.push(SingleAtomTagEntry { + id: "tag-b".to_string(), + name: "UserTag".to_string(), + is_autotag: false, + }); + raw.single_atom_tags = 2; + + let result = checks::tag_health(&raw, &crate::health::HealthThresholds::default()); + + // Expect the list in JSON data + let list = result.data["single_atom_tag_list"].as_array().unwrap(); + assert_eq!(list.len(), 2); + assert_eq!(list[0]["id"], "tag-a"); + assert_eq!(list[0]["is_autotag"], true); + assert_eq!(list[1]["id"], "tag-b"); + assert_eq!(list[1]["is_autotag"], false); + } + + #[test] + fn test_tag_health_auto_fixable_requires_autotag_threshold() { + use crate::health::SingleAtomTagEntry; + let mut raw = base_raw(); + // Only 2 autotag single-atom tags — below threshold of 3 + for i in 0..2 { + raw.single_atom_tag_list.push(SingleAtomTagEntry { + id: format!("tag-{}", i), + name: format!("Tag{}", i), + is_autotag: true, + }); + } + raw.single_atom_tags = 2; + let result = checks::tag_health(&raw, &crate::health::HealthThresholds::default()); + // auto_fixable = false because count <= 3 + assert!(!result.auto_fixable); + + // Now add enough to exceed threshold + let mut raw2 = base_raw(); + for i in 0..4 { + raw2.single_atom_tag_list.push(SingleAtomTagEntry { + id: format!("tag-{}", i), + name: format!("Tag{}", i), + is_autotag: true, + }); + } + raw2.single_atom_tags = 4; + let result2 = checks::tag_health(&raw2, &crate::health::HealthThresholds::default()); + assert!(result2.auto_fixable); + } + + #[test] + fn test_apply_dismissals_filters_single_atom_tag_list() { + use crate::health::{apply_dismissals, HealthCheckResult, SingleAtomTagEntry}; + use std::collections::HashSet; + + let mut raw = base_raw(); + raw.single_atom_tag_list.push(SingleAtomTagEntry { id: "tag-x".to_string(), name: "X".to_string(), is_autotag: false }); + raw.single_atom_tag_list.push(SingleAtomTagEntry { id: "tag-y".to_string(), name: "Y".to_string(), is_autotag: true }); + raw.single_atom_tags = 2; + let mut result = checks::tag_health(&raw, &crate::health::HealthThresholds::default()); + + let mut dismissed = HashSet::new(); + dismissed.insert("tag-x".to_string()); + apply_dismissals("tag_health", &mut result, &dismissed); + + let list = result.data["single_atom_tag_list"].as_array().unwrap(); + assert_eq!(list.len(), 1); + assert_eq!(list[0]["id"], "tag-y"); + // Count updated + assert_eq!(result.data["single_atom_tags"], 1); + } + // --- requires_review covers similar too --- + + #[test] + fn test_tag_health_requires_review_when_similar() { + let mut raw = base_raw(); + raw.similar_name_pairs_list = vec![( + "id-a".to_string(), "AI".to_string(), + "id-b".to_string(), "Artificial Intelligence".to_string(), + )]; + raw.similar_name_pair_count = 1; + let result = checks::tag_health(&raw, &crate::health::HealthThresholds::default()); + assert!(result.requires_review); + } +} + +#[cfg(test)] +mod integration_tests { + use tempfile::TempDir; + use crate::AtomicCore; + use crate::health::{compute_health, fixes}; + + fn open_core() -> (AtomicCore, TempDir) { + let dir = TempDir::new().expect("tempdir"); + let core = AtomicCore::open_or_create(dir.path().join("health_test.db")).unwrap(); + (core, dir) + } + + #[tokio::test] + async fn test_broken_link_check_detects_unresolved_markdown_link() { + let (core, _dir) = open_core(); + + // Atom A — exists with a known source URL + core.create_atom(crate::CreateAtomRequest { + content: "Alpha content".to_string(), + source_url: Some("vault://notes/alpha.md".to_string()), + published_at: None, + tag_ids: vec![], + skip_if_source_exists: false, + }, |_| {}).await.expect("create atom A"); + + // Atom B — has a broken link to ./bravo.md which doesn't exist + let atom_b = core.create_atom(crate::CreateAtomRequest { + content: "see [bravo](./bravo.md) for more".to_string(), + source_url: Some("vault://notes/beta.md".to_string()), + published_at: None, + tag_ids: vec![], + skip_if_source_exists: false, + }, |_| {}).await.expect("create atom B").expect("atom B created"); + + let report = compute_health(&core).await.expect("compute_health"); + let link_check = report.checks.get("broken_internal_links").expect("check present"); + + assert_eq!(link_check.status, "warning", "should be warning"); + let list = link_check.data["broken_link_list"].as_array().expect("broken_link_list array"); + assert_eq!(list.len(), 1, "one atom with broken link"); + assert_eq!(list[0]["atom_id"].as_str().unwrap(), atom_b.atom.id); + let links = list[0]["links"].as_array().expect("links array"); + assert_eq!(links.len(), 1); + assert_eq!(links[0]["raw"].as_str().unwrap(), "[bravo](./bravo.md)"); + assert_eq!(links[0]["kind"].as_str().unwrap(), "markdown"); + } + + #[tokio::test] + async fn test_remove_broken_link_strips_markdown_link() { + let (core, _dir) = open_core(); + + let atom = core.create_atom(crate::CreateAtomRequest { + content: "see [bravo](./bravo.md) for details".to_string(), + source_url: Some("vault://notes/beta.md".to_string()), + published_at: None, + tag_ids: vec![], + skip_if_source_exists: false, + }, |_| {}).await.expect("create atom").expect("atom created"); + + fixes::remove_broken_link(&core, &atom.atom.id, "[bravo](./bravo.md)") + .await + .expect("remove_broken_link"); + + let updated = core.get_atom(&atom.atom.id).await.expect("get_atom").expect("atom exists"); + assert_eq!(updated.atom.content, "see bravo for details"); + } + + #[tokio::test] + async fn test_dismiss_broken_link_filters_from_check() { + let (core, _dir) = open_core(); + + let atom_b = core.create_atom(crate::CreateAtomRequest { + content: "see [bravo](./bravo.md)".to_string(), + source_url: Some("vault://notes/beta.md".to_string()), + published_at: None, + tag_ids: vec![], + skip_if_source_exists: false, + }, |_| {}).await.expect("create atom").expect("atom created"); + + // Verify it appears as broken first + let report = compute_health(&core).await.expect("compute_health"); + let check = report.checks.get("broken_internal_links").expect("check"); + assert_eq!(check.status, "warning"); + + // Dismiss the atom + core.dismiss_health_item("broken_internal_links", &atom_b.atom.id, "ignored_broken_links", None) + .await + .expect("dismiss"); + + // Re-run — broken_link_list for B should be filtered out + let report2 = compute_health(&core).await.expect("compute_health 2"); + let check2 = report2.checks.get("broken_internal_links").expect("check2"); + let list2 = check2.data["broken_link_list"].as_array().expect("list"); + assert!( + list2.iter().all(|e| e["atom_id"].as_str().unwrap() != atom_b.atom.id), + "dismissed atom should be filtered out" + ); + assert_eq!(check2.data["broken_count"].as_i64().unwrap(), 0); + } + + #[tokio::test] + async fn test_suggest_atoms_by_query_source_url_exact() { + let (core, _dir) = open_core(); + + // Atom A with known source_url + let atom_a = core.create_atom(crate::CreateAtomRequest { + content: "# Bravo Notes\n\nContent here".to_string(), + source_url: Some("vault://notes/bravo.md".to_string()), + published_at: None, + tag_ids: vec![], + skip_if_source_exists: false, + }, |_| {}).await.expect("create A").expect("A created"); + + // Atom B — no source_url + core.create_atom(crate::CreateAtomRequest { + content: "# Other Atom".to_string(), + source_url: None, + published_at: None, + tag_ids: vec![], + skip_if_source_exists: false, + }, |_| {}).await.expect("create B"); + + let results = core + .suggest_atoms_for_broken_link("bravo.md", 5) + .await + .expect("suggest"); + + assert!(!results.is_empty(), "should return at least one result"); + let top = &results[0]; + assert_eq!(top.0, atom_a.atom.id, "top hit should be atom A"); + assert!((top.3 - 1.0f32).abs() < 0.01, "score should be 1.0 for exact suffix match"); + } + + #[tokio::test] + async fn test_relink_broken_link_rewrites_markdown() { + let (core, _dir) = open_core(); + + // Atom A — the target + let atom_a = core.create_atom(crate::CreateAtomRequest { + content: "# Bravo Notes".to_string(), + source_url: Some("vault://notes/bravo.md".to_string()), + published_at: None, + tag_ids: vec![], + skip_if_source_exists: false, + }, |_| {}).await.expect("create A").expect("A created"); + + // Atom C — has the broken link + let atom_c = core.create_atom(crate::CreateAtomRequest { + content: "see [bravo](./bravo.md) for details".to_string(), + source_url: Some("vault://notes/c.md".to_string()), + published_at: None, + tag_ids: vec![], + skip_if_source_exists: false, + }, |_| {}).await.expect("create C").expect("C created"); + + fixes::relink_broken_link(&core, &atom_c.atom.id, "[bravo](./bravo.md)", &atom_a.atom.id) + .await + .expect("relink_broken_link"); + + let updated = core.get_atom(&atom_c.atom.id).await.expect("get C").expect("C exists"); + let expected = format!("see [bravo](atom://{}) for details", atom_a.atom.id); + assert_eq!(updated.atom.content, expected, "link should be rewritten to atom://"); + } +} + +#[cfg(test)] +mod llm_tests { + //! Unit tests for `verify_overlap_pair`, `verify_contradiction_pair`, and + //! `merge_contradicting_pair`. Each test spins up a `wiremock::MockServer` + //! acting as an OpenAI-compatible endpoint, configures the core settings to + //! use it, then asserts the expected behaviour without a real LLM. + + use tempfile::TempDir; + use wiremock::matchers::{method, path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + use crate::AtomicCore; + use crate::health::llm_fixes; + + async fn open_core_with_llm(mock_url: &str) -> (AtomicCore, TempDir) { + let dir = TempDir::new().expect("tempdir"); + let core = AtomicCore::open_or_create(dir.path().join("llm_test.db")).unwrap(); + // Point the core's LLM provider at the mock server via openai_compat. + for (k, v) in [ + ("provider", "openai_compat"), + ("openai_compat_base_url", mock_url), + ("openai_compat_llm_model", "test-model"), + ("wiki_model", "test-model"), + ] { + core.storage() + .set_setting_sync(k, v) + .await.expect("set setting"); + } + (core, dir) + } + + fn chat_completion_body(content: &str) -> serde_json::Value { + serde_json::json!({ + "id": "chatcmpl-test", + "object": "chat.completion", + "created": 1699000000u64, + "model": "test-model", + "choices": [{ + "index": 0, + "message": {"role": "assistant", "content": content}, + "finish_reason": "stop" + }], + "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30} + }) + } + + #[tokio::test] + async fn test_verify_overlap_pair_false_positive_is_dismissed() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .respond_with( + ResponseTemplate::new(200).set_body_json(chat_completion_body( + r#"{"duplicate": false, "reason": "different topics"} "#, + )), + ) + .mount(&server) + .await; + + let (core, _dir) = open_core_with_llm(&server.uri()).await; + let atom_a = core.create_atom(crate::CreateAtomRequest { + content: "Rust ownership rules".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + let atom_b = core.create_atom(crate::CreateAtomRequest { + content: "Python GIL internals".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + + let (is_dup, reason) = + llm_fixes::verify_overlap_pair(&core, &atom_a.atom.id, &atom_b.atom.id) + .await + .expect("verify_overlap_pair"); + + assert!(!is_dup, "should report not duplicate"); + assert!(!reason.is_empty()); + } + + #[tokio::test] + async fn test_verify_contradiction_pair_false_positive_is_dismissed() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .respond_with( + ResponseTemplate::new(200).set_body_json(chat_completion_body( + r#"{"contradiction": false, "reason": "no conflict found"} "#, + )), + ) + .mount(&server) + .await; + + let (core, _dir) = open_core_with_llm(&server.uri()).await; + let atom_a = core.create_atom(crate::CreateAtomRequest { + content: "The sky is blue".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + let atom_b = core.create_atom(crate::CreateAtomRequest { + content: "Water is H2O".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + + let (is_real, reason) = + llm_fixes::verify_contradiction_pair(&core, &atom_a.atom.id, &atom_b.atom.id) + .await + .expect("verify_contradiction_pair"); + + assert!(!is_real, "should report no real contradiction"); + assert!(!reason.is_empty()); + } + + #[tokio::test] + async fn test_merge_contradicting_pair_dry_run_no_llm() { + // dry_run returns immediately without calling LLM + let dir = TempDir::new().expect("tempdir"); + let core = AtomicCore::open_or_create(dir.path().join("merge_test.db")).unwrap(); + let atom_a = core.create_atom(crate::CreateAtomRequest { + content: "Speed of light is 300,000 km/s".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + let atom_b = core.create_atom(crate::CreateAtomRequest { + content: "Speed of light is 299,792 km/s".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + + let action = llm_fixes::merge_contradicting_pair( + &core, &atom_a.atom.id, &atom_b.atom.id, true, + ) + .await + .expect("merge_contradicting_pair dry_run"); + + let fa = action.expect("dry_run returns Some(FixAction)"); + assert_eq!(fa.id, "dry_run"); + assert_eq!(fa.check, "contradiction_detection"); + assert_eq!(fa.action, "merge_with_llm"); + } + + #[tokio::test] + async fn test_propose_tag_restructure_parses_and_persists() { + let proposal_json = r#"{ + "summary": "Merge near-duplicate technology tags.", + "actions": [ + {"kind": "merge", "from_id": "t1", "into_id": "t2", "from_name": "rust-lang", "into_name": "rust", "reason": "same concept"}, + {"kind": "rename", "tag_id": "t3", "old_name": "ML", "new_name": "machine-learning", "reason": "spell out abbreviation"} + ] +}"#; + + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .respond_with( + ResponseTemplate::new(200).set_body_json(chat_completion_body(proposal_json)), + ) + .mount(&server) + .await; + + let (core, _dir) = open_core_with_llm(&server.uri()).await; + + let proposal = llm_fixes::propose_tag_restructure(&core) + .await + .expect("propose_tag_restructure"); + + assert_eq!(proposal.summary, "Merge near-duplicate technology tags."); + assert_eq!(proposal.actions.len(), 2); + + // Verify it was persisted. + let latest = core + .get_latest_tag_proposal() + .await + .expect("get_latest_tag_proposal") + .expect("should have a pending proposal"); + assert_eq!(latest.id, proposal.id); + assert_eq!(latest.actions.len(), 2); + } + + #[test] + fn test_strip_llm_json_fences_plain() { + let raw = r#"{"duplicate": false, "reason": "ok"}"#; + assert_eq!(llm_fixes::strip_llm_json_fences(raw), raw); + } + + #[test] + fn test_strip_llm_json_fences_json_fence() { + let raw = "```json\n{\"duplicate\": true, \"reason\": \"same\"}\n```"; + let cleaned = llm_fixes::strip_llm_json_fences(raw); + assert_eq!(cleaned, r#"{"duplicate": true, "reason": "same"}"#); + } + + #[test] + fn test_strip_llm_json_fences_bare_fence() { + let raw = "```\n[{\"kind\":\"merge\"}]\n```"; + let cleaned = llm_fixes::strip_llm_json_fences(raw); + assert_eq!(cleaned, r#"[{"kind":"merge"}]"#); + } + + #[test] + fn test_strip_llm_json_fences_prose_wrapper() { + let raw = "Here is the answer: {\"duplicate\": false} — hope that helps!"; + let cleaned = llm_fixes::strip_llm_json_fences(raw); + assert_eq!(cleaned, r#"{"duplicate": false}"#); + } + + #[test] + fn test_strip_llm_json_fences_roundtrip_parse() { + // Exact shape that broke in production. + let raw = "```json\n{\"duplicate\": false, \"reason\": \"completely different topics\"}\n```"; + let cleaned = llm_fixes::strip_llm_json_fences(raw); + let v: serde_json::Value = serde_json::from_str(cleaned).expect("must parse"); + assert_eq!(v["duplicate"], false); + } + + #[tokio::test] + async fn test_verify_overlap_pair_handles_fenced_response() { + // Regression: model wraps JSON in ```json fences. + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .respond_with( + ResponseTemplate::new(200).set_body_json(chat_completion_body( + "```json\n{\"duplicate\": false, \"reason\": \"completely different topics\"}\n```", + )), + ) + .mount(&server) + .await; + + let (core, _dir) = open_core_with_llm(&server.uri()).await; + let atom_a = core.create_atom(crate::CreateAtomRequest { + content: "Rust ownership rules".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + let atom_b = core.create_atom(crate::CreateAtomRequest { + content: "Python GIL internals".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + + let (is_dup, reason) = + llm_fixes::verify_overlap_pair(&core, &atom_a.atom.id, &atom_b.atom.id) + .await + .expect("verify_overlap_pair must handle fenced response"); + assert!(!is_dup); + assert_eq!(reason, "completely different topics"); + } + + #[tokio::test] + async fn test_verify_contradiction_pair_handles_fenced_response() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .respond_with( + ResponseTemplate::new(200).set_body_json(chat_completion_body( + "```json\n{\"contradiction\": false, \"reason\": \"different subjects\"}\n```", + )), + ) + .mount(&server) + .await; + + let (core, _dir) = open_core_with_llm(&server.uri()).await; + let atom_a = core.create_atom(crate::CreateAtomRequest { + content: "A".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + let atom_b = core.create_atom(crate::CreateAtomRequest { + content: "B".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + + let (is_real, reason) = + llm_fixes::verify_contradiction_pair(&core, &atom_a.atom.id, &atom_b.atom.id) + .await + .expect("verify_contradiction_pair must handle fenced response"); + assert!(!is_real); + assert_eq!(reason, "different subjects"); + } + + #[tokio::test] + async fn test_propose_tag_restructure_handles_fenced_response() { + let fenced = "```json\n{\"summary\":\"test\",\"actions\":[]}\n```"; + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .respond_with( + ResponseTemplate::new(200).set_body_json(chat_completion_body(fenced)), + ) + .mount(&server) + .await; + + let (core, _dir) = open_core_with_llm(&server.uri()).await; + let proposal = llm_fixes::propose_tag_restructure(&core) + .await + .expect("propose_tag_restructure must handle fenced response"); + assert_eq!(proposal.summary, "test"); + assert_eq!(proposal.actions.len(), 0); + } + + #[tokio::test] + async fn test_set_atom_locked_persists_and_roundtrips() { + let server = MockServer::start().await; + let (core, _dir) = open_core_with_llm(&server.uri()).await; + let atom = core.create_atom(crate::CreateAtomRequest { + content: "locked content".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + + assert!(!core.is_atom_locked(&atom.atom.id).await.unwrap()); + core.set_atom_locked(&atom.atom.id, true).await.unwrap(); + assert!(core.is_atom_locked(&atom.atom.id).await.unwrap()); + + // The Atom struct read back from DB must reflect the flag too. + let refreshed = core.get_atom(&atom.atom.id).await.unwrap().unwrap(); + assert!(refreshed.atom.is_locked); + + core.set_atom_locked(&atom.atom.id, false).await.unwrap(); + assert!(!core.is_atom_locked(&atom.atom.id).await.unwrap()); + } + + #[tokio::test] + async fn test_strip_boilerplate_atom_refuses_locked() { + let server = MockServer::start().await; + let (core, _dir) = open_core_with_llm(&server.uri()).await; + let atom = core.create_atom(crate::CreateAtomRequest { + content: "# Book\n\nSource-of-truth content".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + core.set_atom_locked(&atom.atom.id, true).await.unwrap(); + + let err = llm_fixes::strip_boilerplate_atom(&core, &atom.atom.id, false) + .await + .expect_err("locked atom must refuse strip"); + let msg = format!("{err}"); + assert!(msg.contains("locked"), "error must mention lock: {msg}"); + } + + #[tokio::test] + async fn test_merge_contradicting_pair_refuses_when_either_locked() { + let server = MockServer::start().await; + let (core, _dir) = open_core_with_llm(&server.uri()).await; + let a = core.create_atom(crate::CreateAtomRequest { + content: "Claim X.".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + let b = core.create_atom(crate::CreateAtomRequest { + content: "Claim not-X.".to_string(), + source_url: None, published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + core.set_atom_locked(&a.atom.id, true).await.unwrap(); + + let err = llm_fixes::merge_contradicting_pair(&core, &a.atom.id, &b.atom.id, false) + .await + .expect_err("must refuse when one atom is locked"); + assert!(format!("{err}").contains("locked")); + } + + #[tokio::test] + async fn test_auto_resolve_broken_link_skips_locked() { + let server = MockServer::start().await; + let (core, _dir) = open_core_with_llm(&server.uri()).await; + let atom = core.create_atom(crate::CreateAtomRequest { + content: "see [x](./x.md)".to_string(), + source_url: Some("vault://notes/y.md".to_string()), + published_at: None, tag_ids: vec![], skip_if_source_exists: false, + }, |_| {}).await.unwrap().unwrap(); + core.set_atom_locked(&atom.atom.id, true).await.unwrap(); + + let outcome = llm_fixes::auto_resolve_broken_link(&core, &atom.atom.id, "[x](./x.md)", "x") + .await + .expect("must return without error for locked atom"); + match outcome { + llm_fixes::AutoResolveOutcome::Skipped { reason } => { + assert!(reason.contains("locked"), "skip reason should mention lock: {reason}"); + } + other => panic!("expected Skipped, got {:?}", other), + } + } + + #[tokio::test] + async fn test_wiki_excluded_tag_ids_roundtrip() { + let server = MockServer::start().await; + let (core, _dir) = open_core_with_llm(&server.uri()).await; + + // Empty by default. + let loaded = core.get_wiki_excluded_tag_ids().await.unwrap(); + assert!(loaded.is_empty()); + + // Save a set. + let ids = vec!["tag-private".to_string(), "tag-draft".to_string()]; + core.set_wiki_excluded_tag_ids(&ids).await.unwrap(); + let loaded = core.get_wiki_excluded_tag_ids().await.unwrap(); + assert_eq!(loaded, ids); + + // Clearing works. + core.set_wiki_excluded_tag_ids(&[]).await.unwrap(); + let loaded = core.get_wiki_excluded_tag_ids().await.unwrap(); + assert!(loaded.is_empty()); + } + // ==================== HealthThresholds::validate ==================== + + #[test] + fn test_thresholds_default_validates() { + let t = crate::health::HealthThresholds::default(); + assert!(t.validate().is_empty(), "defaults must be valid: {:?}", t.validate()); + } + + #[test] + fn test_thresholds_similarity_out_of_range() { + let mut t = crate::health::HealthThresholds::default(); + t.boilerplate_similarity = 1.5; + let errs = t.validate(); + assert!(errs.iter().any(|e| e.contains("boilerplate_similarity"))); + } + + #[test] + fn test_thresholds_similarity_nan() { + let mut t = crate::health::HealthThresholds::default(); + t.content_overlap_similarity_max = f32::NAN; + let errs = t.validate(); + assert!(errs.iter().any(|e| e.contains("finite"))); + } + + #[test] + fn test_thresholds_inverted_contradiction_window() { + let mut t = crate::health::HealthThresholds::default(); + t.contradiction_similarity_min = 0.95; + t.contradiction_similarity_max = 0.90; + let errs = t.validate(); + assert!(errs.iter().any(|e| e.contains("contradiction_similarity_min"))); + } + + #[test] + fn test_thresholds_negative_counts() { + let mut t = crate::health::HealthThresholds::default(); + t.boilerplate_min_clones = -1; + t.content_quality_short_chars = -10; + let errs = t.validate(); + assert!(errs.iter().any(|e| e.contains("boilerplate_min_clones"))); + assert!(errs.iter().any(|e| e.contains("content_quality_short_chars"))); + } + + #[test] + fn test_thresholds_wiki_min_atoms_zero_rejected() { + let mut t = crate::health::HealthThresholds::default(); + t.wiki_min_atoms_per_tag = 0; + let errs = t.validate(); + assert!(errs.iter().any(|e| e.contains("wiki_min_atoms_per_tag"))); + } + + #[test] + fn test_thresholds_short_geq_long_rejected() { + let mut t = crate::health::HealthThresholds::default(); + t.content_quality_short_chars = 20_000; + t.content_quality_long_chars = 15_000; + let errs = t.validate(); + assert!(errs.iter().any(|e| e.contains("content_quality_short_chars"))); + } +} \ No newline at end of file diff --git a/crates/atomic-core/src/health/types.rs b/crates/atomic-core/src/health/types.rs new file mode 100644 index 00000000..459f07de --- /dev/null +++ b/crates/atomic-core/src/health/types.rs @@ -0,0 +1,529 @@ +//! Health data types. +//! +//! Split out of `mod.rs` to keep the orchestrator module focused on control +//! flow. All public types crossing the `atomic-core` → server boundary live +//! here; check-specific rows (`DuplicatePair`, `WikiGap`, …) also live here +//! because they're part of the JSON payload returned by the health API. + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +// ==================== Core types ==================== + +/// Overall status derived from the numeric score. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum HealthStatus { + Healthy, + NeedsAttention, + Degraded, + Unhealthy, +} + +impl HealthStatus { + pub fn from_score(score: u32) -> Self { + match score { + 90..=100 => Self::Healthy, + 70..=89 => Self::NeedsAttention, + 50..=69 => Self::Degraded, + _ => Self::Unhealthy, + } + } + pub fn as_str(&self) -> &'static str { + match self { + Self::Healthy => "healthy", + Self::NeedsAttention => "needs_attention", + Self::Degraded => "degraded", + Self::Unhealthy => "unhealthy", + } + } +} + +/// Result for one individual health check. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthCheckResult { + /// "ok" | "warning" | "error" + pub status: String, + /// 0–100 contribution to the overall score + pub score: u32, + pub auto_fixable: bool, + pub requires_review: bool, + /// When true, this check is opinionated ("completeness-style") and does + /// NOT contribute to the overall score. Shown as a diagnostic. The user + /// can opt-in via health config to give it a non-zero weight. + #[serde(default)] + pub informational: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub fix_action: Option, + /// Check-specific numbers, lists, pairs, etc. + pub data: serde_json::Value, +} + +/// Complete health report returned by `GET /api/health/knowledge`. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthReport { + pub overall_score: u32, + pub overall_status: String, + pub computed_at: String, + pub atom_count: i32, + pub checks: HashMap, + pub auto_fixable: i32, + pub requires_review: i32, + #[serde(skip_serializing_if = "Option::is_none")] + pub previous_score: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub previous_check_scores: Option>, +} + +/// A single action taken (or that would be taken) during a fix run. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FixAction { + /// ID of the `health_fix_log` row (for undo). + pub id: String, + pub check: String, + pub action: String, + pub count: i32, + pub details: Vec, +} + +/// An issue that was skipped (too high tier, or no-op). +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkippedFix { + pub check: String, + pub reason: String, + pub count: i32, +} + +/// Response from `POST /api/health/fix`. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FixResponse { + pub mode: String, + pub actions_taken: Vec, + pub skipped: Vec, + pub new_score: u32, +} + +/// Fix safety tier. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FixTier { + /// Retry pipelines, process pending — zero risk. + Safe, + /// Delete orphan tags, generate missing wikis — logged, undoable. + Low, + /// Modify content (add headings, merge exact-source dupes) — dry-run first. + Medium, + /// Merges, splits, deletes — always requires user confirmation. + High, +} + +impl FixTier { + pub fn as_str(&self) -> &'static str { + match self { + Self::Safe => "safe", + Self::Low => "low", + Self::Medium => "medium", + Self::High => "high", + } + } +} + +/// What the caller wants the fix run to do. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct FixRequest { + /// Which checks to fix; `None` = all auto-fixable checks. + pub checks: Option>, + /// "auto" = execute changes; "dry_run" = report without executing. + pub mode: String, + /// Include Medium-tier fixes (default false). + #[serde(default)] + pub include_medium: bool, +} + +impl FixRequest { + pub fn is_dry_run(&self) -> bool { + self.mode == "dry_run" + } + pub fn max_tier(&self) -> FixTier { + if self.include_medium { + FixTier::Medium + } else { + FixTier::Low + } + } +} + +// ==================== Raw data types used across checks ==================== + +/// Atom pair with high similarity (potential duplicate). +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DuplicatePair { + pub pair_id: String, + pub atom_a_id: String, + pub atom_a_title: String, + pub atom_a_source: Option, + pub atom_b_id: String, + pub atom_b_title: String, + pub atom_b_source: Option, + pub similarity: f32, + /// Number of tags shared between the two atoms (higher = more likely related). + pub shared_tag_count: i32, + pub atom_a_created_at: Option, + pub atom_b_created_at: Option, +} + +/// Tag eligible for wiki that doesn't have one yet. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WikiGap { + pub tag_id: String, + pub tag_name: String, + pub atom_count: i32, +} + +/// Wiki that exists but is out of date. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WikiStaleEntry { + pub tag_id: String, + pub tag_name: String, + pub new_atom_count: i32, +} + +/// Atom preview for review sections that need title + date without full content. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct AtomPreview { + pub id: String, + pub title: String, + pub created_at: String, +} + +/// Boilerplate-affected atom with clone count for prioritised review. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct BoilerplateAtomEntry { + pub id: String, + pub title: String, + /// Number of semantic edges at similarity ≥0.99 from this atom. + pub clone_count: i32, +} + +/// Atom stub used inside contradiction pair entries. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ContradictionAtom { + pub id: String, + pub title: String, + pub source: Option, + pub created_at: Option, +} + +/// Pair of high-similarity atoms surfaced for manual contradiction review. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ContradictionPairEntry { + pub pair_id: String, + pub atom_a: ContradictionAtom, + pub atom_b: ContradictionAtom, + /// Similarity score 0.0–1.0 (expected range 0.75–0.92 for contradictions). + pub similarity: f32, + pub shared_tag_count: i32, +} + +/// Rootless tag entry for the tag-health review list. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct RootlessTagEntry { + pub id: String, + pub name: String, + pub atom_count: i32, +} + +#[derive(Debug, Clone, Default)] +pub struct SingleAtomTagEntry { + pub id: String, + pub name: String, + pub is_autotag: bool, +} + +// ==================== Tag Proposal Types ==================== + +/// One proposed structural change to the tag tree. +#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum TagProposalAction { + Merge { + from_id: String, + into_id: String, + from_name: String, + into_name: String, + reason: String, + }, + Rename { + tag_id: String, + old_name: String, + new_name: String, + reason: String, + }, + Reparent { + tag_id: String, + tag_name: String, + new_parent_id: Option, + new_parent_name: Option, + reason: String, + }, + Delete { + tag_id: String, + tag_name: String, + reason: String, + }, +} + +/// An LLM-generated proposal to reorganise the tag tree. +#[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +pub struct TagProposal { + /// UUID used to apply the proposal later. + pub id: String, + /// One-paragraph LLM rationale. + pub summary: String, + pub actions: Vec, + /// RFC-3339 timestamp of generation. + pub generated_at: String, +} + +/// Per-DB health configuration. +/// +/// Stored as JSON under the `health_config` setting key in each data DB. +/// Empty / missing → all defaults (informational checks score-excluded, +/// default-weighted checks use CHECK_WEIGHTS). +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize, PartialEq)] +pub struct HealthConfig { + /// Per-check overrides. `enabled: false` suppresses the check entirely; + /// `weight: Some(w)` contributes it to the overall score at that weight + /// (sum of effective weights is renormalized). + #[serde(default)] + pub overrides: std::collections::HashMap, + + /// Detection thresholds shared across the synchronous health checks. + /// Missing / partial values fall back to `HealthThresholds::default()`. + #[serde(default)] + pub thresholds: HealthThresholds, +} + +/// Tunable detection thresholds. Every field has a sane default baked in via +/// `Default` so a fresh DB works without any config. Serialised with +/// `#[serde(default)]` on each field so adding new thresholds is forward- +/// compatible — older configs deserialise into current defaults. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq)] +pub struct HealthThresholds { + // ---- boilerplate_pollution ---- + /// Edges at/above this similarity are treated as template clones. Default 0.99. + #[serde(default = "default_boilerplate_similarity")] + pub boilerplate_similarity: f32, + /// Minimum clone-edge count before an atom is flagged. Default 2. + #[serde(default = "default_boilerplate_min_clones")] + pub boilerplate_min_clones: i32, + + // ---- contradiction_detection ---- + /// Lower bound (inclusive) of the contradiction similarity window. Default 0.80. + #[serde(default = "default_contradiction_sim_min")] + pub contradiction_similarity_min: f32, + /// Upper bound (exclusive) of the contradiction similarity window. Default 0.92. + #[serde(default = "default_contradiction_sim_max")] + pub contradiction_similarity_max: f32, + /// Minimum shared-tag count for a pair to surface. Default 1. + #[serde(default = "default_contradiction_shared_tags")] + pub contradiction_shared_tags_min: i32, + /// Token-Jaccard upper bound for contradiction pairs. Pairs whose atom + /// contents overlap at/above this fraction of unique tokens are treated + /// as template/boilerplate clones and filtered out of the contradiction + /// list — real contradictions express *different* claims and therefore + /// different token sets. Default 0.70. + #[serde(default = "default_contradiction_max_jaccard")] + pub contradiction_max_content_jaccard: f32, + + // ---- content_overlap (cross-source near-duplicates) ---- + /// Lower bound (inclusive) of the cross-source overlap window. Default 0.55. + #[serde(default = "default_overlap_sim_min")] + pub content_overlap_similarity_min: f32, + /// Upper bound (inclusive) of the cross-source overlap window. Default 0.85. + #[serde(default = "default_overlap_sim_max")] + pub content_overlap_similarity_max: f32, + /// Minimum shared-tag count for a pair to surface. Default 2. + #[serde(default = "default_overlap_shared_tags")] + pub content_overlap_shared_tags_min: i32, + + // ---- content_quality ---- + /// Atoms shorter than this are flagged as `very_short`. Default 100. + #[serde(default = "default_short_chars")] + pub content_quality_short_chars: i32, + /// Atoms longer than this are flagged as `very_long`. Default 15_000. + #[serde(default = "default_long_chars")] + pub content_quality_long_chars: i32, + + // ---- wiki_coverage ---- + /// Minimum atoms per tag for the tag to be "wiki-eligible". Default 5. + #[serde(default = "default_wiki_min_atoms")] + pub wiki_min_atoms_per_tag: i32, + + // ---- tag_health ---- + /// Max autotag single-atom tags before the check penalises. Default 3. + #[serde(default = "default_single_atom_tag_threshold")] + pub tag_health_single_atom_threshold: i32, + + // ---- semantic_graph_freshness ---- + /// Atoms added since last rebuild before score drops from warning to error. Default 20. + #[serde(default = "default_graph_freshness_warning")] + pub semantic_graph_freshness_warning: i32, +} + +impl Default for HealthThresholds { + fn default() -> Self { + Self { + boilerplate_similarity: default_boilerplate_similarity(), + boilerplate_min_clones: default_boilerplate_min_clones(), + contradiction_similarity_min: default_contradiction_sim_min(), + contradiction_similarity_max: default_contradiction_sim_max(), + contradiction_shared_tags_min: default_contradiction_shared_tags(), + contradiction_max_content_jaccard: default_contradiction_max_jaccard(), + content_overlap_similarity_min: default_overlap_sim_min(), + content_overlap_similarity_max: default_overlap_sim_max(), + content_overlap_shared_tags_min: default_overlap_shared_tags(), + content_quality_short_chars: default_short_chars(), + content_quality_long_chars: default_long_chars(), + wiki_min_atoms_per_tag: default_wiki_min_atoms(), + tag_health_single_atom_threshold: default_single_atom_tag_threshold(), + semantic_graph_freshness_warning: default_graph_freshness_warning(), + } + } +} + +fn default_boilerplate_similarity() -> f32 { 0.99 } +fn default_boilerplate_min_clones() -> i32 { 2 } +fn default_contradiction_sim_min() -> f32 { 0.80 } +fn default_contradiction_sim_max() -> f32 { 0.92 } +fn default_contradiction_shared_tags() -> i32 { 1 } +fn default_contradiction_max_jaccard() -> f32 { 0.70 } +fn default_overlap_sim_min() -> f32 { 0.55 } +fn default_overlap_sim_max() -> f32 { 0.85 } +fn default_overlap_shared_tags() -> i32 { 2 } +fn default_short_chars() -> i32 { 100 } +fn default_long_chars() -> i32 { 15_000 } +fn default_wiki_min_atoms() -> i32 { 5 } +fn default_single_atom_tag_threshold() -> i32 { 3 } +fn default_graph_freshness_warning() -> i32 { 20 } + + +impl HealthThresholds { + /// Validate user-supplied thresholds. Returns a list of problems, empty on success. + /// + /// Rules are deliberately lenient — we only reject values that would cause the + /// SQL / score math to misbehave (NaN, negative counts, similarities outside [0,1], + /// inverted min/max windows). Tightening beyond that is an editorial choice, left + /// to the UI. + pub fn validate(&self) -> Vec { + let mut errs = Vec::new(); + + // ---- similarities must be finite and within [0.0, 1.0] ---- + let sims: [(&str, f32); 6] = [ + ("boilerplate_similarity", self.boilerplate_similarity), + ("contradiction_similarity_min", self.contradiction_similarity_min), + ("contradiction_similarity_max", self.contradiction_similarity_max), + ("contradiction_max_content_jaccard", self.contradiction_max_content_jaccard), + ("content_overlap_similarity_min", self.content_overlap_similarity_min), + ("content_overlap_similarity_max", self.content_overlap_similarity_max), + ]; + for (name, v) in sims { + if !v.is_finite() { + errs.push(format!("{name} must be a finite number")); + } else if !(0.0..=1.0).contains(&v) { + errs.push(format!("{name} must be in [0.0, 1.0] (got {v})")); + } + } + + // ---- min/max windows must not be inverted ---- + if self.contradiction_similarity_min >= self.contradiction_similarity_max { + errs.push(format!( + "contradiction_similarity_min ({}) must be < contradiction_similarity_max ({})", + self.contradiction_similarity_min, self.contradiction_similarity_max, + )); + } + if self.content_overlap_similarity_min > self.content_overlap_similarity_max { + errs.push(format!( + "content_overlap_similarity_min ({}) must be ≤ content_overlap_similarity_max ({})", + self.content_overlap_similarity_min, self.content_overlap_similarity_max, + )); + } + + // ---- non-negative integer counts ---- + let non_neg: [(&str, i32); 7] = [ + ("boilerplate_min_clones", self.boilerplate_min_clones), + ("contradiction_shared_tags_min", self.contradiction_shared_tags_min), + ("content_overlap_shared_tags_min", self.content_overlap_shared_tags_min), + ("content_quality_short_chars", self.content_quality_short_chars), + ("content_quality_long_chars", self.content_quality_long_chars), + ("tag_health_single_atom_threshold", self.tag_health_single_atom_threshold), + ("semantic_graph_freshness_warning", self.semantic_graph_freshness_warning), + ]; + for (name, v) in non_neg { + if v < 0 { + errs.push(format!("{name} must be ≥ 0 (got {v})")); + } + } + + // ---- wiki min_atoms must be ≥ 1 (0 would make every tag "wiki-eligible") ---- + if self.wiki_min_atoms_per_tag < 1 { + errs.push(format!( + "wiki_min_atoms_per_tag must be ≥ 1 (got {})", + self.wiki_min_atoms_per_tag, + )); + } + + // ---- short_chars must be < long_chars (else every atom is both) ---- + if self.content_quality_short_chars >= self.content_quality_long_chars { + errs.push(format!( + "content_quality_short_chars ({}) must be < content_quality_long_chars ({})", + self.content_quality_short_chars, self.content_quality_long_chars, + )); + } + + errs + } +} + +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq)] +pub struct HealthCheckOverride { + /// When false, the check is not run and not displayed. Default: true. + #[serde(default = "default_enabled")] + pub enabled: bool, + /// When `Some`, use this weight in the overall score (overrides default + /// and lifts informational checks into scoring if > 0). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub weight: Option, +} + +fn default_enabled() -> bool { + true +} + +impl Default for HealthCheckOverride { + fn default() -> Self { + Self { + enabled: true, + weight: None, + } + } +} diff --git a/crates/atomic-core/src/lib.rs b/crates/atomic-core/src/lib.rs index 703282c3..5b6f2d1f 100644 --- a/crates/atomic-core/src/lib.rs +++ b/crates/atomic-core/src/lib.rs @@ -30,6 +30,7 @@ pub mod agent; pub(crate) mod atom_links; pub mod briefing; +pub(crate) mod boilerplate; pub mod canvas_level; pub mod chat; pub mod chunking; @@ -42,6 +43,7 @@ pub mod executor; pub mod export; pub mod extraction; pub mod graph_maintenance; +pub mod health; pub mod import; pub mod ingest; pub mod manager; @@ -507,6 +509,19 @@ impl AtomicCore { &self.storage } + /// Suggest atom candidates for a broken link query. + /// Returns vec of (atom_id, title, source_url, score). + pub async fn suggest_atoms_for_broken_link( + &self, + q: &str, + limit: i32, + ) -> Result, f32)>, crate::error::AtomicCoreError> { + self.storage + .suggest_atoms_by_query_sync(q.to_string(), limit) + .await + } + + // ==================== Settings ==================== // // Resolution model (see `settings::WORKSPACE_ONLY_KEYS` and @@ -2151,6 +2166,58 @@ impl AtomicCore { Ok(result) } + /// Get a tag name and parent_id by ID. + pub async fn get_tag_by_id( + &self, + tag_id: &str, + ) -> Result)>, AtomicCoreError> { + self.storage.get_tag_by_id_sync(tag_id).await + } + + /// Get the most recent un-applied tag proposal, if any. + pub async fn get_latest_tag_proposal( + &self, + ) -> Result, AtomicCoreError> { + self.storage.get_latest_tag_proposal_sync().await + } + + /// Persist a health dismissal (insert or update). + pub async fn dismiss_health_item( + &self, + check_name: &str, + item_key: &str, + reason: &str, + expires_at: Option<&str>, + ) -> Result<(), AtomicCoreError> { + self.storage + .dismiss_health_item_sync(check_name, item_key, reason, expires_at) + .await + } + + /// Remove a health dismissal. + pub async fn undismiss_health_item( + &self, + check_name: &str, + item_key: &str, + ) -> Result<(), AtomicCoreError> { + self.storage + .undismiss_health_item_sync(check_name, item_key) + .await + } + + /// GC stale health dismissal rows (expired TTL + orphaned atom/tag refs). + pub async fn gc_health_dismissals(&self) -> Result { + self.storage.gc_dismissals_sync().await + } + + /// List active dismissals for a check. Returns (item_key, reason) pairs. + pub async fn list_dismissed_keys( + &self, + check_name: &str, + ) -> Result, AtomicCoreError> { + self.storage.list_dismissed_keys_sync(check_name).await + } + // ==================== Chat Operations ==================== /// Create a new conversation @@ -3571,6 +3638,174 @@ impl AtomicCore { pub async fn recompute_all_tag_embeddings(&self) -> Result { self.storage.recompute_all_tag_embeddings_sync().await } + + // ==================== Health ==================== + + /// Compute a full health report across all 10 checks. + pub async fn compute_health(&self) -> Result { + crate::health::compute_health(self).await + } + + /// Load this database's `HealthConfig`. Missing/invalid → defaults. + /// + /// Reads from the per-database `settings` table (NOT the registry), so + /// each data DB can have its own config. See `AGENTS.md` § Multi-DB + /// Gotchas for why we bypass `AtomicCore::get_setting` here. + pub async fn get_health_config(&self) -> Result { + let raw = self.storage().get_setting_sync("health_config").await?; + match raw { + Some(s) => Ok(serde_json::from_str(&s).unwrap_or_default()), + None => Ok(crate::health::HealthConfig::default()), + } + } + + /// Persist this database's `HealthConfig`. + pub async fn set_health_config( + &self, + config: &crate::health::HealthConfig, + ) -> Result<(), AtomicCoreError> { + let errs = config.thresholds.validate(); + if !errs.is_empty() { + return Err(AtomicCoreError::Validation(format!( + "invalid health thresholds: {}", + errs.join("; "), + ))); + } + let json = serde_json::to_string(config) + .map_err(|e| AtomicCoreError::Validation(format!("serialize health_config: {e}")))?; + self.storage().set_setting_sync("health_config", &json).await + } + + /// Set the lock flag on an atom. Locked atoms are protected from automated + /// health fixes (strip-boilerplate, merge-duplicate, resolve-contradiction, + /// relink-broken-link). + pub async fn set_atom_locked(&self, atom_id: &str, locked: bool) -> Result<(), AtomicCoreError> { + let sqlite = self.storage.as_sqlite().ok_or_else(|| { + AtomicCoreError::Configuration( + "Atom lock is not yet supported with Postgres backend".to_string(), + ) + })?; + let conn = sqlite.db.conn.lock().map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let affected = conn.execute( + "UPDATE atoms SET is_locked = ?1, updated_at = ?2 WHERE id = ?3", + rusqlite::params![locked as i64, chrono::Utc::now().to_rfc3339(), atom_id], + ).map_err(AtomicCoreError::Database)?; + if affected == 0 { + return Err(AtomicCoreError::Validation(format!("atom not found: {atom_id}"))); + } + Ok(()) + } + + /// True when the atom's `is_locked` flag is set. Non-existent atoms return false. + pub async fn is_atom_locked(&self, atom_id: &str) -> Result { + let sqlite = self.storage.as_sqlite().ok_or_else(|| { + AtomicCoreError::Configuration( + "Atom lock is not yet supported with Postgres backend".to_string(), + ) + })?; + let conn = sqlite.db.conn.lock().map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let locked: i64 = conn + .query_row( + "SELECT COALESCE(is_locked, 0) FROM atoms WHERE id = ?1", + rusqlite::params![atom_id], + |row| row.get(0), + ) + .unwrap_or(0); + Ok(locked != 0) + } + + /// Load the per-DB list of tag ids to exclude from wiki generation. + /// + /// Any atom tagged with ANY excluded tag must not be visible to wiki LLM + /// prompts. Stored as a JSON array of strings under the per-DB + /// `wiki_excluded_tag_ids` setting. Missing/invalid → empty. + pub async fn get_wiki_excluded_tag_ids(&self) -> Result, AtomicCoreError> { + let raw = self.storage().get_setting_sync("wiki_excluded_tag_ids").await?; + match raw { + Some(s) => Ok(serde_json::from_str(&s).unwrap_or_default()), + None => Ok(Vec::new()), + } + } + + /// Persist the per-DB list of excluded tag ids for wiki generation. + pub async fn set_wiki_excluded_tag_ids(&self, tag_ids: &[String]) -> Result<(), AtomicCoreError> { + let json = serde_json::to_string(tag_ids) + .map_err(|e| AtomicCoreError::Validation(format!("serialize excluded tags: {e}")))?; + self.storage().set_setting_sync("wiki_excluded_tag_ids", &json).await + } + + /// Load this database's custom health checks. Stored per-DB as a JSON + /// array under the `custom_health_checks` setting key (NOT registry). + pub async fn get_custom_health_checks( + &self, + ) -> Result, AtomicCoreError> { + let raw = self.storage().get_setting_sync("custom_health_checks").await?; + match raw { + Some(s) => Ok(serde_json::from_str(&s).unwrap_or_default()), + None => Ok(Vec::new()), + } + } + + /// Persist this database's custom health checks. + pub async fn set_custom_health_checks( + &self, + checks: &[crate::health::custom::CustomCheck], + ) -> Result<(), AtomicCoreError> { + let json = serde_json::to_string(checks) + .map_err(|e| AtomicCoreError::Validation(format!("serialize custom_health_checks: {e}")))?; + self.storage().set_setting_sync("custom_health_checks", &json).await + } + + /// Evaluate a rule against this DB without saving it. Used by the UI + /// for live preview while users author custom checks. + pub async fn preview_custom_health_check( + &self, + rule: &crate::health::custom::CustomRule, + ) -> Result { + let sqlite = self.storage.as_sqlite().ok_or_else(|| { + AtomicCoreError::Configuration( + "Custom health checks are not yet supported with Postgres backend".to_string(), + ) + })?; + crate::health::custom::preview_rule(sqlite, rule) + } + + /// Run auto-fixes up to the requested tier. Returns a `FixResponse` with + /// actions taken, skipped issues, and the new score. + pub async fn run_health_fix( + &self, + req: &crate::health::FixRequest, + ) -> Result { + crate::health::run_fix(self, req).await + } + + /// Undo a previously applied fix by its log ID. + pub async fn undo_health_fix(&self, fix_id: &str) -> Result<(), AtomicCoreError> { + crate::health::audit::undo(self, fix_id).await + } + + /// Fetch the most recently stored health report without recomputing. + pub async fn get_latest_health_report( + &self, + ) -> Result, AtomicCoreError> { + self.storage.get_latest_health_report_sync().await + } + + /// Fetch recent stored health reports for trend display. + pub async fn get_health_reports( + &self, + limit: i32, + ) -> Result, AtomicCoreError> { + self.storage.get_health_reports_sync(limit).await + } + + /// Fetch recent fix log entries (most recent first). + pub async fn get_recent_health_fixes( + &self, + limit: i32, + ) -> Result, AtomicCoreError> { + self.storage.get_recent_fixes_sync(limit).await + } } fn oauth_unavailable() -> AtomicCoreError { @@ -4108,10 +4343,10 @@ pub(crate) fn parse_source(source_url: &str) -> String { } /// Standard SELECT columns for reading an Atom from the DB. -pub(crate) const ATOM_COLUMNS: &str = "id, content, title, snippet, source_url, source, published_at, created_at, updated_at, COALESCE(embedding_status, 'pending'), COALESCE(tagging_status, 'pending'), embedding_error, tagging_error"; +pub(crate) const ATOM_COLUMNS: &str = "id, content, title, snippet, source_url, source, published_at, created_at, updated_at, COALESCE(embedding_status, 'pending'), COALESCE(tagging_status, 'pending'), embedding_error, tagging_error, COALESCE(is_locked, 0)"; /// Same columns but table-aliased for JOINs. -pub(crate) const ATOM_COLUMNS_A: &str = "a.id, a.content, a.title, a.snippet, a.source_url, a.source, a.published_at, a.created_at, a.updated_at, COALESCE(a.embedding_status, 'pending'), COALESCE(a.tagging_status, 'pending'), a.embedding_error, a.tagging_error"; +pub(crate) const ATOM_COLUMNS_A: &str = "a.id, a.content, a.title, a.snippet, a.source_url, a.source, a.published_at, a.created_at, a.updated_at, COALESCE(a.embedding_status, 'pending'), COALESCE(a.tagging_status, 'pending'), a.embedding_error, a.tagging_error, COALESCE(a.is_locked, 0)"; /// Parse an Atom from a row selected with ATOM_COLUMNS. pub(crate) fn atom_from_row(row: &rusqlite::Row) -> rusqlite::Result { @@ -4129,6 +4364,7 @@ pub(crate) fn atom_from_row(row: &rusqlite::Row) -> rusqlite::Result { tagging_status: row.get(10)?, embedding_error: row.get(11)?, tagging_error: row.get(12)?, + is_locked: row.get::<_, i64>(13).unwrap_or(0) != 0, }) } diff --git a/crates/atomic-core/src/models.rs b/crates/atomic-core/src/models.rs index 5be0473b..c3115461 100644 --- a/crates/atomic-core/src/models.rs +++ b/crates/atomic-core/src/models.rs @@ -22,6 +22,11 @@ pub struct Atom { pub tagging_status: String, // 'pending', 'processing', 'complete', 'failed', 'skipped' pub embedding_error: Option, pub tagging_error: Option, + /// When true, this atom is protected from automated health-fix mutations + /// (strip-boilerplate, auto-merge-duplicate, auto-resolve-contradiction, + /// relink-broken-link). It remains readable and manually editable. + #[serde(default)] + pub is_locked: bool, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -367,6 +372,10 @@ pub struct WikiArticleSummary { pub updated_at: String, pub atom_count: i32, pub inbound_links: i32, + /// Live count of atoms tagged under this tag hierarchy that have been added + /// since the article was last generated. Computed server-side via the same + /// recursive CTE used by `GET /api/wiki/{tag_id}/status`; never stale. + pub new_atoms_available: i32, } /// Inter-article wiki link (cross-reference between wiki articles) diff --git a/crates/atomic-core/src/settings.rs b/crates/atomic-core/src/settings.rs index b9ba5709..b701c8bf 100644 --- a/crates/atomic-core/src/settings.rs +++ b/crates/atomic-core/src/settings.rs @@ -82,6 +82,13 @@ pub const DEFAULT_SETTINGS: &[(&str, &str)] = &[ ("task.draft_pipeline.enabled", "true"), ("task.draft_pipeline.interval_minutes", "1"), ("task.draft_pipeline.quiet_minutes", "1"), + // Health maintenance task + ("task.health_maintenance.enabled", "true"), + ("task.health_maintenance.interval_hours", "24"), + // Health LLM prompt templates (empty = use built-in defaults) + ("health.merge_duplicates_prompt", ""), + ("health.contradiction_detection_prompt", ""), + ("health.strip_boilerplate_prompt", ""), ]; /// Migrate settings - add any missing default settings diff --git a/crates/atomic-core/src/storage/mod.rs b/crates/atomic-core/src/storage/mod.rs index 9c33367f..e0a21055 100644 --- a/crates/atomic-core/src/storage/mod.rs +++ b/crates/atomic-core/src/storage/mod.rs @@ -130,6 +130,437 @@ impl StorageBackend { } } +impl StorageBackend { + // ==================== Health dispatch methods ==================== + // These are not part of the Storage trait — health is an internal concern. + // Postgres returns an error for all health operations (not yet supported). + + pub(crate) async fn health_check_data_sync( + &self, + thresholds: crate::health::HealthThresholds, + ) -> Result { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.health_check_data_impl(&thresholds)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation( + "health checks not yet supported on Postgres storage".to_string(), + )), + } + } + + pub(crate) async fn store_health_report_sync( + &self, + report: &crate::health::audit::StoredHealthReport, + ) -> Result<(), AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let report = report.clone(); + tokio::task::spawn_blocking(move || s.store_health_report_impl(&report)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health reports not supported on Postgres storage".into())), + } + } + + pub(crate) async fn get_latest_health_report_sync( + &self, + ) -> Result, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.get_latest_health_report_impl()) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health reports not supported on Postgres storage".into())), + } + } + + pub(crate) async fn get_health_reports_sync( + &self, + limit: i32, + ) -> Result, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.get_health_reports_impl(limit)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health reports not supported on Postgres storage".into())), + } + } + + pub(crate) async fn log_fix_action_sync( + &self, + log: &crate::health::audit::HealthFixLog, + ) -> Result<(), AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let log = log.clone(); + tokio::task::spawn_blocking(move || s.log_fix_action_impl(&log)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health fix log not supported on Postgres storage".into())), + } + } + + pub(crate) async fn get_fix_log_sync( + &self, + fix_id: &str, + ) -> Result, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let fix_id = fix_id.to_string(); + tokio::task::spawn_blocking(move || s.get_fix_log_impl(&fix_id)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health fix log not supported on Postgres storage".into())), + } + } + + pub(crate) async fn get_recent_fixes_sync( + &self, + limit: i32, + ) -> Result, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.get_recent_fixes_impl(limit)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health fix log not supported on Postgres storage".into())), + } + } + + pub(crate) async fn mark_fix_undone_sync(&self, fix_id: &str) -> Result<(), AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let fix_id = fix_id.to_string(); + tokio::task::spawn_blocking(move || s.mark_fix_undone_impl(&fix_id)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health fix log not supported on Postgres storage".into())), + } + } + + pub(crate) async fn reset_skipped_untagged_to_pending_sync( + &self, + ) -> Result { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || { + s.reset_skipped_untagged_to_pending_impl() + }) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health fixes not supported on Postgres storage".into())), + } + } + + // ==================== Link resolution dispatch ==================== + + pub(crate) async fn get_link_candidate_atoms_sync( + &self, + ) -> Result)>, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.get_link_candidate_atoms_impl()) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("link-resolution helpers not supported on Postgres storage".into())), + } + } + + pub(crate) async fn find_atoms_by_source_urls_sync( + &self, + urls: Vec, + ) -> Result, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.find_atoms_by_source_urls_impl(&urls)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("link-resolution helpers not supported on Postgres storage".into())), + } + } + + pub(crate) async fn find_atom_by_wikilink_name_sync( + &self, + name: String, + vault_prefix: String, + ) -> Result, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || { + s.find_atom_by_wikilink_name_impl(&name, &vault_prefix) + }) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("link-resolution helpers not supported on Postgres storage".into())), + } + } + + + pub(crate) async fn suggest_atoms_by_query_sync( + &self, + q: String, + limit: i32, + ) -> Result, f32)>, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.suggest_atoms_by_query_impl(&q, limit)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("atom-query suggestion not supported on Postgres storage".into())), + } + } + pub(crate) async fn get_tag_by_id_sync( + &self, + tag_id: &str, + ) -> Result)>, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let tag_id = tag_id.to_string(); + tokio::task::spawn_blocking(move || s.get_tag_by_id_impl(&tag_id)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("tag lookup helper not supported on Postgres storage".into())), + } + } + + pub(crate) async fn list_dismissed_keys_sync( + &self, + check_name: &str, + ) -> Result, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let check_name = check_name.to_string(); + tokio::task::spawn_blocking(move || s.list_dismissed_keys_impl(&check_name)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health dismissals not supported on Postgres storage".into())), + } + } + + pub(crate) async fn dismiss_health_item_sync( + &self, + check_name: &str, + item_key: &str, + reason: &str, + expires_at: Option<&str>, + ) -> Result<(), AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let check_name = check_name.to_string(); + let item_key = item_key.to_string(); + let reason = reason.to_string(); + let expires_at = expires_at.map(String::from); + tokio::task::spawn_blocking(move || { + s.dismiss_health_item_impl(&check_name, &item_key, &reason, expires_at.as_deref()) + }) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health dismissals not supported on Postgres storage".into())), + } + } + + pub(crate) async fn undismiss_health_item_sync( + &self, + check_name: &str, + item_key: &str, + ) -> Result<(), AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let check_name = check_name.to_string(); + let item_key = item_key.to_string(); + tokio::task::spawn_blocking(move || { + s.undismiss_health_item_impl(&check_name, &item_key) + }) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health dismissals not supported on Postgres storage".into())), + } + } + + pub(crate) async fn gc_dismissals_sync(&self) -> Result { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.gc_dismissals_impl()) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("health dismissals not supported on Postgres storage".into())), + } + } + + pub(crate) async fn save_tag_proposal_sync( + &self, + proposal: crate::health::TagProposal, + ) -> Result<(), AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.save_tag_proposal_impl(&proposal)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("tag proposals not supported on Postgres storage".into())), + } + } + + pub(crate) async fn get_tag_proposal_sync( + &self, + id: &str, + ) -> Result, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let id = id.to_string(); + tokio::task::spawn_blocking(move || s.get_tag_proposal_impl(&id)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("tag proposals not supported on Postgres storage".into())), + } + } + + pub(crate) async fn get_latest_tag_proposal_sync( + &self, + ) -> Result, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.get_latest_tag_proposal_impl()) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("tag proposals not supported on Postgres storage".into())), + } + } + + pub(crate) async fn mark_tag_proposal_applied_sync( + &self, + id: &str, + ) -> Result<(), AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let id = id.to_string(); + tokio::task::spawn_blocking(move || s.mark_tag_proposal_applied_impl(&id)) + .await + .map_err(join_err)? + } + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Err(AtomicCoreError::DatabaseOperation("tag proposals not supported on Postgres storage".into())), + } + } + pub(crate) async fn count_chunk_hash_occurrences_sync( + &self, + hashes: &[String], + ) -> Result, AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let hashes = hashes.to_vec(); + tokio::task::spawn_blocking(move || s.count_chunk_hash_occurrences_impl(&hashes)) + .await + .map_err(join_err)? + } + // Postgres: no content_hash column yet; returning "no duplicates" + // is a correct fallback — boilerplate filter becomes a no-op. + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Ok(std::collections::HashMap::new()), + } + } + + pub(crate) async fn delete_vec_chunks_by_ids_sync( + &self, + chunk_ids: &[String], + ) -> Result<(), AtomicCoreError> { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + let chunk_ids = chunk_ids.to_vec(); + tokio::task::spawn_blocking(move || s.delete_vec_chunks_by_ids_impl(&chunk_ids)) + .await + .map_err(join_err)? + } + // Postgres: boilerplate filter is a no-op (see count_chunk_hash_occurrences_sync). + // Nothing to delete. + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Ok(()), + } + } + + pub(crate) async fn backfill_content_hashes_sync(&self) -> Result { + match self { + StorageBackend::Sqlite(s) => { + let s = s.clone(); + tokio::task::spawn_blocking(move || s.backfill_content_hashes_impl()) + .await + .map_err(join_err)? + } + // Postgres: no content_hash column to backfill yet. + #[cfg(feature = "postgres")] + StorageBackend::Postgres(_) => Ok(0), + } + } +} + // ==================== Async dispatch methods ==================== // // Each method dispatches to either the SqliteStorage sync helper @@ -643,4 +1074,4 @@ dispatch! { => sqlite: migrate_legacy_token_sync, pg_trait: TokenStore, pg_method: migrate_legacy_token; fn ensure_default_token_sync(&self) -> Result, AtomicCoreError> => sqlite: ensure_default_token_sync, pg_trait: TokenStore, pg_method: ensure_default_token; -} +} \ No newline at end of file diff --git a/crates/atomic-core/src/storage/postgres/atoms.rs b/crates/atomic-core/src/storage/postgres/atoms.rs index 1e80105f..5e19be13 100644 --- a/crates/atomic-core/src/storage/postgres/atoms.rs +++ b/crates/atomic-core/src/storage/postgres/atoms.rs @@ -183,20 +183,21 @@ impl PostgresStorage { ), ) -> Atom { Atom { - id: row.0, - content: row.1, - title: row.2, - snippet: row.3, - source_url: row.4, - source: row.5, - published_at: row.6, - created_at: row.7, - updated_at: row.8, - embedding_status: row.9, - tagging_status: row.10, - embedding_error: row.11, - tagging_error: row.12, - } + id: row.0, + content: row.1, + title: row.2, + snippet: row.3, + source_url: row.4, + source: row.5, + published_at: row.6, + created_at: row.7, + updated_at: row.8, + embedding_status: row.9, + tagging_status: row.10, + embedding_error: row.11, + tagging_error: row.12, + is_locked: false, + } } } @@ -364,6 +365,7 @@ impl AtomStore for PostgresStorage { tagging_status: tagging_status.to_string(), embedding_error: None, tagging_error: None, + is_locked: false, }; Ok(AtomWithTags { atom, tags }) @@ -429,6 +431,7 @@ impl AtomStore for PostgresStorage { tagging_status: "pending".to_string(), embedding_error: None, tagging_error: None, + is_locked: false, }; atoms_with_tags.push(AtomWithTags { atom, tags: vec![] }); diff --git a/crates/atomic-core/src/storage/postgres/briefings.rs b/crates/atomic-core/src/storage/postgres/briefings.rs index 730ee9ff..df12a726 100644 --- a/crates/atomic-core/src/storage/postgres/briefings.rs +++ b/crates/atomic-core/src/storage/postgres/briefings.rs @@ -82,6 +82,7 @@ impl BriefingStore for PostgresStorage { tagging_status, embedding_error, tagging_error, + is_locked: false, }, ) .collect(); diff --git a/crates/atomic-core/src/storage/postgres/chunks.rs b/crates/atomic-core/src/storage/postgres/chunks.rs index 2bcd971a..10273e16 100644 --- a/crates/atomic-core/src/storage/postgres/chunks.rs +++ b/crates/atomic-core/src/storage/postgres/chunks.rs @@ -568,20 +568,21 @@ impl ChunkStore for PostgresStorage { .into_iter() .map(|r| { let atom = Atom { - id: r.0.clone(), - content: r.1, - title: r.2, - snippet: r.3, - source_url: r.4, - source: r.5, - published_at: r.6, - created_at: r.7, - updated_at: r.8, - embedding_status: r.9, - tagging_status: r.10, - embedding_error: r.11, - tagging_error: r.12, - }; + id: r.0.clone(), + content: r.1, + title: r.2, + snippet: r.3, + source_url: r.4, + source: r.5, + published_at: r.6, + created_at: r.7, + updated_at: r.8, + embedding_status: r.9, + tagging_status: r.10, + embedding_error: r.11, + tagging_error: r.12, + is_locked: false, + }; (r.0, atom) }) .collect(); diff --git a/crates/atomic-core/src/storage/postgres/search.rs b/crates/atomic-core/src/storage/postgres/search.rs index e98f95b0..849121f7 100644 --- a/crates/atomic-core/src/storage/postgres/search.rs +++ b/crates/atomic-core/src/storage/postgres/search.rs @@ -599,20 +599,21 @@ async fn pg_batch_fetch_atoms( .into_iter() .map(|r| { let atom = Atom { - id: r.0.clone(), - content: r.1, - title: r.2, - snippet: r.3, - source_url: r.4, - source: r.5, - published_at: r.6, - created_at: r.7, - updated_at: r.8, - embedding_status: r.9, - tagging_status: r.10, - embedding_error: r.11, - tagging_error: r.12, - }; + id: r.0.clone(), + content: r.1, + title: r.2, + snippet: r.3, + source_url: r.4, + source: r.5, + published_at: r.6, + created_at: r.7, + updated_at: r.8, + embedding_status: r.9, + tagging_status: r.10, + embedding_error: r.11, + tagging_error: r.12, + is_locked: false, + }; (r.0, atom) }) .collect()) diff --git a/crates/atomic-core/src/storage/postgres/wiki.rs b/crates/atomic-core/src/storage/postgres/wiki.rs index 8711a793..8307d013 100644 --- a/crates/atomic-core/src/storage/postgres/wiki.rs +++ b/crates/atomic-core/src/storage/postgres/wiki.rs @@ -374,11 +374,42 @@ impl WikiStore for PostgresStorage { } async fn get_all_wiki_articles(&self) -> StorageResult> { - let rows = sqlx::query_as::<_, (String, String, String, String, i32, i64)>( - "SELECT w.id, w.tag_id, t.name, w.updated_at, w.atom_count, - (SELECT COUNT(*) FROM wiki_links wl WHERE wl.target_tag_id = w.tag_id AND wl.db_id = $1) + // Use a recursive CTE to compute the live atom count per tag hierarchy so that + // new_atoms_available is always consistent with GET /api/wiki/{tag_id}/status. + let rows = sqlx::query_as::<_, (String, String, String, String, i32, i64, i32)>( + "WITH RECURSIVE + -- Expand each wiki-article tag to include all its descendant tags. + -- Seeded only from tags that have a wiki article so the recursion is + -- bounded by the number of articles, not the full tag tree. + tag_tree(root_id, id) AS ( + SELECT t.id, t.id + FROM tags t + WHERE t.db_id = $1 + AND EXISTS (SELECT 1 FROM wiki_articles wa WHERE wa.tag_id = t.id AND wa.db_id = $1) + UNION ALL + SELECT tt.root_id, t.id + FROM tags t + JOIN tag_tree tt ON t.parent_id = tt.id + WHERE t.db_id = $1 + ), + -- Live atom count per root tag (counts atoms in the entire subtree). + live_counts(tag_id, cnt) AS ( + SELECT tt.root_id, COUNT(DISTINCT at.atom_id)::int + FROM tag_tree tt + JOIN atom_tags at ON at.tag_id = tt.id AND at.db_id = $1 + GROUP BY tt.root_id + ) + SELECT + w.id, + w.tag_id, + t.name, + w.updated_at, + w.atom_count, + (SELECT COUNT(*) FROM wiki_links wl WHERE wl.target_tag_id = w.tag_id AND wl.db_id = $1), + GREATEST(0, COALESCE(lc.cnt, 0) - w.atom_count) FROM wiki_articles w JOIN tags t ON w.tag_id = t.id AND t.db_id = $1 + LEFT JOIN live_counts lc ON lc.tag_id = w.tag_id WHERE w.db_id = $1 ORDER BY (SELECT COUNT(*) FROM wiki_links wl WHERE wl.target_tag_id = w.tag_id AND wl.db_id = $1) DESC, w.atom_count DESC, w.updated_at DESC", @@ -391,7 +422,7 @@ impl WikiStore for PostgresStorage { Ok(rows .into_iter() .map( - |(id, tag_id, tag_name, updated_at, atom_count, inbound_links)| { + |(id, tag_id, tag_name, updated_at, atom_count, inbound_links, new_atoms_available)| { WikiArticleSummary { id, tag_id, @@ -399,6 +430,7 @@ impl WikiStore for PostgresStorage { updated_at, atom_count, inbound_links: inbound_links as i32, + new_atoms_available, } }, ) diff --git a/crates/atomic-core/src/storage/sqlite/atoms.rs b/crates/atomic-core/src/storage/sqlite/atoms.rs index 8b66e9d4..c2847521 100644 --- a/crates/atomic-core/src/storage/sqlite/atoms.rs +++ b/crates/atomic-core/src/storage/sqlite/atoms.rs @@ -238,6 +238,7 @@ impl SqliteStorage { tagging_status: "pending".to_string(), embedding_error: None, tagging_error: None, + is_locked: false, }; let tags = { @@ -320,6 +321,7 @@ impl SqliteStorage { tagging_status: "pending".to_string(), embedding_error: None, tagging_error: None, + is_locked: false, }; atoms_with_tags.push(AtomWithTags { atom, tags: vec![] }); diff --git a/crates/atomic-core/src/storage/sqlite/briefings.rs b/crates/atomic-core/src/storage/sqlite/briefings.rs index ff8d60ca..dc2222a6 100644 --- a/crates/atomic-core/src/storage/sqlite/briefings.rs +++ b/crates/atomic-core/src/storage/sqlite/briefings.rs @@ -50,6 +50,7 @@ impl SqliteStorage { tagging_status: row.get(10)?, embedding_error: row.get(11)?, tagging_error: row.get(12)?, + is_locked: false, }) })? .collect::, _>>()?; diff --git a/crates/atomic-core/src/storage/sqlite/chunks.rs b/crates/atomic-core/src/storage/sqlite/chunks.rs index 3cfb6346..b913c1c7 100644 --- a/crates/atomic-core/src/storage/sqlite/chunks.rs +++ b/crates/atomic-core/src/storage/sqlite/chunks.rs @@ -247,17 +247,24 @@ impl SqliteStorage { // Insert new chunks and embeddings for (index, (chunk_content, embedding_vec)) in chunks.iter().enumerate() { let chunk_id = Uuid::new_v4().to_string(); - let embedding_blob = embedding::f32_vec_to_blob_public(embedding_vec); + let hash = crate::boilerplate::content_hash(chunk_content); + let embedding_blob = if embedding_vec.is_empty() { + None::> + } else { + Some(embedding::f32_vec_to_blob_public(embedding_vec)) + }; conn.execute( - "INSERT INTO atom_chunks (id, atom_id, chunk_index, content, embedding) VALUES (?1, ?2, ?3, ?4, ?5)", - rusqlite::params![&chunk_id, atom_id, index as i32, chunk_content, &embedding_blob], + "INSERT INTO atom_chunks (id, atom_id, chunk_index, content, content_hash, embedding) VALUES (?1, ?2, ?3, ?4, ?5, ?6)", + rusqlite::params![&chunk_id, atom_id, index as i32, chunk_content, &hash, &embedding_blob], )?; - conn.execute( - "INSERT INTO vec_chunks (chunk_id, embedding) VALUES (?1, ?2)", - rusqlite::params![&chunk_id, &embedding_blob], - )?; + if let Some(ref blob) = embedding_blob { + conn.execute( + "INSERT INTO vec_chunks (chunk_id, embedding) VALUES (?1, ?2)", + rusqlite::params![&chunk_id, blob], + )?; + } } // Incrementally update FTS index @@ -1171,6 +1178,97 @@ impl SqliteStorage { tagging_failed, }) } + /// Given a list of content_hash values, return map of hash → count of distinct + /// atoms containing a chunk with that hash. Used for boilerplate detection. + pub(crate) fn count_chunk_hash_occurrences_impl( + &self, + hashes: &[String], + ) -> StorageResult> { + if hashes.is_empty() { + return Ok(std::collections::HashMap::new()); + } + let conn = self.db.read_conn()?; + let placeholders = hashes.iter().map(|_| "?").collect::>().join(","); + let sql = format!( + "SELECT content_hash, COUNT(DISTINCT atom_id) as cnt + FROM atom_chunks + WHERE content_hash IN ({placeholders}) + AND content_hash IS NOT NULL + GROUP BY content_hash" + ); + let mut stmt = conn.prepare(&sql)?; + let mut map = std::collections::HashMap::new(); + let rows = stmt.query_map( + rusqlite::params_from_iter(hashes.iter()), + |row| Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?)), + )?; + for row in rows { + let (hash, cnt) = row?; + map.insert(hash, cnt); + } + Ok(map) + } + + /// Delete vec_chunks entries for specific chunk IDs. + /// Used after boilerplate detection to remove vectors for shared chunks. + pub(crate) fn delete_vec_chunks_by_ids_impl( + &self, + chunk_ids: &[String], + ) -> StorageResult<()> { + if chunk_ids.is_empty() { + return Ok(()); + } + let mut conn = self + .db + .conn + .lock() + .map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let tx = conn + .transaction() + .map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + let placeholders = chunk_ids.iter().map(|_| "?").collect::>().join(","); + let sql = format!("DELETE FROM vec_chunks WHERE chunk_id IN ({placeholders})"); + tx.execute(&sql, rusqlite::params_from_iter(chunk_ids.iter()))?; + tx.commit().map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + Ok(()) + } + + /// Backfill content_hash for all atom_chunks rows that have content but NULL hash. + /// Safe to run multiple times (idempotent). Returns number of rows updated. + pub(crate) fn backfill_content_hashes_impl(&self) -> StorageResult { + use crate::boilerplate::content_hash; + let conn = self.db.read_conn()?; + let ids_and_contents: Vec<(String, String)> = { + let mut stmt = conn.prepare( + "SELECT id, content FROM atom_chunks WHERE content_hash IS NULL LIMIT 5000", + )?; + let x = stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))? + .collect::, _>>()?; + x + }; + drop(conn); + if ids_and_contents.is_empty() { + return Ok(0); + } + let mut write_conn = self + .db + .conn + .lock() + .map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let tx = write_conn + .transaction() + .map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + let count = ids_and_contents.len(); + for (id, content) in &ids_and_contents { + let hash = content_hash(content); + tx.execute( + "UPDATE atom_chunks SET content_hash = ?1 WHERE id = ?2", + rusqlite::params![hash, id], + )?; + } + tx.commit().map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + Ok(count) + } } #[async_trait] diff --git a/crates/atomic-core/src/storage/sqlite/health.rs b/crates/atomic-core/src/storage/sqlite/health.rs new file mode 100644 index 00000000..c5af80fb --- /dev/null +++ b/crates/atomic-core/src/storage/sqlite/health.rs @@ -0,0 +1,1569 @@ +//! SQLite-backed storage for health check raw data and the two health tables +//! (`health_reports`, `health_fix_log`). +//! +//! All methods here are synchronous (run inside `tokio::task::spawn_blocking`). + +use crate::error::AtomicCoreError; +use crate::health::audit::{HealthFixLog, StoredHealthReport}; +use crate::health::{DuplicatePair, WikiGap, WikiStaleEntry}; +use crate::storage::sqlite::SqliteStorage; +use rusqlite::params; + +// ==================== Raw health data ==================== + +/// All data needed by the health checks, fetched in a single blocking pass. +#[derive(Debug, Clone, Default)] +pub struct HealthRawData { + // — totals — + pub total_atoms: i32, + + // — embedding coverage — + pub embedding_pending: i32, + pub embedding_processing: i32, + pub embedding_complete: i32, + pub embedding_failed: i32, + + // — tagging coverage — + pub tagging_pending: i32, + pub tagging_processing: i32, + pub tagging_complete: i32, + pub tagging_failed: i32, + pub tagging_skipped: i32, + /// Atoms whose tagging_status = 'complete' but have 0 tags assigned. + pub untagged_complete: i32, + /// Atoms whose tagging_status = 'skipped' AND have 0 tags (invisible gap). + pub skipped_untagged: i32, + + // — source uniqueness — + /// `(source_url, [atom_id, ...])` for URLs that appear > 1 time. + pub duplicate_sources: Vec<(String, Vec)>, + + // — orphan tags — + /// `(id, name)` for tags with 0 atoms and no children (excluding autotag targets). + pub orphan_tags: Vec<(String, String)>, + + // — semantic graph freshness — + pub newest_atom_updated_at: Option, + pub newest_edge_created_at: Option, + /// Count of atoms whose `updated_at` > `newest_edge_created_at`. + pub atoms_since_edge_rebuild: i32, + + // — wiki coverage — + pub wiki_eligible_count: i32, + pub wiki_present_count: i32, + pub wiki_stale_count: i32, + pub wiki_gaps: Vec, + pub wiki_stale: Vec, + + // — content quality — + /// Atom IDs with content length < 100 chars. + pub very_short_atoms: Vec, + /// Atom IDs with content length > 15 000 chars. + pub very_long_atoms: Vec, + /// Atom IDs with no markdown heading (`#` at start of line). + pub no_heading_atoms: Vec, + /// Atom IDs with null source_url and no "Source:" text in content. + /// Atom IDs with null source_url and no "Source:" text in content. + pub no_source_atoms: Vec, + + // — tag health — + pub single_atom_tags: i32, + pub rootless_tags: i32, + pub similar_name_pair_count: i32, + /// Similar tag name pairs — (id_a, name_a, id_b, name_b). + pub similar_name_pairs_list: Vec<(String, String, String, String)>, + /// Single-atom tags (exactly 1 atom attached), up to 50. + pub single_atom_tag_list: Vec, + + // — duplicate detection (similarity >= 0.92) — + pub duplicate_pairs: Vec, + + // — boilerplate pollution (atoms with >= 2 edges at similarity >= 0.99) — + /// Atom IDs whose embeddings are dominated by shared template text. + /// Atoms whose embeddings are dominated by shared template text. + pub boilerplate_affected_atoms: Vec, + + // — contradiction candidates (similarity 0.75..0.92) — + pub contradiction_pairs_checked: i32, + pub contradiction_candidate_count: i32, + + /// Pairs of high-similarity atoms for manual contradiction review (similarity 0.80–0.92). + pub contradiction_pairs: Vec, + /// Rootless tags (parent_id IS NULL, not autotag targets) with atom counts. + pub rootless_tag_list: Vec, +} + +impl SqliteStorage { + /// Gather all raw health-check data in a single blocking pass. + pub(crate) fn health_check_data_impl( + &self, + thresholds: &crate::health::HealthThresholds, + ) -> Result { + let conn = self.db.read_conn()?; + let mut raw = HealthRawData::default(); + + // ---- total atoms ---- + raw.total_atoms = conn.query_row("SELECT COUNT(*) FROM atoms", [], |r| r.get(0))?; + + if raw.total_atoms == 0 { + return Ok(raw); + } + + // ---- embedding coverage ---- + let mut stmt = conn.prepare( + "SELECT embedding_status, COUNT(*) FROM atoms GROUP BY embedding_status", + )?; + let mut rows = stmt.query([])?; + while let Some(row) = rows.next()? { + let status: String = row.get(0)?; + let count: i32 = row.get(1)?; + match status.as_str() { + "pending" => raw.embedding_pending = count, + "processing" => raw.embedding_processing = count, + "complete" => raw.embedding_complete = count, + "failed" => raw.embedding_failed = count, + _ => {} + } + } + + // ---- tagging coverage ---- + let mut stmt = conn.prepare( + "SELECT tagging_status, COUNT(*) FROM atoms GROUP BY tagging_status", + )?; + let mut rows = stmt.query([])?; + while let Some(row) = rows.next()? { + let status: String = row.get(0)?; + let count: i32 = row.get(1)?; + match status.as_str() { + "pending" => raw.tagging_pending = count, + "processing" => raw.tagging_processing = count, + "complete" => raw.tagging_complete = count, + "failed" => raw.tagging_failed = count, + "skipped" => raw.tagging_skipped = count, + _ => {} + } + } + + // Atoms that completed tagging but have 0 tags + raw.untagged_complete = conn.query_row( + "SELECT COUNT(*) FROM atoms a + WHERE a.tagging_status = 'complete' + AND NOT EXISTS (SELECT 1 FROM atom_tags at WHERE at.atom_id = a.id)", + [], + |r| r.get(0), + )?; + + // Atoms skipped by the tagger that also have 0 tags — invisible gap + raw.skipped_untagged = conn.query_row( + "SELECT COUNT(*) FROM atoms a + WHERE a.tagging_status = 'skipped' + AND NOT EXISTS (SELECT 1 FROM atom_tags at WHERE at.atom_id = a.id)", + [], + |r| r.get(0), + )?; + + // ---- source uniqueness ---- + let mut stmt = conn.prepare( + "SELECT source_url, COUNT(*) as cnt, GROUP_CONCAT(id) + FROM atoms + WHERE source_url IS NOT NULL + GROUP BY source_url + HAVING cnt > 1 + LIMIT 50", + )?; + let mut rows = stmt.query([])?; + while let Some(row) = rows.next()? { + let url: String = row.get(0)?; + let ids_csv: String = row.get(2)?; + let ids: Vec = ids_csv.split(',').map(|s| s.to_string()).collect(); + raw.duplicate_sources.push((url, ids)); + } + + // ---- orphan tags ---- + let mut stmt = conn.prepare( + "SELECT t.id, t.name + FROM tags t + LEFT JOIN atom_tags at ON t.id = at.tag_id + LEFT JOIN tags children ON children.parent_id = t.id + WHERE at.tag_id IS NULL + AND children.id IS NULL + AND t.is_autotag_target = 0", + )?; + let mut rows = stmt.query([])?; + while let Some(row) = rows.next()? { + raw.orphan_tags.push((row.get(0)?, row.get(1)?)); + } + + // ---- semantic graph freshness ---- + raw.newest_atom_updated_at = conn + .query_row("SELECT MAX(updated_at) FROM atoms", [], |r| { + r.get::<_, Option>(0) + }) + .ok() + .flatten(); + + raw.newest_edge_created_at = conn + .query_row( + "SELECT MAX(created_at) FROM semantic_edges", + [], + |r| r.get::<_, Option>(0), + ) + .ok() + .flatten(); + + if let Some(ref newest_edge) = raw.newest_edge_created_at { + raw.atoms_since_edge_rebuild = conn.query_row( + "SELECT COUNT(*) FROM atoms WHERE updated_at > ?1", + params![newest_edge], + |r| r.get(0), + )?; + } else if raw.total_atoms > 0 { + // No edges at all + raw.atoms_since_edge_rebuild = raw.total_atoms; + } + + // ---- wiki coverage ---- + // Tags with >= thresholds.wiki_min_atoms_per_tag atoms + let mut stmt = conn.prepare( + "SELECT t.id, t.name, + COUNT(DISTINCT at.atom_id) as atom_count, + w.id IS NOT NULL as has_wiki, + w.updated_at, + (SELECT MAX(a.updated_at) FROM atoms a + JOIN atom_tags at2 ON a.id = at2.atom_id + WHERE at2.tag_id = t.id) as last_atom_update + FROM tags t + JOIN atom_tags at ON t.id = at.tag_id + LEFT JOIN wiki_articles w ON t.id = w.tag_id + GROUP BY t.id + HAVING COUNT(DISTINCT at.atom_id) >= ?1 + ORDER BY COUNT(DISTINCT at.atom_id) DESC + LIMIT 50", + )?; + let mut rows = stmt.query(params![thresholds.wiki_min_atoms_per_tag])?; + while let Some(row) = rows.next()? { + let tag_id: String = row.get(0)?; + let tag_name: String = row.get(1)?; + let atom_count: i32 = row.get(2)?; + let has_wiki: bool = row.get(3)?; + let wiki_updated_at: Option = row.get(4)?; + let last_atom_update: Option = row.get(5)?; + + raw.wiki_eligible_count += 1; + + if has_wiki { + raw.wiki_present_count += 1; + // Stale if any atom updated after the wiki + let is_stale = match (&wiki_updated_at, &last_atom_update) { + (Some(w), Some(a)) => a > w, + _ => false, + }; + if is_stale { + raw.wiki_stale_count += 1; + raw.wiki_stale.push(WikiStaleEntry { + tag_id, + tag_name, + new_atom_count: atom_count, + }); + } + } else { + raw.wiki_gaps.push(WikiGap { + tag_id, + tag_name, + atom_count, + }); + } + } + + // ---- content quality ---- + const LIMIT: usize = 20; + + let mut stmt = conn.prepare( + "SELECT id FROM atoms WHERE length(content) < ?1 LIMIT ?2", + )?; + let mut rows = stmt.query(params![thresholds.content_quality_short_chars, LIMIT as i32])?; + while let Some(row) = rows.next()? { + raw.very_short_atoms.push(row.get(0)?); + } + + let mut stmt = conn.prepare( + "SELECT id FROM atoms WHERE length(content) > ?1 LIMIT ?2", + )?; + let mut rows = stmt.query(params![thresholds.content_quality_long_chars, LIMIT as i32])?; + while let Some(row) = rows.next()? { + raw.very_long_atoms.push(row.get(0)?); + } + + // No heading: content doesn't start with '#' and doesn't have '\n#' + let mut stmt = conn.prepare( + "SELECT id FROM atoms + WHERE content NOT LIKE '#%' + AND content NOT LIKE '%' || char(10) || '#%' + LIMIT ?1", + )?; + let mut rows = stmt.query(params![LIMIT as i32])?; + while let Some(row) = rows.next()? { + raw.no_heading_atoms.push(row.get(0)?); + } + + // No source: null source_url and no http(s):// in content + // Return title preview + created_at for better UX (no secondary fetch needed) + let mut stmt = conn.prepare( + "SELECT id, content, created_at FROM atoms + WHERE source_url IS NULL + AND content NOT LIKE '%http://%' + AND content NOT LIKE '%https://%' + AND content NOT LIKE '%Source:%' + ORDER BY updated_at DESC + LIMIT ?1", + )?; + let mut rows = stmt.query(params![LIMIT as i32])?; + while let Some(row) = rows.next()? { + let id: String = row.get(0)?; + let content: String = row.get(1)?; + let created_at: String = row.get(2)?; + let title = extract_title_preview(&content); + raw.no_source_atoms.push(crate::health::AtomPreview { id, title, created_at }); + } + + // ---- tag health ---- + // Single-atom tags: fetch list (up to 50) and derive count from it. + { + let mut stmt = conn.prepare( + "SELECT t.id, t.name, t.is_autotag_target + FROM tags t + JOIN atom_tags at ON t.id = at.tag_id + GROUP BY t.id HAVING COUNT(at.atom_id) = 1 + ORDER BY t.name + LIMIT 51", + )?; + let mut rows = stmt.query([])?; + let mut truncated = false; + while let Some(row) = rows.next()? { + if raw.single_atom_tag_list.len() == 50 { + truncated = true; + break; + } + let id: String = row.get(0)?; + let name: String = row.get(1)?; + let is_autotag: bool = row.get::<_, i32>(2)? != 0; + raw.single_atom_tag_list.push(crate::health::SingleAtomTagEntry { id, name, is_autotag }); + } + raw.single_atom_tags = if truncated { + // Count exact total when list was truncated + conn.query_row( + "SELECT COUNT(*) FROM ( + SELECT t.id FROM tags t + JOIN atom_tags at ON t.id = at.tag_id + GROUP BY t.id HAVING COUNT(at.atom_id) = 1 + )", + [], + |r| r.get(0), + )? + } else { + raw.single_atom_tag_list.len() as i32 + }; + } + + // Rootless tags: user-created tags with no parent (excludes autotag category roots). + // is_autotag_target = 1 marks system roots (Topics, People, etc.) — exclude them. + { + let mut stmt = conn.prepare( + "SELECT t.id, t.name, COUNT(at.atom_id) as atom_count + FROM tags t + LEFT JOIN atom_tags at ON t.id = at.tag_id + WHERE t.parent_id IS NULL + AND t.is_autotag_target = 0 + GROUP BY t.id + ORDER BY atom_count DESC + LIMIT 50", + )?; + let mut rows = stmt.query([])?; + while let Some(row) = rows.next()? { + let id: String = row.get(0)?; + let name: String = row.get(1)?; + let atom_count: i32 = row.get(2)?; + raw.rootless_tag_list.push(crate::health::RootlessTagEntry { id, name, atom_count }); + } + raw.rootless_tags = raw.rootless_tag_list.len() as i32; + } + + // Similar name pairs: fetch all tag (id, name) and compare in Rust + { + let mut stmt = conn.prepare("SELECT id, name FROM tags WHERE atom_count > 0")?; + let mut rows = stmt.query([])?; + let mut id_names: Vec<(String, String)> = Vec::new(); + while let Some(row) = rows.next()? { + id_names.push((row.get(0)?, row.get(1)?)); + } + raw.similar_name_pairs_list = collect_similar_name_pairs(&id_names); + raw.similar_name_pair_count = raw.similar_name_pairs_list.len() as i32; + } + + // ---- content overlap detection (Tier 3) ---- + // Moderate similarity (0.55–0.85) + different source prefixes + >= 2 shared tags. + // This surfaces semantically related atoms from different corpora that should be + // reviewed for linking or merging — not template clones (those are boilerplate_pollution). + { + let mut stmt = conn.prepare( + "SELECT + se.source_atom_id, se.target_atom_id, se.similarity_score, + a1.source_url, a1.content, + a2.source_url, a2.content, + COUNT(DISTINCT at_a.tag_id) as shared_tag_count, + a1.created_at, a2.created_at + FROM semantic_edges se + JOIN atoms a1 ON se.source_atom_id = a1.id + JOIN atoms a2 ON se.target_atom_id = a2.id + JOIN atom_tags at_a ON a1.id = at_a.atom_id + JOIN atom_tags at_b ON a2.id = at_b.atom_id AND at_a.tag_id = at_b.tag_id + WHERE se.similarity_score BETWEEN ?1 AND ?2 + GROUP BY se.source_atom_id, se.target_atom_id + HAVING COUNT(DISTINCT at_a.tag_id) >= ?3 + ORDER BY COUNT(DISTINCT at_a.tag_id) DESC, se.similarity_score DESC + LIMIT 20", + )?; + let mut rows = stmt.query(params![ + thresholds.content_overlap_similarity_min, + thresholds.content_overlap_similarity_max, + thresholds.content_overlap_shared_tags_min, + ])?; + while let Some(row) = rows.next()? { + let a_id: String = row.get(0)?; + let b_id: String = row.get(1)?; + let similarity: f32 = row.get(2)?; + let a_source: Option = row.get(3)?; + let a_content: String = row.get(4)?; + let b_source: Option = row.get(5)?; + let b_content: String = row.get(6)?; + let shared_tag_count: i32 = row.get(7)?; + let a_created_at: Option = row.get(8)?; + let b_created_at: Option = row.get(9)?; + + // Skip same-corpus pairs — those are template pollution, not content overlap. + let prefix_a = source_prefix(&a_source); + let prefix_b = source_prefix(&b_source); + if prefix_a == prefix_b { + continue; + } + + let a_title = extract_title_preview(&a_content); + let b_title = extract_title_preview(&b_content); + + raw.duplicate_pairs.push(DuplicatePair { + pair_id: uuid::Uuid::new_v4().to_string(), + atom_a_id: a_id, + atom_a_title: a_title, + atom_a_source: a_source, + atom_b_id: b_id, + atom_b_title: b_title, + atom_b_source: b_source, + similarity, + shared_tag_count, + atom_a_created_at: a_created_at, + atom_b_created_at: b_created_at, + }); + } + } + + // ---- boilerplate pollution (atoms with >= thresholds.boilerplate_min_clones edges at similarity >= thresholds.boilerplate_similarity) ---- + // Return atom title + clone count so UI can show context and prioritise review. + { + let mut stmt = conn.prepare( + "SELECT se.source_atom_id, a.content, COUNT(*) as clone_count + FROM semantic_edges se + JOIN atoms a ON se.source_atom_id = a.id + WHERE se.similarity_score >= ?1 + GROUP BY se.source_atom_id + HAVING COUNT(*) >= ?2 + ORDER BY clone_count DESC + LIMIT 50", + )?; + let mut rows = stmt.query(params![ + thresholds.boilerplate_similarity, + thresholds.boilerplate_min_clones, + ])?; + while let Some(row) = rows.next()? { + let id: String = row.get(0)?; + let content: String = row.get(1)?; + let clone_count: i32 = row.get(2)?; + let title = extract_title_preview(&content); + raw.boilerplate_affected_atoms.push(crate::health::BoilerplateAtomEntry { id, title, clone_count }); + } + } + + // ---- contradiction candidates (similarity thresholds.contradiction_similarity_min .. thresholds.contradiction_similarity_max) ---- + // Surface actual atom pairs for manual review. + // + // Two post-query filters run here to reduce false positives caused by + // template / boilerplate overlap: + // + // (1) Boilerplate-exclusion: if either atom is already flagged as + // boilerplate-polluted (computed above), its embedding is known + // to be dominated by shared template text rather than unique + // content. A high-similarity edge between such atoms is almost + // certainly template noise, not a real contradiction. Re-embed + // with unique content first, then re-check. + // + // (2) Token-Jaccard prefilter: real contradictions express *different* + // claims, so their atom contents use largely different token sets. + // Pairs whose unique-token overlap is at/above + // thresholds.contradiction_max_content_jaccard are treated as + // template clones and filtered out. + // + // We raise the SQL LIMIT so filtering doesn't starve the final list; + // the raw candidate count (raw.contradiction_pairs_checked) still + // reflects the full matching edge count pre-filter, so the UI can tell + // the user how many were considered. + { + let boilerplate_ids: std::collections::HashSet<&str> = raw + .boilerplate_affected_atoms + .iter() + .map(|a| a.id.as_str()) + .collect(); + const MAX_CONTRADICTION_PAIRS: usize = 20; + const SQL_LIMIT: i32 = 200; + let mut stmt = conn.prepare( + "SELECT + se.source_atom_id, se.target_atom_id, se.similarity_score, + a1.source_url, a1.content, + a2.source_url, a2.content, + COUNT(DISTINCT at_a.tag_id) as shared_tag_count, + a1.created_at, a2.created_at + FROM semantic_edges se + JOIN atoms a1 ON se.source_atom_id = a1.id + JOIN atoms a2 ON se.target_atom_id = a2.id + LEFT JOIN atom_tags at_a ON a1.id = at_a.atom_id + LEFT JOIN atom_tags at_b ON a2.id = at_b.atom_id AND at_a.tag_id = at_b.tag_id + WHERE se.similarity_score >= ?1 AND se.similarity_score < ?2 + GROUP BY se.source_atom_id, se.target_atom_id + HAVING COUNT(DISTINCT at_a.tag_id) >= ?3 + ORDER BY se.similarity_score DESC + LIMIT ?4", + )?; + let mut rows = stmt.query(params![ + thresholds.contradiction_similarity_min, + thresholds.contradiction_similarity_max, + thresholds.contradiction_shared_tags_min, + SQL_LIMIT, + ])?; + let jaccard_cap = thresholds.contradiction_max_content_jaccard.clamp(0.0, 1.0); + while let Some(row) = rows.next()? { + if raw.contradiction_pairs.len() >= MAX_CONTRADICTION_PAIRS { + break; + } + let a_id: String = row.get(0)?; + let b_id: String = row.get(1)?; + let similarity: f32 = row.get(2)?; + let a_source: Option = row.get(3)?; + let a_content: String = row.get(4)?; + let b_source: Option = row.get(5)?; + let b_content: String = row.get(6)?; + let shared_tag_count: i32 = row.get(7)?; + let a_created_at: Option = row.get(8)?; + let b_created_at: Option = row.get(9)?; + + // Filter (1): skip pairs where either atom is boilerplate-polluted. + if boilerplate_ids.contains(a_id.as_str()) + || boilerplate_ids.contains(b_id.as_str()) + { + continue; + } + + // Filter (2): skip pairs whose contents overlap too much at the + // token level — likely same template, not different claims. + if content_token_jaccard(&a_content, &b_content) >= jaccard_cap { + continue; + } + + let a_title = extract_title_preview(&a_content); + let b_title = extract_title_preview(&b_content); + + // Filter (3): titles must share at least one informative token. + // Real contradictions argue about the *same entity*; their titles + // overlap ("Parolee Search" vs "Absconder Search" share "search"; + // "Deploy an Application" vs "Application URL Path" share + // "application"). Pairs like "PITVR" vs "Roster Download" share + // zero title tokens — different entities that happen to sit on + // the same runbook template. This filter is robust where content + // Jaccard is noisy, because H1/title text is short and + // template-free. + if !titles_share_token(&a_title, &b_title) { + continue; + } + + // Filter (4): the pair's own similarity sits in the boilerplate + // zone. If the user has said "edges at/above X are template + // clones" (thresholds.boilerplate_similarity), we honor that + // upper bound for contradictions too. Real contradictions live + // below the template-clone plateau. + if similarity >= thresholds.boilerplate_similarity { + continue; + } + + raw.contradiction_pairs.push(crate::health::ContradictionPairEntry { + pair_id: uuid::Uuid::new_v4().to_string(), + atom_a: crate::health::ContradictionAtom { id: a_id, title: a_title, source: a_source, created_at: a_created_at }, + atom_b: crate::health::ContradictionAtom { id: b_id, title: b_title, source: b_source, created_at: b_created_at }, + similarity, + shared_tag_count, + }); + } + raw.contradiction_pairs_checked = conn.query_row( + "SELECT COUNT(*) FROM semantic_edges + WHERE similarity_score >= ?1 AND similarity_score < ?2", + params![ + thresholds.contradiction_similarity_min, + thresholds.contradiction_similarity_max, + ], + |r| r.get(0), + )?; + raw.contradiction_candidate_count = raw.contradiction_pairs.len() as i32; + } + + Ok(raw) + } + + /// Reset atoms with `tagging_status = 'skipped'` AND 0 tags back to `pending` + /// so the tagger pipeline will process them on the next run. + /// Returns the number of atoms reset. + pub(crate) fn reset_skipped_untagged_to_pending_impl( + &self, + ) -> Result { + let conn = self.db.conn.lock().map_err(|e| { + AtomicCoreError::DatabaseOperation(format!("lock error: {e}")) + })?; + let n = conn.execute( + "UPDATE atoms + SET tagging_status = 'pending' + WHERE tagging_status = 'skipped' + AND NOT EXISTS ( + SELECT 1 FROM atom_tags at WHERE at.atom_id = atoms.id + )", + [], + )? as i32; + Ok(n) + } + + // ==================== Health report storage ==================== + + pub(crate) fn store_health_report_impl( + &self, + report: &StoredHealthReport, + ) -> Result<(), AtomicCoreError> { + let conn = self.db.conn.lock().map_err(|e| { + AtomicCoreError::DatabaseOperation(format!("lock error: {e}")) + })?; + conn.execute( + "INSERT OR REPLACE INTO health_reports + (id, computed_at, overall_score, check_scores, atom_count, auto_fixes_applied, report_json) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)", + params![ + report.id, + report.computed_at, + report.overall_score, + report.check_scores, + report.atom_count, + report.auto_fixes_applied, + report.report_json, + ], + )?; + // Prune reports older than 90 days + conn.execute( + "DELETE FROM health_reports WHERE computed_at < datetime('now', '-90 days')", + [], + )?; + Ok(()) + } + + pub(crate) fn get_latest_health_report_impl( + &self, + ) -> Result, AtomicCoreError> { + let conn = self.db.read_conn()?; + let result: rusqlite::Result = conn.query_row( + "SELECT report_json FROM health_reports ORDER BY computed_at DESC LIMIT 1", + [], + |r| r.get(0), + ); + match result { + Ok(json) => { + let report: crate::health::HealthReport = + serde_json::from_str(&json).map_err(|e| { + AtomicCoreError::DatabaseOperation(format!( + "failed to deserialize health report: {e}" + )) + })?; + Ok(Some(report)) + } + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), + Err(e) => Err(e.into()), + } + } + + pub(crate) fn get_health_reports_impl( + &self, + limit: i32, + ) -> Result, AtomicCoreError> { + let conn = self.db.read_conn()?; + let mut stmt = conn.prepare( + "SELECT id, computed_at, overall_score, check_scores, atom_count, auto_fixes_applied, report_json + FROM health_reports + ORDER BY computed_at DESC + LIMIT ?1", + )?; + let reports = stmt + .query_map(params![limit], |r| { + Ok(StoredHealthReport { + id: r.get(0)?, + computed_at: r.get(1)?, + overall_score: r.get::<_, i32>(2)? as u32, + check_scores: r.get(3)?, + atom_count: r.get(4)?, + auto_fixes_applied: r.get(5)?, + report_json: r.get(6)?, + }) + })? + .filter_map(|r| r.ok()) + .collect(); + Ok(reports) + } + + // ==================== Fix log storage ==================== + + pub(crate) fn log_fix_action_impl( + &self, + log: &HealthFixLog, + ) -> Result<(), AtomicCoreError> { + let conn = self.db.conn.lock().map_err(|e| { + AtomicCoreError::DatabaseOperation(format!("lock error: {e}")) + })?; + let atom_ids_json = log + .atom_ids + .as_ref() + .map(|ids| serde_json::to_string(ids).unwrap_or_default()); + let tag_ids_json = log + .tag_ids + .as_ref() + .map(|ids| serde_json::to_string(ids).unwrap_or_default()); + conn.execute( + "INSERT INTO health_fix_log + (id, check_name, action, tier, atom_ids, tag_ids, + before_state, after_state, llm_prompt, llm_response, executed_at, undone_at) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)", + params![ + log.id, + log.check_name, + log.action, + log.tier, + atom_ids_json, + tag_ids_json, + log.before_state, + log.after_state, + log.llm_prompt, + log.llm_response, + log.executed_at, + log.undone_at, + ], + )?; + Ok(()) + } + + pub(crate) fn get_fix_log_impl( + &self, + fix_id: &str, + ) -> Result, AtomicCoreError> { + let conn = self.db.read_conn()?; + let result = conn.query_row( + "SELECT id, check_name, action, tier, atom_ids, tag_ids, + before_state, after_state, llm_prompt, llm_response, executed_at, undone_at + FROM health_fix_log WHERE id = ?1", + params![fix_id], + |r| { + Ok(HealthFixLog { + id: r.get(0)?, + check_name: r.get(1)?, + action: r.get(2)?, + tier: r.get(3)?, + atom_ids: r + .get::<_, Option>(4)? + .and_then(|s| serde_json::from_str(&s).ok()), + tag_ids: r + .get::<_, Option>(5)? + .and_then(|s| serde_json::from_str(&s).ok()), + before_state: r.get(6)?, + after_state: r.get(7)?, + llm_prompt: r.get(8)?, + llm_response: r.get(9)?, + executed_at: r.get(10)?, + undone_at: r.get(11)?, + }) + }, + ); + match result { + Ok(log) => Ok(Some(log)), + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), + Err(e) => Err(e.into()), + } + } + + pub(crate) fn get_recent_fixes_impl( + &self, + limit: i32, + ) -> Result, AtomicCoreError> { + let conn = self.db.read_conn()?; + let mut stmt = conn.prepare( + "SELECT id, check_name, action, tier, atom_ids, tag_ids, + before_state, after_state, llm_prompt, llm_response, executed_at, undone_at + FROM health_fix_log + ORDER BY executed_at DESC + LIMIT ?1", + )?; + let logs = stmt + .query_map(params![limit], |r| { + Ok(HealthFixLog { + id: r.get(0)?, + check_name: r.get(1)?, + action: r.get(2)?, + tier: r.get(3)?, + atom_ids: r + .get::<_, Option>(4)? + .and_then(|s| serde_json::from_str(&s).ok()), + tag_ids: r + .get::<_, Option>(5)? + .and_then(|s| serde_json::from_str(&s).ok()), + before_state: r.get(6)?, + after_state: r.get(7)?, + llm_prompt: r.get(8)?, + llm_response: r.get(9)?, + executed_at: r.get(10)?, + undone_at: r.get(11)?, + }) + })? + .filter_map(|r| r.ok()) + .collect(); + Ok(logs) + } + + pub(crate) fn mark_fix_undone_impl(&self, fix_id: &str) -> Result<(), AtomicCoreError> { + let conn = self.db.conn.lock().map_err(|e| { + AtomicCoreError::DatabaseOperation(format!("lock error: {e}")) + })?; + let now = chrono::Utc::now().to_rfc3339(); + conn.execute( + "UPDATE health_fix_log SET undone_at = ?1 WHERE id = ?2", + params![now, fix_id], + )?; + Ok(()) + } + + // ==================== Link resolution storage ==================== + + /// Fetch atoms that likely contain internal links (first-pass SQL filter). + /// Returns (id, content, source_url). + /// The exact link extraction happens in Rust using `link_resolution::extract_internal_links`. + pub(crate) fn get_link_candidate_atoms_impl( + &self, + ) -> Result)>, AtomicCoreError> { + let conn = self.db.read_conn()?; + let mut stmt = conn.prepare( + "SELECT id, content, source_url FROM atoms + WHERE content LIKE '%](.%.md%' + OR content LIKE '%](./%' + OR content LIKE '%](../%' + OR (content LIKE '%[[%' AND content LIKE '%]]%')", + )?; + let rows = stmt + .query_map([], |r| Ok((r.get(0)?, r.get(1)?, r.get::<_, Option>(2)?)))? + .filter_map(|r| r.ok()) + .collect(); + Ok(rows) + } + + /// Batch lookup: given a list of candidate source URLs, return a map of + /// source_url → atom_id for those that exist in the database. + pub(crate) fn find_atoms_by_source_urls_impl( + &self, + urls: &[String], + ) -> Result, AtomicCoreError> { + if urls.is_empty() { + return Ok(std::collections::HashMap::new()); + } + let conn = self.db.read_conn()?; + let mut map = std::collections::HashMap::new(); + // SQLite doesn't support binding a variable-length IN list, so we query one by one. + // For the typical link count (<50 per atom), this is fast enough. + let mut stmt = conn.prepare("SELECT id FROM atoms WHERE source_url = ?1")?; + for url in urls { + if let Ok(id) = stmt.query_row(params![url], |r| r.get::<_, String>(0)) { + map.insert(url.clone(), id); + } + } + Ok(map) + } + + /// Wikilink fallback: find an atom whose source_url ends with `/.md` + /// (case-insensitive on the name stem) anywhere in the vault. + /// Returns the first match as (atom_id, title_preview). + pub(crate) fn find_atom_by_wikilink_name_impl( + &self, + name: &str, + vault_prefix: &str, + ) -> Result, AtomicCoreError> { + let conn = self.db.read_conn()?; + // Try exact stem match under the vault (case-insensitive) + let like_pattern = format!("%/{}%.md", name.to_lowercase().replace(' ', "-")); + let alt_pattern = format!("%/{}%.md", name.to_lowercase().replace(' ', "_")); + let result = conn.query_row( + "SELECT id, content FROM atoms + WHERE source_url LIKE ?1 || ?3 + OR LOWER(source_url) LIKE ?2 + OR LOWER(source_url) LIKE ?4", + params![vault_prefix, like_pattern, name.replace(' ', "-") + ".md", alt_pattern], + |r| { + let id: String = r.get(0)?; + let content: String = r.get(1)?; + Ok((id, content)) + }, + ); + match result { + Ok((id, content)) => { + let title = content + .lines() + .find(|l| !l.trim().is_empty()) + .unwrap_or(&id) + .trim_start_matches('#') + .trim() + .chars() + .take(80) + .collect::(); + Ok(Some((id, title))) + } + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), + Err(e) => Err(e.into()), + } + } + + /// Suggest atom candidates for a broken link query. + /// Searches by source_url suffix, title prefix/contains, and content LIKE. + /// Returns vec of (atom_id, title, source_url, score) sorted by score desc. + pub(crate) fn suggest_atoms_by_query_impl( + &self, + q: &str, + limit: i32, + ) -> Result, f32)>, AtomicCoreError> { + if q.trim().is_empty() { + return Ok(vec![]); + } + let conn = self.db.read_conn()?; + let mut results: Vec<(String, String, Option, f32)> = Vec::new(); + let mut seen: std::collections::HashSet = std::collections::HashSet::new(); + + // Normalize query: strip extension, directory prefixes, replace hyphens/underscores with spaces. + let normalized = { + let no_ext = if let Some(pos) = q.rfind('.') { + if pos > 0 && !q[..pos].contains('/') || q[..pos].contains('/') { + &q[..pos] + } else { + q + } + } else { + q + }; + let no_dir = if let Some(pos) = no_ext.rfind('/') { + &no_ext[pos + 1..] + } else { + no_ext + }; + let no_prefix = no_dir.trim_start_matches('.').trim_start_matches('/'); + no_prefix.replace(['-', '_'], " ").to_lowercase() + }; + let nq = normalized.as_str(); + + // 1. Exact source_url suffix match (score 1.0) + { + let suffix_pat = format!("%{}", q); + let mut stmt = conn.prepare( + "SELECT id, content, source_url FROM atoms WHERE source_url LIKE ?1 ESCAPE '\\' LIMIT 20", + )?; + let rows = stmt.query_map(rusqlite::params![suffix_pat], |r| { + Ok((r.get::<_, String>(0)?, r.get::<_, String>(1)?, r.get::<_, Option>(2)?)) + })?; + for row in rows.flatten() { + let (id, content, src) = row; + if seen.insert(id.clone()) { + let title = extract_title_preview(&content); + results.push((id, title, src, 1.0f32)); + } + } + } + + // 2a. Title prefix match (score 0.8) — first non-empty line starts with nq + { + let prefix_pat = format!("{}%", nq); + let prefix_pat_hash = format!("# {}%", nq); + let mut stmt = conn.prepare( + "SELECT id, content, source_url FROM atoms + WHERE LOWER(SUBSTR(TRIM(content), 1, 80)) LIKE ?1 ESCAPE '\\' + OR LOWER(SUBSTR(TRIM(content), 1, 80)) LIKE ?2 ESCAPE '\\' + LIMIT 40", + )?; + let rows = stmt.query_map(rusqlite::params![prefix_pat, prefix_pat_hash], |r| { + Ok((r.get::<_, String>(0)?, r.get::<_, String>(1)?, r.get::<_, Option>(2)?)) + })?; + for row in rows.flatten() { + let (id, content, src) = row; + if seen.insert(id.clone()) { + let title = extract_title_preview(&content); + results.push((id, title, src, 0.8f32)); + } + } + } + + // 2b. Title contains match (score 0.6) + { + let contains_pat = format!("%{}%", nq); + let mut stmt = conn.prepare( + "SELECT id, content, source_url FROM atoms + WHERE LOWER(SUBSTR(TRIM(content), 1, 80)) LIKE ?1 ESCAPE '\\' + LIMIT 40", + )?; + let rows = stmt.query_map(rusqlite::params![contains_pat], |r| { + Ok((r.get::<_, String>(0)?, r.get::<_, String>(1)?, r.get::<_, Option>(2)?)) + })?; + for row in rows.flatten() { + let (id, content, src) = row; + if seen.insert(id.clone()) { + let title = extract_title_preview(&content); + results.push((id, title, src, 0.6f32)); + } + } + } + + // 3. Fuzzy content LIKE on first 80 chars (score 0.4) + { + let contains_pat = format!("%{}%", nq); + let mut stmt = conn.prepare( + "SELECT id, content, source_url FROM atoms + WHERE LOWER(SUBSTR(content, 1, 80)) LIKE ?1 ESCAPE '\\' + LIMIT 40", + )?; + let rows = stmt.query_map(rusqlite::params![contains_pat], |r| { + Ok((r.get::<_, String>(0)?, r.get::<_, String>(1)?, r.get::<_, Option>(2)?)) + })?; + for row in rows.flatten() { + let (id, content, src) = row; + if seen.insert(id.clone()) { + let title = extract_title_preview(&content); + results.push((id, title, src, 0.4f32)); + } + } + } + + // Sort by score desc, truncate to limit. + results.sort_by(|a, b| b.3.partial_cmp(&a.3).unwrap_or(std::cmp::Ordering::Equal)); + results.truncate(limit.max(1).min(20) as usize); + Ok(results) + } +} + + +// ==================== Helpers ==================== + +/// Collect similar tag name pairs — one is a prefix/substring of the other. +fn collect_similar_name_pairs(tags: &[(String, String)]) -> Vec<(String, String, String, String)> { + let mut out = Vec::new(); + for (i, (a_id, a_name)) in tags.iter().enumerate() { + for (b_id, b_name) in tags.iter().skip(i + 1) { + let la = a_name.to_lowercase(); + let lb = b_name.to_lowercase(); + if la == lb { + continue; // exact duplicate (already handled elsewhere) + } + if la.contains(lb.as_str()) || lb.contains(la.as_str()) { + // canonical order: smaller id first, for stable pair_id + if a_id <= b_id { + out.push((a_id.clone(), a_name.clone(), b_id.clone(), b_name.clone())); + } else { + out.push((b_id.clone(), b_name.clone(), a_id.clone(), a_name.clone())); + } + } + } + } + out +} + +/// Extract first ~60 chars as a title preview. +fn extract_title_preview(content: &str) -> String { + let first_line = content.lines().next().unwrap_or("").trim(); + let clean = first_line.trim_start_matches('#').trim(); + if clean.len() > 60 { + format!("{}\u{2026}", &clean[..60]) + } else if clean.is_empty() { + content.chars().take(60).collect() + } else { + clean.to_string() + } +} + +/// Extract the source prefix: scheme + authority (everything up to the path). +/// Examples: +/// `https://tylertech.atlassian.net/wiki/...` → `https://tylertech.atlassian.net` +/// `obsidian://ar-playbook/path/to/file` → `obsidian://ar-playbook` +/// `None` → `manual` +pub(crate) fn source_prefix(url: &Option) -> String { + let Some(u) = url else { + return "manual".to_string(); + }; + // Find "://" then the next "/" after it + if let Some(scheme_end) = u.find("://") { + let after_scheme = &u[scheme_end + 3..]; + if let Some(slash) = after_scheme.find('/') { + return u[..scheme_end + 3 + slash].to_string(); + } + } else if let Some(slash) = u.find('/') { + return u[..slash].to_string(); + } + u.clone() +} + +/// Token-level Jaccard similarity between two atom contents. +/// +/// Tokens are lowercased alphanumeric runs, length ≥ 3 (drops punctuation, +/// drops trivial words like "a"/"is" that add noise without signal). Returns +/// `|A ∩ B| / |A ∪ B|`, or `0.0` when both sets are empty. +/// +/// Rationale: the contradiction check already filters by *embedding* +/// similarity; the embedding can be dominated by shared template text, which +/// produces false positives. A high embedding-sim pair with also-high token +/// Jaccard is almost certainly template overlap, not a semantic disagreement. +/// Real contradictions use *different* words to assert conflicting facts. +pub(crate) fn content_token_jaccard(a: &str, b: &str) -> f32 { + fn tokens(s: &str) -> std::collections::HashSet { + let mut out = std::collections::HashSet::new(); + let mut buf = String::new(); + for ch in s.chars() { + if ch.is_alphanumeric() { + for c in ch.to_lowercase() { + buf.push(c); + } + } else if !buf.is_empty() { + if buf.len() >= 3 { + out.insert(std::mem::take(&mut buf)); + } else { + buf.clear(); + } + } + } + if buf.len() >= 3 { + out.insert(buf); + } + out + } + + let set_a = tokens(a); + let set_b = tokens(b); + if set_a.is_empty() && set_b.is_empty() { + return 0.0; + } + let inter = set_a.intersection(&set_b).count(); + let union = set_a.len() + set_b.len() - inter; + if union == 0 { + 0.0 + } else { + inter as f32 / union as f32 + } +} + +/// Do two titles share at least one informative token? +/// +/// "Informative" = length >= 3, lowercased, alphanumeric runs, and NOT in a +/// tiny stopword list (common filler that can coincidentally link unrelated +/// titles: "the", "and", "for", "with", etc.). +/// +/// Used as a contradiction-pair prefilter. Titles are short and +/// template-free — if two titles share zero informative tokens, the atoms +/// are almost certainly about different entities, even if their embedding +/// vectors land close in similarity space (template / boilerplate +/// pollution). Real contradictions are *about the same thing* and therefore +/// share subject tokens in the H1. +pub(crate) fn titles_share_token(a: &str, b: &str) -> bool { + const STOPWORDS: &[&str] = &[ + "the", "and", "for", "with", "from", "into", "onto", "over", "under", + "about", "this", "that", "these", "those", "not", "are", "but", "out", + ]; + fn toks(s: &str) -> std::collections::HashSet { + let mut out = std::collections::HashSet::new(); + let mut buf = String::new(); + for ch in s.chars() { + if ch.is_alphanumeric() { + for c in ch.to_lowercase() { + buf.push(c); + } + } else if !buf.is_empty() { + if buf.len() >= 3 && !STOPWORDS.contains(&buf.as_str()) { + out.insert(std::mem::take(&mut buf)); + } else { + buf.clear(); + } + } + } + if buf.len() >= 3 && !STOPWORDS.contains(&buf.as_str()) { + out.insert(buf); + } + out + } + let a = toks(a); + let b = toks(b); + !a.is_disjoint(&b) +} + +#[cfg(test)] +mod jaccard_tests { + use super::content_token_jaccard; + + #[test] + fn empty_strings_return_zero() { + assert_eq!(content_token_jaccard("", ""), 0.0); + } + + #[test] + fn identical_content_is_one() { + let s = "alpha beta gamma delta"; + assert!((content_token_jaccard(s, s) - 1.0).abs() < 1e-6); + } + + #[test] + fn disjoint_content_is_zero() { + assert_eq!( + content_token_jaccard("alpha beta gamma", "delta epsilon zeta"), + 0.0 + ); + } + + #[test] + fn punctuation_and_case_normalized() { + let a = "ALPHA, beta! Gamma?"; + let b = "alpha beta gamma"; + assert!((content_token_jaccard(a, b) - 1.0).abs() < 1e-6); + } + + #[test] + fn short_tokens_dropped() { + // "a" is length 1, dropped; the other tokens all match. + let a = "a big red house"; + let b = "big red house"; + assert!((content_token_jaccard(a, b) - 1.0).abs() < 1e-6); + } + + #[test] + fn template_clone_scores_high() { + // Mirrors the real-world PITVR vs Roster Download shape: heavy shared + // template (health endpoints / troubleshooting table), small unique + // product text. Jaccard should sit high, above the 0.70 default cap. + let pitvr = "PITVR Personal Interactive TVR purpose allows users obtain \ + driving record license status checks customer DFA runtime environment \ + Linux General health endpoints environment URL dev uat prod troubleshooting \ + outage alerts alert message response unable connect database check cluster \ + SMTP SendGrid external web service personal inquiry VPN TVR service CCP \ + status generate order id payment API cache Redis cluster"; + let roster = "Roster Download source Confluence allows users obtain downloadable \ + roster runtime environment Linux General health endpoints environment URL \ + dev uat prod troubleshooting outage alerts alert message response could \ + connect database check cluster CCP service status Redis cluster template \ + database Linux VIP cluster SFTP agency port obtain OrderId Payment API"; + let j = content_token_jaccard(pitvr, roster); + assert!(j >= 0.40, "expected meaningful template overlap, got {j}"); + } + + #[test] + fn different_claims_score_low() { + // Same topic, different facts — the contradiction case we *want* to keep. + let a = "The service runs on port 8080 with TLS disabled in development."; + let b = "The service listens on port 9090 with TLS required for all traffic."; + let j = content_token_jaccard(a, b); + // Some overlap (service, port, TLS) but well under 0.70. + assert!(j < 0.70, "expected jaccard < 0.70, got {j}"); + } +} + +#[cfg(test)] +mod title_overlap_tests { + use super::titles_share_token; + + #[test] + fn distinct_entities_do_not_share_tokens() { + // The real-world PITVR vs Roster Download case: zero title overlap. + assert!(!titles_share_token( + "PITVR (Personal/Interactive TVR)", + "Roster Download", + )); + } + + #[test] + fn similar_entities_share_tokens() { + // Real contradictions sit on the same subject — titles overlap. + assert!(titles_share_token( + "DOC - Parolee Search", + "Absconder Search", + )); + assert!(titles_share_token( + "Deploy an Application", + "Application URL Path Naming Standard", + )); + } + + #[test] + fn stopwords_do_not_count() { + // "the" alone is not enough evidence these titles are about the same thing. + assert!(!titles_share_token( + "The Lightning Report", + "The Marketing Playbook", + )); + } + + #[test] + fn punctuation_does_not_block_match() { + assert!(titles_share_token( + "PITVR (Personal/Interactive TVR)", + "Interactive TVR Operator Guide", + )); + } + + #[test] + fn empty_titles_return_false() { + assert!(!titles_share_token("", "anything")); + assert!(!titles_share_token("", "")); + } +} + +// ==================== Dismissal methods ==================== + +impl SqliteStorage { + /// Get a tag by ID. Returns (name, parent_id). + pub(crate) fn get_tag_by_id_impl( + &self, + tag_id: &str, + ) -> Result)>, AtomicCoreError> { + let conn = self.db.read_conn()?; + let mut stmt = conn.prepare( + "SELECT name, parent_id FROM tags WHERE id = ?1", + )?; + let result = stmt + .query_map(params![tag_id], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, Option>(1)?)) + })? + .next() + .transpose()?; + Ok(result) + } + + /// List currently active dismissals for a check. Returns (item_key, reason) pairs. + pub(crate) fn list_dismissed_keys_impl( + &self, + check_name: &str, + ) -> Result, AtomicCoreError> { + let conn = self.db.read_conn()?; + let now = chrono::Utc::now().to_rfc3339(); + let mut stmt = conn.prepare( + "SELECT item_key, reason FROM health_dismissals + WHERE check_name = ?1 + AND (expires_at IS NULL OR expires_at > ?2)", + )?; + let rows = stmt + .query_map(params![check_name, now], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) + })? + .collect::, _>>()?; + Ok(rows) + } + + /// Insert or update a dismissal (upsert on unique index). + pub(crate) fn dismiss_health_item_impl( + &self, + check_name: &str, + item_key: &str, + reason: &str, + expires_at: Option<&str>, + ) -> Result<(), AtomicCoreError> { + let mut conn = self + .db + .conn + .lock() + .map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let id = uuid::Uuid::new_v4().to_string(); + let now = chrono::Utc::now().to_rfc3339(); + let tx = conn + .transaction() + .map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + tx.execute( + "INSERT INTO health_dismissals (id, check_name, item_key, reason, dismissed_at, expires_at) + VALUES (?1, ?2, ?3, ?4, ?5, ?6) + ON CONFLICT(check_name, item_key) DO UPDATE SET + reason = excluded.reason, + dismissed_at = excluded.dismissed_at, + expires_at = excluded.expires_at", + params![id, check_name, item_key, reason, now, expires_at], + )?; + tx.commit() + .map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + Ok(()) + } + + pub(crate) fn undismiss_health_item_impl( + &self, + check_name: &str, + item_key: &str, + ) -> Result<(), AtomicCoreError> { + let mut conn = self + .db + .conn + .lock() + .map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let tx = conn + .transaction() + .map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + tx.execute( + "DELETE FROM health_dismissals WHERE check_name = ?1 AND item_key = ?2", + params![check_name, item_key], + )?; + tx.commit() + .map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + Ok(()) + } + + /// Delete stale dismissal rows: expired TTL, orphaned atom refs, orphaned tag refs. + pub(crate) fn gc_dismissals_impl(&self) -> Result { + let mut conn = self.db.conn.lock().map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let now = chrono::Utc::now().to_rfc3339(); + let tx = conn.transaction().map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + let mut total: u64 = 0; + + // 1. Expired dismissals (defer TTL passed) + total += tx.execute( + "DELETE FROM health_dismissals WHERE expires_at IS NOT NULL AND expires_at <= ?1", + params![now], +)? as u64; + + // 2a. Per-atom checks + total += tx.execute( + "DELETE FROM health_dismissals + WHERE check_name IN ('boilerplate_pollution', 'content_quality') + AND item_key NOT IN (SELECT id FROM atoms)", + [], +)? as u64; + + // 2b. tag_health dismissals pointing at deleted tags + total += tx.execute( + "DELETE FROM health_dismissals + WHERE check_name = 'tag_health' + AND item_key NOT IN (SELECT id FROM tags)", + [], +)? as u64; + + // 2c. Pair-keyed checks — delete if either half atom is gone + total += tx.execute( + "DELETE FROM health_dismissals + WHERE check_name IN ('content_overlap', 'contradiction_detection') + AND ( + instr(item_key, '__') = 0 + OR substr(item_key, 1, instr(item_key, '__') - 1) NOT IN (SELECT id FROM atoms) + OR substr(item_key, instr(item_key, '__') + 2) NOT IN (SELECT id FROM atoms) + )", + [], +)? as u64; + + tx.commit().map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + Ok(total) + } + /// Persist a new tag proposal. + pub(crate) fn save_tag_proposal_impl( + &self, + proposal: &crate::health::TagProposal, + ) -> Result<(), AtomicCoreError> { + let actions_json = serde_json::to_string(&proposal.actions) + .map_err(|e| AtomicCoreError::Validation(e.to_string()))?; + let mut conn = self.db.conn.lock().map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let tx = conn.transaction().map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + tx.execute( + "INSERT INTO tag_proposals (id, summary, actions_json, created_at) VALUES (?1, ?2, ?3, ?4)", + params![proposal.id, proposal.summary, actions_json, proposal.generated_at], + )?; + tx.commit().map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + Ok(()) + } + + /// Load a proposal by ID. + pub(crate) fn get_tag_proposal_impl( + &self, + id: &str, + ) -> Result, AtomicCoreError> { + let conn = self.db.read_conn()?; + let result = conn.query_row( + "SELECT id, summary, actions_json, created_at FROM tag_proposals WHERE id = ?1", + params![id], + |row| { + Ok(( + row.get::<_, String>(0)?, + row.get::<_, String>(1)?, + row.get::<_, String>(2)?, + row.get::<_, String>(3)?, + )) + }, + ); + match result { + Ok((id, summary, actions_json, generated_at)) => { + let actions: Vec = + serde_json::from_str(&actions_json) + .map_err(|e| AtomicCoreError::Validation(e.to_string()))?; + Ok(Some(crate::health::TagProposal { id, summary, actions, generated_at })) + } + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), + Err(e) => Err(AtomicCoreError::DatabaseOperation(e.to_string())), + } + } + + /// Load the latest un-applied proposal. + pub(crate) fn get_latest_tag_proposal_impl( + &self, + ) -> Result, AtomicCoreError> { + let conn = self.db.read_conn()?; + let result = conn.query_row( + "SELECT id, summary, actions_json, created_at FROM tag_proposals WHERE applied_at IS NULL ORDER BY created_at DESC LIMIT 1", + [], + |row| { + Ok(( + row.get::<_, String>(0)?, + row.get::<_, String>(1)?, + row.get::<_, String>(2)?, + row.get::<_, String>(3)?, + )) + }, + ); + match result { + Ok((id, summary, actions_json, generated_at)) => { + let actions: Vec = + serde_json::from_str(&actions_json) + .map_err(|e| AtomicCoreError::Validation(e.to_string()))?; + Ok(Some(crate::health::TagProposal { id, summary, actions, generated_at })) + } + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), + Err(e) => Err(AtomicCoreError::DatabaseOperation(e.to_string())), + } + } + + /// Mark a proposal as applied. + pub(crate) fn mark_tag_proposal_applied_impl( + &self, + id: &str, + ) -> Result<(), AtomicCoreError> { + let now = chrono::Utc::now().to_rfc3339(); + let mut conn = self.db.conn.lock().map_err(|e| AtomicCoreError::Lock(e.to_string()))?; + let tx = conn.transaction().map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + tx.execute( + "UPDATE tag_proposals SET applied_at = ?1 WHERE id = ?2", + params![now, id], + )?; + tx.commit().map_err(|e| AtomicCoreError::DatabaseOperation(e.to_string()))?; + Ok(()) + } +} \ No newline at end of file diff --git a/crates/atomic-core/src/storage/sqlite/mod.rs b/crates/atomic-core/src/storage/sqlite/mod.rs index f2dbccb3..4d4db542 100644 --- a/crates/atomic-core/src/storage/sqlite/mod.rs +++ b/crates/atomic-core/src/storage/sqlite/mod.rs @@ -14,6 +14,7 @@ mod search; mod settings; mod tags; mod wiki; +pub(crate) mod health; use crate::db::Database; use crate::storage::traits::*; diff --git a/crates/atomic-core/src/storage/sqlite/wiki.rs b/crates/atomic-core/src/storage/sqlite/wiki.rs index 6759c5e8..3629a9e0 100644 --- a/crates/atomic-core/src/storage/sqlite/wiki.rs +++ b/crates/atomic-core/src/storage/sqlite/wiki.rs @@ -145,6 +145,17 @@ impl SqliteStorage { ) -> StorageResult<(Vec, i32)> { let conn = self.db.read_conn()?; + // Load per-DB wiki-excluded tag ids from settings (may be empty). + let excluded_tag_ids: Vec = conn + .query_row( + "SELECT value FROM settings WHERE key = 'wiki_excluded_tag_ids'", + [], + |row| row.get::<_, String>(0), + ) + .ok() + .and_then(|s| serde_json::from_str::>(&s).ok()) + .unwrap_or_default(); + // Get all descendant tag IDs (including the tag itself) let all_tag_ids = wiki::get_tag_hierarchy(&conn, tag_id).map_err(|e| AtomicCoreError::Wiki(e))?; @@ -197,6 +208,7 @@ impl SqliteStorage { centroid, &scoped_atom_ids, max_source_tokens, + &excluded_tag_ids, ) .map_err(|e| AtomicCoreError::Wiki(e))? } else { @@ -233,6 +245,17 @@ impl SqliteStorage { ) -> StorageResult, i32)>> { let conn = self.db.read_conn()?; + // Load per-DB wiki-excluded tag ids from settings (may be empty). + let excluded_tag_ids: Vec = conn + .query_row( + "SELECT value FROM settings WHERE key = 'wiki_excluded_tag_ids'", + [], + |row| row.get::<_, String>(0), + ) + .ok() + .and_then(|s| serde_json::from_str::>(&s).ok()) + .unwrap_or_default(); + // Get atoms added after the last update let mut new_atom_stmt = conn .prepare( @@ -271,6 +294,7 @@ impl SqliteStorage { centroid, &new_atom_id_set, max_source_tokens, + &excluded_tag_ids, ) .map_err(|e| AtomicCoreError::Wiki(e))? } else { diff --git a/crates/atomic-core/src/wiki/centroid.rs b/crates/atomic-core/src/wiki/centroid.rs index 58ba802b..d34db1e6 100644 --- a/crates/atomic-core/src/wiki/centroid.rs +++ b/crates/atomic-core/src/wiki/centroid.rs @@ -122,6 +122,7 @@ pub(crate) fn select_chunks_by_centroid( centroid_blob: &[u8], scoped_atom_ids: &std::collections::HashSet, max_source_tokens: usize, + excluded_tag_ids: &[String], ) -> Result, String> { // Fetch more than we need from vec_chunks since we'll filter by scope. // Over-fetch by 3x to account for chunks outside the tag hierarchy. @@ -147,7 +148,7 @@ pub(crate) fn select_chunks_by_centroid( // Batch-load chunk details for all candidates let chunk_ids: Vec<&str> = candidates.iter().map(|(id, _)| id.as_str()).collect(); - let chunk_details = batch_fetch_chunk_details(conn, &chunk_ids)?; + let chunk_details = super::batch_fetch_chunk_details_excluding_tags(conn, &chunk_ids, excluded_tag_ids)?; // Filter to scoped atoms and fill token budget let mut chunks = Vec::new(); diff --git a/crates/atomic-core/src/wiki/mod.rs b/crates/atomic-core/src/wiki/mod.rs index 2d58763e..d4497bb6 100644 --- a/crates/atomic-core/src/wiki/mod.rs +++ b/crates/atomic-core/src/wiki/mod.rs @@ -757,6 +757,28 @@ pub(crate) fn count_atoms_with_tags(conn: &Connection, tag_ids: &[String]) -> Re pub(crate) fn batch_fetch_chunk_details( conn: &Connection, chunk_ids: &[&str], +) -> Result, String> { + batch_fetch_chunk_details_filtered(conn, chunk_ids, &[]) +} + +/// Convenience: fetch chunk details and filter out any atom tagged with a +/// wiki-excluded tag (resolved from the caller-provided list). Same signature +/// as `batch_fetch_chunk_details` plus the exclusion set. +pub(crate) fn batch_fetch_chunk_details_excluding_tags( + conn: &Connection, + chunk_ids: &[&str], + excluded_tag_ids: &[String], +) -> Result, String> { + batch_fetch_chunk_details_filtered(conn, chunk_ids, excluded_tag_ids) +} + +/// Fetch chunk details, dropping chunks whose atom is tagged with any of +/// `excluded_tag_ids`. Use this when selecting source chunks for wiki +/// generation so excluded-tagged notes stay out of the LLM context. +pub(crate) fn batch_fetch_chunk_details_filtered( + conn: &Connection, + chunk_ids: &[&str], + excluded_tag_ids: &[String], ) -> Result, String> { let mut map = std::collections::HashMap::new(); // Batch in groups of 500 to stay under SQLite parameter limit @@ -789,6 +811,40 @@ pub(crate) fn batch_fetch_chunk_details( map.insert(id, (atom_id, chunk_index, content)); } } + + // Drop chunks whose atom is tagged with any excluded tag. Done after the + // fetch (not in the SQL) so the filter works regardless of how the caller + // paginates chunk_ids. + if !excluded_tag_ids.is_empty() && !map.is_empty() { + let atom_ids: std::collections::HashSet = + map.values().map(|(aid, _, _)| aid.clone()).collect(); + if !atom_ids.is_empty() { + let atom_vec: Vec = atom_ids.into_iter().collect(); + let a_ph = atom_vec.iter().map(|_| "?").collect::>().join(","); + let t_ph = excluded_tag_ids.iter().map(|_| "?").collect::>().join(","); + let query = format!( + "SELECT DISTINCT atom_id FROM atom_tags WHERE atom_id IN ({}) AND tag_id IN ({})", + a_ph, t_ph + ); + let mut params: Vec<&dyn rusqlite::ToSql> = Vec::new(); + for a in &atom_vec { params.push(a); } + for t in excluded_tag_ids { params.push(t); } + let mut stmt = conn.prepare(&query) + .map_err(|e| format!("Failed to prepare excluded-atoms query: {}", e))?; + let mut rows = stmt.query(rusqlite::params_from_iter(params)) + .map_err(|e| format!("Failed to query excluded atoms: {}", e))?; + let mut excluded: std::collections::HashSet = std::collections::HashSet::new(); + while let Some(row) = rows.next() + .map_err(|e| format!("Failed to read row: {}", e))? + { + let atom_id: String = row.get(0) + .map_err(|e| format!("Failed to get atom_id: {}", e))?; + excluded.insert(atom_id); + } + map.retain(|_, (aid, _, _)| !excluded.contains(aid)); + } + } + Ok(map) } @@ -1176,12 +1232,42 @@ pub fn delete_article(conn: &Connection, tag_id: &str) -> Result<(), String> { /// Load all wiki articles with tag names for list view, sorted by importance pub fn load_all_wiki_articles(conn: &Connection) -> Result, String> { + // Use a recursive CTE to compute the live atom count for each wiki article's tag + // hierarchy (tag + all descendants). This mirrors get_article_status exactly, so + // new_atoms_available here is always consistent with GET /api/wiki/{tag_id}/status. let mut stmt = conn .prepare( - "SELECT w.id, w.tag_id, t.name as tag_name, w.updated_at, w.atom_count, - (SELECT COUNT(*) FROM wiki_links wl WHERE wl.target_tag_id = w.tag_id) as inbound_links + "WITH RECURSIVE + -- Expand each wiki-article tag to include all its descendant tags. + -- Seeded only from tags that have a wiki article so the recursion is + -- bounded by the number of articles, not the full tag tree. + tag_tree(root_id, id) AS ( + SELECT t.id, t.id + FROM tags t + WHERE EXISTS (SELECT 1 FROM wiki_articles wa WHERE wa.tag_id = t.id) + UNION ALL + SELECT tt.root_id, t.id + FROM tags t + JOIN tag_tree tt ON t.parent_id = tt.id + ), + -- Live atom count per root tag (counts atoms in the entire subtree). + live_counts(tag_id, cnt) AS ( + SELECT tt.root_id, COUNT(DISTINCT at.atom_id) + FROM tag_tree tt + JOIN atom_tags at ON at.tag_id = tt.id + GROUP BY tt.root_id + ) + SELECT + w.id, + w.tag_id, + t.name AS tag_name, + w.updated_at, + w.atom_count, + (SELECT COUNT(*) FROM wiki_links wl WHERE wl.target_tag_id = w.tag_id) AS inbound_links, + MAX(0, COALESCE(lc.cnt, 0) - w.atom_count) AS new_atoms_available FROM wiki_articles w JOIN tags t ON w.tag_id = t.id + LEFT JOIN live_counts lc ON lc.tag_id = w.tag_id ORDER BY inbound_links DESC, w.atom_count DESC, w.updated_at DESC", ) .map_err(|e| format!("Failed to prepare wiki articles query: {}", e))?; @@ -1195,6 +1281,7 @@ pub fn load_all_wiki_articles(conn: &Connection) -> Result, +) -> String { + core.create_atom( + CreateAtomRequest { + content: content.to_string(), + source_url: source_url.map(|s| s.to_string()), + ..Default::default() + }, + |_| {}, + ) + .await + .expect("create atom") + .expect("atom inserted") + .atom + .id +} + +/// LLM returns confidence 0.9 → should relink to the top candidate. +#[tokio::test] +async fn broken_link_auto_resolve_relinks_on_high_confidence() { + let mock = MockAiServer::start().await; + + // We need a target atom with a source_url that will surface as a candidate. + let handle = setup_core(Backend::Sqlite, &mock.base_url()) + .await + .expect("harness"); + let core = &handle.core; + + // Create the target atom first. + let target_id = make_atom( + core, + "# Bravo Notes\n\nContent about bravo.", + Some("vault://notes/bravo.md"), + ) + .await; + + // Create the source atom with the broken link. + let source_id = make_atom( + core, + "see [bravo](./bravo.md) for details", + Some("vault://notes/alpha.md"), + ) + .await; + + // The LLM returns JSON selecting the target with confidence 0.9. + let llm_response = format!( + r#"{{"target_atom_id":"{target_id}","confidence":0.9,"reason":"exact title match"}}"# + ); + mock.mock_chat_completion(llm_response).await; + + let outcome = auto_resolve_broken_link(core, &source_id, "[bravo](./bravo.md)", "bravo") + .await + .expect("auto_resolve"); + + match outcome { + AutoResolveOutcome::Relinked { target_atom_id, confidence, .. } => { + assert_eq!(target_atom_id, target_id, "should relink to target"); + assert!(confidence >= 0.9 - f32::EPSILON, "confidence should be 0.9"); + } + other => panic!("expected Relinked, got: {other:?}"), + } + + // Verify the atom content was actually updated. + let updated = core.get_atom(&source_id).await.expect("get").expect("exists"); + assert!( + updated.atom.content.contains(&format!("atom://{target_id}")), + "content should contain relinked atom:// URI" + ); +} + +/// LLM returns confidence 0.3 → should skip (leave link unchanged). +#[tokio::test] +async fn broken_link_auto_resolve_skips_on_low_confidence() { + let mock = MockAiServer::start().await; + + let handle = setup_core(Backend::Sqlite, &mock.base_url()) + .await + .expect("harness"); + let core = &handle.core; + + // Create a target so there's at least one candidate. + let target_id = make_atom( + core, + "# Gamma Notes\n\nSomething.", + Some("vault://notes/gamma.md"), + ) + .await; + + let source_id = make_atom( + core, + "see [gamma](./gamma.md) here", + Some("vault://notes/beta.md"), + ) + .await; + + let llm_response = format!( + r#"{{"target_atom_id":"{target_id}","confidence":0.3,"reason":"uncertain match"}}"# + ); + mock.mock_chat_completion(llm_response).await; + + let outcome = auto_resolve_broken_link(core, &source_id, "[gamma](./gamma.md)", "gamma") + .await + .expect("auto_resolve"); + + match outcome { + AutoResolveOutcome::Skipped { reason } => { + assert!(reason.contains("0.30") || reason.contains("low confidence") || reason.contains("uncertain"), + "unexpected skip reason: {reason}"); + } + other => panic!("expected Skipped, got: {other:?}"), + } + + // Verify content is unchanged. + let updated = core.get_atom(&source_id).await.expect("get").expect("exists"); + assert!( + updated.atom.content.contains("[gamma](./gamma.md)"), + "content should be unchanged" + ); +} + +/// No candidates available → should remove the link. +#[tokio::test] +async fn broken_link_auto_resolve_removes_when_no_candidates() { + let mock = MockAiServer::start().await; + + let handle = setup_core(Backend::Sqlite, &mock.base_url()) + .await + .expect("harness"); + let core = &handle.core; + + // Create an atom with a broken link; no other atoms exist that would match. + let source_id = make_atom( + core, + "see [nonexistent](./totally-missing-xyz.md) here", + Some("vault://notes/src.md"), + ) + .await; + + // Mock should not be called since we return early, but provide a fallback anyway. + mock.mock_chat_completion(r#"{"target_atom_id":null,"confidence":0.0,"reason":"no match"}"#) + .await; + + let outcome = + auto_resolve_broken_link(core, &source_id, "[nonexistent](./totally-missing-xyz.md)", "nonexistent") + .await + .expect("auto_resolve"); + + match outcome { + AutoResolveOutcome::Removed { reason } => { + assert!(!reason.is_empty(), "reason must be non-empty: {reason}"); + } + // If there were candidates (e.g. source_url suffix match), the outcome might be + // Skipped — both are acceptable for the "no good target" case. + AutoResolveOutcome::Skipped { .. } => {} + AutoResolveOutcome::Relinked { .. } => panic!("should not have relinked with no valid candidates"), + } +} diff --git a/crates/atomic-core/tests/broken_link_scope_tests.rs b/crates/atomic-core/tests/broken_link_scope_tests.rs new file mode 100644 index 00000000..2e875cef --- /dev/null +++ b/crates/atomic-core/tests/broken_link_scope_tests.rs @@ -0,0 +1,115 @@ +//! Regression tests for broken-internal-links scope. +//! +//! Covers the gap where the link resolver only tried current-dir and +//! vault-root candidates for bare markdown hrefs. Atoms living in a sibling +//! subdirectory were incorrectly flagged as broken even though the picker +//! (which does a vault-wide title/URL search) could find them. +//! +//! After the fix, a bare markdown href that misses both exact-candidate +//! lookups falls back to a vault-wide filename-stem search mirroring the +//! wikilink resolution path. + +mod support; + +use atomic_core::health::compute::compute_single_check; +use atomic_core::CreateAtomRequest; +use support::{setup_core, Backend, MockAiServer}; + +async fn make_atom( + core: &atomic_core::AtomicCore, + content: &str, + source_url: Option<&str>, +) -> String { + core.create_atom( + CreateAtomRequest { + content: content.to_string(), + source_url: source_url.map(|s| s.to_string()), + ..Default::default() + }, + |_| {}, + ) + .await + .expect("create atom") + .expect("atom inserted") + .atom + .id +} + +fn count_from_check(data: &serde_json::Value) -> i64 { + data.get("broken_links") + .and_then(|v| v.as_i64()) + .unwrap_or_else(|| { + data.get("broken_link_list") + .and_then(|v| v.as_array()) + .map(|a| a.len() as i64) + .unwrap_or(0) + }) +} + +/// A bare markdown link `[x](glossary.md)` in an atom inside `references/` +/// must resolve against a target atom living in `shared/glossary.md` — not +/// be reported as broken. +#[tokio::test] +async fn broken_links_resolves_sibling_subdirectory_markdown() { + let mock = MockAiServer::start().await; + let handle = setup_core(Backend::Sqlite, &mock.base_url()) + .await + .expect("harness"); + let core = &handle.core; + + // Target lives in a different subdirectory than the source. The old + // resolver would only try `references/glossary.md` and + // `vault://notes/glossary.md`, missing this one. + let _target = make_atom( + core, + "# Glossary\n\nTerms.", + Some("vault://notes/shared/glossary.md"), + ) + .await; + + let _source = make_atom( + core, + "see [glossary](./glossary.md) for terms", + Some("vault://notes/references/onboarding.md"), + ) + .await; + + let (_, result) = compute_single_check(core, "broken_internal_links") + .await + .expect("compute"); + + assert_eq!( + count_from_check(&result.data), + 0, + "expected 0 broken links after subdir fallback; data={:?}", + result.data, + ); +} + +/// Control: a markdown link whose stem does not exist anywhere in the vault +/// must still be reported as broken. +#[tokio::test] +async fn broken_links_still_flags_truly_missing_markdown() { + let mock = MockAiServer::start().await; + let handle = setup_core(Backend::Sqlite, &mock.base_url()) + .await + .expect("harness"); + let core = &handle.core; + + let _source = make_atom( + core, + "see [missing](./totally-missing-xyz.md) here", + Some("vault://notes/references/note.md"), + ) + .await; + + let (_, result) = compute_single_check(core, "broken_internal_links") + .await + .expect("compute"); + + assert!( + count_from_check(&result.data) >= 1, + "truly-missing markdown href must still be flagged; data={:?}", + result.data, + ); +} diff --git a/crates/atomic-core/tests/dismissal_gc_tests.rs b/crates/atomic-core/tests/dismissal_gc_tests.rs new file mode 100644 index 00000000..605afed9 --- /dev/null +++ b/crates/atomic-core/tests/dismissal_gc_tests.rs @@ -0,0 +1,74 @@ +//! Unit tests for the health_dismissals GC storage method. + +use atomic_core::{AtomicCore, CreateAtomRequest}; +use tempfile::TempDir; + +async fn setup() -> (AtomicCore, TempDir) { + let dir = TempDir::new().expect("create tempdir"); + let core = AtomicCore::open_or_create(dir.path().join("test.db")).expect("open sqlite"); + (core, dir) +} + +#[tokio::test] +async fn test_gc_dismissals_removes_expired_and_orphaned() { + let (core, _dir) = setup().await; + + // Create two real atoms A and B. + let atom_a = core + .create_atom( + CreateAtomRequest { + content: "Atom A content".to_string(), + ..Default::default() + }, + |_| {}, + ) + .await + .expect("create A"); + let atom_b = core + .create_atom( + CreateAtomRequest { + content: "Atom B content".to_string(), + ..Default::default() + }, + |_| {}, + ) + .await + .expect("create B"); + + let id_a = atom_a.unwrap().atom.id.clone(); + let id_b = atom_b.unwrap().atom.id.clone(); + + // Dismissal for a non-existent atom C (should be GC'd). + let fake_c = "00000000-0000-0000-0000-000000000099"; + core.dismiss_health_item("boilerplate_pollution", fake_c, "orphan", None) + .await + .expect("dismiss fake C"); + + // Pair dismissal A__B — both atoms exist (should survive). + let pair_ab = format!("{}__{}", id_a, id_b); + core.dismiss_health_item("content_overlap", &pair_ab, "reviewed", None) + .await + .expect("dismiss pair A__B"); + + // Expired deferred dismissal B__A. + let pair_ba = format!("{}__{}", id_b, id_a); + let past = "2000-01-01T00:00:00+00:00"; + core.dismiss_health_item("content_overlap", &pair_ba, "deferred", Some(past)) + .await + .expect("dismiss expired pair B__A"); + + // GC should remove 2 rows: orphan C + expired B__A. + let removed = core + .gc_health_dismissals() + .await + .expect("gc_dismissals"); + assert_eq!(removed, 2, "expected 2 rows deleted, got {removed}"); + + // Only A__B survives. + let remaining = core + .list_dismissed_keys("content_overlap") + .await + .expect("list dismissed"); + let keys: Vec = remaining.into_iter().map(|(k, _)| k).collect(); + assert_eq!(keys, vec![pair_ab], "only pair A__B should remain"); +} diff --git a/crates/atomic-core/tests/health_batch_tests.rs b/crates/atomic-core/tests/health_batch_tests.rs new file mode 100644 index 00000000..15787eba --- /dev/null +++ b/crates/atomic-core/tests/health_batch_tests.rs @@ -0,0 +1,42 @@ +//! Integration tests for health batch dismissal. +//! +//! Tests that multiple dismiss operations all succeed — analogous to what +//! the batch endpoint does per-item. + +use atomic_core::AtomicCore; +use tempfile::TempDir; + +async fn setup() -> (AtomicCore, TempDir) { + let dir = TempDir::new().expect("create tempdir"); + let core = AtomicCore::open_or_create(dir.path().join("test.db")) + .expect("open sqlite"); + (core, dir) +} + +#[tokio::test] +async fn test_batch_dismiss_records_all_items() { + let (core, _dir) = setup().await; + + // Simulate what the batch endpoint does: dismiss multiple items in sequence. + core.dismiss_health_item("content_overlap", "a__b", "ignored_pair", None) + .await + .expect("dismiss a__b"); + core.dismiss_health_item("content_overlap", "c__d", "ignored_pair", None) + .await + .expect("dismiss c__d"); + + // Upsert semantics: re-dismissing with a different reason should not error. + core.dismiss_health_item("content_overlap", "a__b", "resolved_other", None) + .await + .expect("re-dismiss a__b"); + + // Undismiss succeeds. + core.undismiss_health_item("content_overlap", "a__b") + .await + .expect("undismiss a__b"); + + // Undismissing a non-existent key is idempotent. + core.undismiss_health_item("content_overlap", "does_not_exist") + .await + .expect("undismiss missing key is idempotent"); +} diff --git a/crates/atomic-core/tests/health_custom_integration.rs b/crates/atomic-core/tests/health_custom_integration.rs new file mode 100644 index 00000000..3577cfe0 --- /dev/null +++ b/crates/atomic-core/tests/health_custom_integration.rs @@ -0,0 +1,361 @@ +//! Integration tests for custom health checks wired into `compute_health`. +//! +//! These exercise the full path that unit tests in `health::custom` can't +//! reach: persistence via the settings table, key prefixing, weight +//! propagation into `aggregate_score`, informational semantics, and +//! collision avoidance between built-in and custom check keys. + +use atomic_core::health::custom::{result_key, CustomCheck, CustomRule, DomainMatchMode}; +use atomic_core::{AtomicCore, CreateAtomRequest}; +use tempfile::TempDir; + +async fn setup() -> (AtomicCore, TempDir) { + let dir = TempDir::new().expect("tempdir"); + let core = AtomicCore::open_or_create(dir.path().join("test.db")).expect("open sqlite"); + (core, dir) +} + +async fn make_atom(core: &AtomicCore, content: &str, source: Option<&str>) -> String { + let atom = core + .create_atom( + CreateAtomRequest { + content: content.to_string(), + source_url: source.map(|s| s.to_string()), + published_at: None, + tag_ids: vec![], + skip_if_source_exists: false, + }, + |_| {}, + ) + .await + .expect("create_atom") + .expect("created"); + atom.atom.id +} + +fn check(id: &str, weight: f64, rule: CustomRule) -> CustomCheck { + CustomCheck { + id: id.to_string(), + label: id.to_string(), + description: String::new(), + enabled: true, + weight, + rule, + } +} + +// --- Persistence ------------------------------------------------------------ + +#[tokio::test] +async fn custom_checks_round_trip_via_settings() { + let (core, _dir) = setup().await; + + assert!(core.get_custom_health_checks().await.unwrap().is_empty()); + + let checks = vec![ + check("c1", 0.0, CustomRule::RequireSource { tag_filter: None }), + check( + "c2", + 0.5, + CustomRule::ContentLength { + min_words: 10, + max_words: 0, + tag_filter: None, + }, + ), + ]; + core.set_custom_health_checks(&checks).await.unwrap(); + + let loaded = core.get_custom_health_checks().await.unwrap(); + assert_eq!(loaded.len(), 2); + assert_eq!(loaded[0].id, "c1"); + assert_eq!(loaded[1].id, "c2"); + assert!((loaded[1].weight - 0.5).abs() < f64::EPSILON); +} + +// --- compute_health wires custom checks into the report -------------------- + +#[tokio::test] +async fn compute_health_includes_custom_check_with_prefixed_key() { + let (core, _dir) = setup().await; + make_atom(&core, "no source", None).await; + make_atom(&core, "has source", Some("https://example.com/x")).await; + + core.set_custom_health_checks(&[check( + "needs_source", + 0.0, + CustomRule::RequireSource { tag_filter: None }, + )]) + .await + .unwrap(); + + let report = core.compute_health().await.expect("compute_health"); + let key = result_key("needs_source"); + assert_eq!(key, "custom.needs_source"); + + let res = report.checks.get(&key).expect("custom check present"); + assert_eq!(res.data["total_considered"], 2); + assert_eq!(res.data["flagged_count"], 1); + assert_eq!(res.status, "error"); + assert!(res.requires_review); +} + +// --- Zero-weight / disabled ------------------------------------------------ + +#[tokio::test] +async fn zero_weight_custom_check_is_informational_and_not_scored() { + let (core, _dir) = setup().await; + make_atom(&core, "no source", None).await; + make_atom(&core, "has source", Some("https://x.com/a")).await; + + // Zero weight → informational. Should not drag overall_score below + // what it would have been without the rule. + core.set_custom_health_checks(&[check( + "info_only", + 0.0, + CustomRule::RequireSource { tag_filter: None }, + )]) + .await + .unwrap(); + let with_info = core.compute_health().await.unwrap(); + let res = with_info.checks.get("custom.info_only").unwrap(); + assert!(res.informational, "zero-weight rule must be informational"); + + // Wipe rule → recompute baseline score. + core.set_custom_health_checks(&[]).await.unwrap(); + let baseline = core.compute_health().await.unwrap(); + + assert_eq!( + with_info.overall_score, baseline.overall_score, + "informational custom check must not affect overall score" + ); +} + +#[tokio::test] +async fn positive_weight_custom_check_lowers_overall_score() { + let (core, _dir) = setup().await; + // 3 atoms, one sourced → 2/3 flagged → check score ~33. + make_atom(&core, "a", None).await; + make_atom(&core, "b", None).await; + make_atom(&core, "c", Some("https://x.com/c")).await; + + core.set_custom_health_checks(&[]).await.unwrap(); + let baseline = core.compute_health().await.unwrap(); + + core.set_custom_health_checks(&[check( + "scored", + 1.0, + CustomRule::RequireSource { tag_filter: None }, + )]) + .await + .unwrap(); + let with_rule = core.compute_health().await.unwrap(); + + let res = with_rule.checks.get("custom.scored").unwrap(); + assert!(!res.informational, "positive weight must be scored"); + assert!(res.score < 100); + + assert!( + with_rule.overall_score < baseline.overall_score, + "positive-weight failing rule must drop overall score \ + (baseline={}, with_rule={})", + baseline.overall_score, + with_rule.overall_score, + ); +} + +#[tokio::test] +async fn disabled_custom_check_does_not_appear_in_report() { + let (core, _dir) = setup().await; + make_atom(&core, "no source", None).await; + + let mut chk = check("c1", 1.0, CustomRule::RequireSource { tag_filter: None }); + chk.enabled = false; + core.set_custom_health_checks(&[chk]).await.unwrap(); + + let report = core.compute_health().await.expect("compute_health"); + assert!( + !report.checks.contains_key("custom.c1"), + "disabled rule must not be evaluated" + ); +} + +// --- Key collision avoidance ------------------------------------------------ + +#[tokio::test] +async fn custom_check_cannot_collide_with_builtin_key() { + let (core, _dir) = setup().await; + make_atom(&core, "x", None).await; + + // Custom rule with an id that matches a built-in check name — prefix + // must prevent collision. + core.set_custom_health_checks(&[check( + "tag_health", + 1.0, + CustomRule::RequireSource { tag_filter: None }, + )]) + .await + .unwrap(); + + let report = core.compute_health().await.unwrap(); + // Built-in check survives untouched. + assert!(report.checks.contains_key("tag_health")); + // Custom check lands under prefixed key. + assert!(report.checks.contains_key("custom.tag_health")); + + let builtin = &report.checks["tag_health"]; + let custom = &report.checks["custom.tag_health"]; + // Different shapes — builtin has no "custom" flag. + assert!(custom.data.get("custom").and_then(|v| v.as_bool()).unwrap_or(false)); + assert!(builtin.data.get("custom").is_none()); +} + +// --- Multi-rule batch ------------------------------------------------------- + +#[tokio::test] +async fn multiple_custom_checks_all_evaluate_independently() { + let (core, _dir) = setup().await; + make_atom(&core, "short", None).await; + make_atom( + &core, + "one two three four five six seven eight nine ten eleven twelve", + Some("https://arxiv.org/abs/1"), + ) + .await; + make_atom( + &core, + "plenty of words here for the length check", + Some("https://reddit.com/r/x"), + ) + .await; + + core.set_custom_health_checks(&[ + check("src", 0.0, CustomRule::RequireSource { tag_filter: None }), + check( + "len", + 0.0, + CustomRule::ContentLength { + min_words: 5, + max_words: 0, + tag_filter: None, + }, + ), + check( + "dom", + 0.0, + CustomRule::SourceDomainMatches { + domains: vec!["arxiv.org".into()], + mode: DomainMatchMode::Allowlist, + tag_filter: None, + }, + ), + ]) + .await + .unwrap(); + + let report = core.compute_health().await.unwrap(); + let src = &report.checks["custom.src"]; + let len = &report.checks["custom.len"]; + let dom = &report.checks["custom.dom"]; + + // One atom without source. + assert_eq!(src.data["flagged_count"], 1); + // "short" has 1 word (<5). + assert_eq!(len.data["flagged_count"], 1); + // Allowlist = arxiv.org → reddit.com is flagged; no-source atom is + // skipped by SourceDomainMatches (has no source). + assert_eq!(dom.data["flagged_count"], 1); +} + +// --- Fault isolation -------------------------------------------------------- + +#[tokio::test] +async fn malformed_regex_does_not_break_builtin_checks() { + let (core, _dir) = setup().await; + make_atom(&core, "x", None).await; + + core.set_custom_health_checks(&[check( + "bad_regex", + 1.0, + CustomRule::ContentRegex { + pattern: "(?P = report + .checks + .keys() + .filter(|k| k.starts_with("custom.")) + .collect(); + assert!( + custom_keys.is_empty() || !custom_keys.iter().any(|k| k.as_str() == "custom.bad_regex"), + "malformed rule must not leak a bogus successful result; got {custom_keys:?}" + ); +} + + +// --- Preview (dry run) ----------------------------------------------------- + +#[tokio::test] +async fn preview_reports_counts_and_sample_without_persisting() { + let (core, _dir) = setup().await; + make_atom(&core, "a", None).await; + make_atom(&core, "b", None).await; + make_atom(&core, "c", Some("https://x.com/c")).await; + + let preview = core + .preview_custom_health_check(&CustomRule::RequireSource { tag_filter: None }) + .await + .expect("preview"); + assert_eq!(preview.total_considered, 3); + assert_eq!(preview.flagged_count, 2); + assert_eq!(preview.sample.len(), 2); + + // Preview must not leak into persisted checks. + assert!(core.get_custom_health_checks().await.unwrap().is_empty()); +} + +#[tokio::test] +async fn preview_sample_capped_at_ten() { + let (core, _dir) = setup().await; + for i in 0..15 { + make_atom(&core, &format!("atom {i}"), None).await; + } + let preview = core + .preview_custom_health_check(&CustomRule::RequireSource { tag_filter: None }) + .await + .unwrap(); + assert_eq!(preview.total_considered, 15); + assert_eq!(preview.flagged_count, 15); + assert_eq!( + preview.sample.len(), + 10, + "sample should be capped regardless of total flagged" + ); +} + +#[tokio::test] +async fn preview_surfaces_malformed_regex_as_error() { + let (core, _dir) = setup().await; + make_atom(&core, "x", None).await; + + let err = core + .preview_custom_health_check(&CustomRule::ContentRegex { + pattern: "(?P (AtomicCore, TempDir) { + let dir = TempDir::new().expect("create tempdir"); + let core = AtomicCore::open_or_create(dir.path().join("test.db")).expect("open sqlite"); + (core, dir) +} + +async fn create(core: &AtomicCore, content: &str) { + core.create_atom( + CreateAtomRequest { + content: content.to_string(), + ..Default::default() + }, + |_| {}, + ) + .await + .expect("create_atom"); +} + +fn count_flagged(result: &atomic_core::health::HealthCheckResult) -> usize { + // `issues` is an object keyed by issue type (`very_short`, `very_long`, + // `no_headings`, `no_source`, …). We only care about the length-based + // ones since those are what HealthThresholds controls. + let issues = match result.data.get("issues").and_then(|v| v.as_object()) { + Some(o) => o, + None => return 0, + }; + let length_based = ["very_short", "very_long"]; + length_based + .iter() + .filter_map(|k| issues.get(*k)) + .filter_map(|issue| issue.get("count").and_then(|c| c.as_u64())) + .map(|c| c as usize) + .sum() +} +#[tokio::test] +async fn test_content_quality_threshold_controls_flagged_count() { + let (core, _dir) = setup().await; + + // Three atoms: 50 chars, 150 chars, 50_000 chars. + create(&core, &"a".repeat(50)).await; + create(&core, &"b".repeat(150)).await; + create(&core, &"c".repeat(50_000)).await; + + // --- Strict window: short < 200, long > 40_000. Should flag all three + // (50 + 150 are both short, 50_000 is long). + let strict = HealthConfig { + thresholds: HealthThresholds { + content_quality_short_chars: 200, + content_quality_long_chars: 40_000, + ..HealthThresholds::default() + }, + ..HealthConfig::default() + }; + core.set_health_config(&strict) + .await + .expect("set strict config"); + + let (_, strict_result) = atomic_core::health::compute_single_check(&core, "content_quality") + .await + .expect("compute content_quality (strict)"); + let strict_count = count_flagged(&strict_result); + assert!( + strict_count >= 3, + "strict thresholds should flag at least the 3 seeded atoms, got {strict_count} (data={})", + strict_result.data + ); + + // --- Lax window: short < 20, long > 100_000. Should flag none of our three. + let lax = HealthConfig { + thresholds: HealthThresholds { + content_quality_short_chars: 20, + content_quality_long_chars: 100_000, + ..HealthThresholds::default() + }, + ..HealthConfig::default() + }; + core.set_health_config(&lax).await.expect("set lax config"); + + let (_, lax_result) = atomic_core::health::compute_single_check(&core, "content_quality") + .await + .expect("compute content_quality (lax)"); + let lax_count = count_flagged(&lax_result); + assert!( + lax_count < strict_count, + "lax thresholds should flag strictly fewer atoms than strict (strict={strict_count}, lax={lax_count})" + ); +} + diff --git a/crates/atomic-core/tests/strip_boilerplate_tests.rs b/crates/atomic-core/tests/strip_boilerplate_tests.rs new file mode 100644 index 00000000..66ee86bd --- /dev/null +++ b/crates/atomic-core/tests/strip_boilerplate_tests.rs @@ -0,0 +1,114 @@ +mod support; + +use atomic_core::CreateAtomRequest; +use support::{setup_core, Backend, MockAiServer}; + +#[tokio::test] +async fn strip_boilerplate_dry_run_does_not_mutate() { + let mock = MockAiServer::start().await; + mock.mock_chat_completion("Only the unique part.").await; + let handle = setup_core(Backend::Sqlite, &mock.base_url()) + .await + .expect("harness"); + let core = &handle.core; + + let result = core + .create_atom( + CreateAtomRequest { + content: "# Template\n## Subject\nOnly the unique part.\n".to_string(), + ..Default::default() + }, + |_| {}, + ) + .await + .expect("create") + .expect("atom inserted"); + + let (proposed, action) = + atomic_core::health::llm_fixes::strip_boilerplate_atom(core, &result.atom.id, true) + .await + .expect("strip dry"); + + assert!(proposed.contains("unique part"), "proposed: {proposed}"); + assert!(action.is_none(), "dry_run must not emit a FixAction"); + + let reloaded = core + .get_atom(&result.atom.id) + .await + .expect("get") + .expect("exists"); + assert!( + reloaded.atom.content.contains("# Template"), + "content unchanged in dry_run" + ); +} + +#[tokio::test] +async fn strip_boilerplate_apply_mutates_and_logs_fix() { + let mock = MockAiServer::start().await; + mock.mock_chat_completion("Stripped content.").await; + let handle = setup_core(Backend::Sqlite, &mock.base_url()) + .await + .expect("harness"); + let core = &handle.core; + + let result = core + .create_atom( + CreateAtomRequest { + content: "Original with template cruft.".to_string(), + ..Default::default() + }, + |_| {}, + ) + .await + .expect("create") + .expect("atom inserted"); + + let (proposed, action) = + atomic_core::health::llm_fixes::strip_boilerplate_atom(core, &result.atom.id, false) + .await + .expect("strip apply"); + + assert_eq!(proposed, "Stripped content."); + assert!(action.is_some(), "apply must return a FixAction"); + + let reloaded = core + .get_atom(&result.atom.id) + .await + .expect("get") + .expect("exists"); + assert_eq!(reloaded.atom.content, "Stripped content."); +} + +#[tokio::test] +async fn strip_boilerplate_rejects_empty_response() { + let mock = MockAiServer::start().await; + mock.mock_chat_completion("EMPTY").await; + let handle = setup_core(Backend::Sqlite, &mock.base_url()) + .await + .expect("harness"); + let core = &handle.core; + + let result = core + .create_atom( + CreateAtomRequest { + content: "All boilerplate.".to_string(), + ..Default::default() + }, + |_| {}, + ) + .await + .expect("create") + .expect("atom inserted"); + + let err = + atomic_core::health::llm_fixes::strip_boilerplate_atom(core, &result.atom.id, false) + .await + .expect_err("must reject EMPTY"); + + let msg = err.to_string(); + assert!( + msg.to_lowercase().contains("boilerplate") || msg.to_lowercase().contains("empty"), + "unexpected error message: {msg}" + ); +} diff --git a/crates/atomic-core/tests/support/mod.rs b/crates/atomic-core/tests/support/mod.rs index 652a4848..d70efc0e 100644 --- a/crates/atomic-core/tests/support/mod.rs +++ b/crates/atomic-core/tests/support/mod.rs @@ -96,6 +96,21 @@ impl MockAiServer { self.counters.embedding_requests.store(0, Ordering::Relaxed); self.counters.chat_requests.store(0, Ordering::Relaxed); } + + /// Mount a higher-priority chat-completion handler that returns a fixed `content`. + /// Because wiremock matches later-mounted mocks first, this overrides the default + /// `ChatResponder` for all subsequent requests. + pub async fn mock_chat_completion(&self, content: impl Into) { + Mock::given(method("POST")) + .and(path("/v1/chat/completions")) + .respond_with(FixedChatResponder { + content: content.into(), + counters: Arc::clone(&self.counters), + }) + .with_priority(1) + .mount(&self.server) + .await; + } } /// Bag-of-words style unit-vector embedder. Two texts sharing words land at @@ -226,6 +241,29 @@ impl Respond for ChatResponder { } } +struct FixedChatResponder { + content: String, + counters: Arc, +} + +impl Respond for FixedChatResponder { + fn respond(&self, _req: &Request) -> ResponseTemplate { + self.counters.chat_requests.fetch_add(1, Ordering::Relaxed); + ResponseTemplate::new(200).set_body_json(json!({ + "id": "chatcmpl-test", + "object": "chat.completion", + "created": 0, + "model": "mock-model", + "choices": [{ + "index": 0, + "message": {"role": "assistant", "content": self.content}, + "finish_reason": "stop" + }], + "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2} + })) + } +} + // ==================== Backend switch + test harness ==================== pub enum Backend { diff --git a/crates/atomic-server/src/lib.rs b/crates/atomic-server/src/lib.rs index 67ebefc9..441e4a27 100644 --- a/crates/atomic-server/src/lib.rs +++ b/crates/atomic-server/src/lib.rs @@ -169,6 +169,14 @@ pub use utoipa_scalar::{Scalar, Servable}; routes::feeds::poll_feed, // Logs routes::logs::get_logs, + // Health + routes::health::get_health_knowledge, + routes::health::run_health_fix, + routes::health::apply_manual_fix, + routes::health::undo_health_fix, + routes::health::get_health_history, + routes::health::get_recent_fixes, + routes::health::compute_single_check, ), components(schemas( // Core types @@ -284,6 +292,20 @@ pub use utoipa_scalar::{Scalar, Servable}; atomic_core::CreateFeedRequest, atomic_core::UpdateFeedRequest, error::ApiErrorResponse, + // Health + atomic_core::health::HealthReport, + atomic_core::health::HealthCheckResult, + atomic_core::health::HealthStatus, + atomic_core::health::FixRequest, + atomic_core::health::FixResponse, + atomic_core::health::FixAction, + atomic_core::health::SkippedFix, + atomic_core::health::DuplicatePair, + atomic_core::health::WikiGap, + atomic_core::health::WikiStaleEntry, + atomic_core::health::audit::StoredHealthReport, + atomic_core::health::audit::HealthFixLog, + routes::health::ManualFixRequest, )), tags( (name = "atoms", description = "Atom CRUD operations"), @@ -307,6 +329,7 @@ pub use utoipa_scalar::{Scalar, Servable}; (name = "briefings", description = "Daily briefing generation and history"), (name = "logs", description = "Server log access"), (name = "oauth", description = "OAuth 2.0 endpoints for remote MCP clients"), + (name = "health", description = "Knowledge base health checks and auto-remediation"), ), security( ("bearer_auth" = []), diff --git a/crates/atomic-server/src/main.rs b/crates/atomic-server/src/main.rs index 64ff5d67..eba10e48 100644 --- a/crates/atomic-server/src/main.rs +++ b/crates/atomic-server/src/main.rs @@ -447,6 +447,10 @@ async fn run_server( registry.register(Arc::new( atomic_core::graph_maintenance::GraphMaintenanceTask, )); + registry.register(Arc::new( + atomic_core::health::task::HealthMaintenanceTask, + )); + registry.register(Arc::new(atomic_core::health::gc_task::DismissalGcTask)); let registry = Arc::new(registry); let mut interval = tokio::time::interval(Duration::from_secs(15)); diff --git a/crates/atomic-server/src/routes/atoms.rs b/crates/atomic-server/src/routes/atoms.rs index 23533024..edc9d031 100644 --- a/crates/atomic-server/src/routes/atoms.rs +++ b/crates/atomic-server/src/routes/atoms.rs @@ -418,6 +418,22 @@ pub async fn delete_atom(db: Db, path: web::Path) -> HttpResponse { ok_or_error(db.0.delete_atom(&id).await) } +#[derive(serde::Deserialize)] +pub struct SetLockedBody { pub locked: bool } + +/// POST /api/atoms/{id}/lock — set the lock flag +pub async fn set_atom_locked( + db: Db, + path: web::Path, + body: web::Json, +) -> HttpResponse { + let id = path.into_inner(); + match db.0.set_atom_locked(&id, body.locked).await { + Ok(()) => HttpResponse::NoContent().finish(), + Err(e) => crate::error::error_response(e), + } +} + // ==================== Tags ==================== #[derive(Deserialize, IntoParams)] diff --git a/crates/atomic-server/src/routes/health.rs b/crates/atomic-server/src/routes/health.rs new file mode 100644 index 00000000..9f39c92b --- /dev/null +++ b/crates/atomic-server/src/routes/health.rs @@ -0,0 +1,888 @@ +//! Knowledge health check routes. +//! +//! GET /api/health/knowledge — compute & return full health report +//! POST /api/health/fix — auto-fix by tier +//! POST /api/health/fix/{check}/{item} — fix one item requiring review +//! POST /api/health/undo/{fix_id} — undo a previously applied fix +//! GET /api/health/history — last N stored reports (trending) +//! GET /api/health/fixes/recent — recent fix log entries + +use crate::db_extractor::Db; +use actix_web::{web, HttpResponse}; +use atomic_core::compaction; +use atomic_core::health::{ + self, audit, pair_key, FixRequest, FixResponse, HealthCheckResult, HealthReport, +}; +use atomic_core::health::audit::{HealthFixLog, StoredHealthReport}; +use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; + +/// Request body for the per-item fix endpoint. +#[derive(Deserialize, Serialize, ToSchema)] +pub struct ManualFixRequest { + pub action: String, + // Optional per-action fields + pub url: Option, + pub parent_id: Option, + pub into_tag_id: Option, + pub content: Option, + pub winner_atom_id: Option, + pub loser_atom_id: Option, + #[serde(default)] + pub dry_run: bool, +} + +/// Query params for history endpoint. +#[derive(Deserialize)] +pub struct HistoryQuery { + pub limit: Option, +} + +// ==================== GET /api/health/knowledge ==================== + +#[utoipa::path( + get, + path = "/api/health/knowledge", + tag = "health", + responses( + (status = 200, description = "Current health report", body = HealthReport), + (status = 500, description = "Internal server error"), + ), + security(("bearer_auth" = [])), +)] +pub async fn get_health_knowledge(db: Db) -> HttpResponse { + match health::compute_health(&db.0).await { + Ok(report) => HttpResponse::Ok().json(report), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== POST /api/health/fix ==================== + +#[utoipa::path( + post, + path = "/api/health/fix", + tag = "health", + request_body = FixRequest, + responses( + (status = 200, description = "Fix response", body = FixResponse), + (status = 500, description = "Internal server error"), + ), + security(("bearer_auth" = [])), +)] +pub async fn run_health_fix(db: Db, body: web::Json) -> HttpResponse { + match health::run_fix(&db.0, &body).await { + Ok(response) => HttpResponse::Ok().json(response), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== POST /api/health/fix/{check}/{item_id} ==================== + +#[utoipa::path( + post, + path = "/api/health/fix/{check}/{item_id}", + tag = "health", + params( + ("check" = String, Path, description = "Check name"), + ("item_id" = String, Path, description = "Item identifier"), + ), + request_body = ManualFixRequest, + responses( + (status = 200, description = "Action taken or no-op"), + (status = 400, description = "Bad request"), + (status = 500, description = "Internal server error"), + ), + security(("bearer_auth" = [])), +)] +pub async fn apply_manual_fix( + db: Db, + path: web::Path<(String, String)>, + body: web::Json, +) -> HttpResponse { + let (check, item_id) = path.into_inner(); + match apply_manual_fix_impl(&db, &check, &item_id, body.into_inner()).await { + Ok(v) => HttpResponse::Ok().json(v), + Err(e) => crate::error::error_response(e), + } +} + +async fn apply_manual_fix_impl( + db: &Db, + check: &str, + item_id: &str, + req: ManualFixRequest, +) -> Result { + use atomic_core::error::AtomicCoreError; + let core = &db.0; + + match (check, req.action.as_str()) { + // === Existing: content-overlap LLM merge === + ("duplicate_detection" | "content_overlap", "merge_with_llm") => { + let parts: Vec<&str> = item_id.splitn(2, "__").collect(); + let (atom_a, atom_b) = if parts.len() == 2 { + (parts[0], parts[1]) + } else { + let legacy: Vec<&str> = item_id.splitn(2, '_').collect(); + if legacy.len() != 2 { + return Err(AtomicCoreError::Validation( + "item_id must be 'atom_a__atom_b' for pair actions".into(), + )); + } + (legacy[0], legacy[1]) + }; + match atomic_core::health::llm_fixes::merge_duplicate_pair( + core, atom_a, atom_b, req.dry_run, + ) + .await + { + Ok(Some(action)) => Ok(serde_json::to_value(action).unwrap_or_default()), + Ok(None) => Ok(serde_json::json!({"status": "no_op"})), + Err(e) => Err(e), + } + } + + // === Content overlap: keep_a / keep_b (archive the loser) === + ("content_overlap" | "duplicate_detection", action @ ("keep_a" | "keep_b")) => { + let parts: Vec<&str> = item_id.splitn(2, "__").collect(); + if parts.len() != 2 { + return Err(AtomicCoreError::Validation( + "item_id must be 'atom_a__atom_b'".into(), + )); + } + let (a, b) = (parts[0], parts[1]); + let loser = if action == "keep_a" { b } else { a }; + core.delete_atom(loser).await?; + let key = pair_key(a, b); + let _ = core + .dismiss_health_item("content_overlap", &key, "resolved_other", None) + .await; + Ok(serde_json::json!({"status": "ok"})) + } + + // === Dismiss actions (all reviewable checks) === + (check_name, action @ ("dismiss" | "mark_intentional" | "ignore_pair" | "defer")) => { + let reason = match action { + "mark_intentional" => "intentional_no_source", + "ignore_pair" => "ignored_pair", + "defer" => "deferred", + _ => "resolved_other", + }; + let expires_at = if action == "defer" { + let exp = chrono::Utc::now() + chrono::Duration::days(7); + Some(exp.to_rfc3339()) + } else { + None + }; + core + .dismiss_health_item(check_name, item_id, reason, expires_at.as_deref()) + .await?; + Ok(serde_json::json!({"status": "dismissed"})) + } + + // === Content quality: add source URL === + ("content_quality", "add_source") => { + let url = match req.url.as_deref() { + Some(u) if !u.trim().is_empty() => u.trim().to_string(), + _ => { + return Err(AtomicCoreError::Validation( + "url is required for add_source".into(), + )) + } + }; + match core.get_atom(item_id).await? { + Some(atom) => { + let tag_ids: Vec = atom.tags.iter().map(|t| t.id.clone()).collect(); + let upd = atomic_core::UpdateAtomRequest { + content: atom.atom.content.clone(), + source_url: Some(url), + published_at: atom.atom.published_at.clone(), + tag_ids: Some(tag_ids), + }; + core.update_atom(item_id, upd, |_| {}).await?; + Ok(serde_json::json!({"status": "ok"})) + } + None => Err(AtomicCoreError::NotFound("atom not found".into())), + } + } + + // === Tag health: move_under (reparent rootless tag) === + ("tag_health", "move_under") => { + let parent_id = match req.parent_id.as_deref() { + Some(p) if !p.trim().is_empty() => p.trim().to_string(), + _ => { + return Err(AtomicCoreError::Validation( + "parent_id is required for move_under".into(), + )) + } + }; + match core.get_tag_by_id(item_id).await? { + Some((name, _)) => { + core.update_tag(item_id, &name, Some(&parent_id)).await?; + Ok(serde_json::json!({"status": "ok"})) + } + None => Err(AtomicCoreError::NotFound("tag not found".into())), + } + } + + // === Tag health: merge (winner becomes into_tag_id, loser is item_id) === + ("tag_health", "merge") => { + let winner_id = match req.into_tag_id.as_deref() { + Some(p) if !p.trim().is_empty() => p.trim().to_string(), + _ => { + return Err(AtomicCoreError::Validation( + "into_tag_id is required for merge".into(), + )) + } + }; + let winner_name = match core.get_tag_by_id(&winner_id).await? { + Some((name, _)) => name, + None => return Err(AtomicCoreError::NotFound("target tag not found".into())), + }; + let loser_name = match core.get_tag_by_id(item_id).await? { + Some((name, _)) => name, + None => return Err(AtomicCoreError::NotFound("source tag not found".into())), + }; + let merges = vec![compaction::TagMerge { + winner_name, + loser_name, + reason: "manual_review_merge".to_string(), + }]; + core.apply_tag_merges(&merges).await?; + Ok(serde_json::json!({"status": "ok"})) + } + + // === Tag health: merge_tags (similar-name pair — item_id = "a_id__b_id", winner = into_tag_id) === + ("tag_health", "merge_tags") => { + let winner_id = match req.into_tag_id.as_deref() { + Some(p) if !p.trim().is_empty() => p.trim().to_string(), + _ => { + return Err(AtomicCoreError::Validation( + "into_tag_id is required for merge_tags".into(), + )) + } + }; + let parts: Vec<&str> = item_id.splitn(2, "__").collect(); + if parts.len() != 2 { + return Err(AtomicCoreError::Validation( + "item_id must be 'a_id__b_id' for merge_tags".into(), + )); + } + let (a_id, b_id) = (parts[0], parts[1]); + let loser_id = if winner_id == a_id { b_id } else { a_id }; + let winner_name = match core.get_tag_by_id(&winner_id).await? { + Some((name, _)) => name, + None => return Err(AtomicCoreError::NotFound("winner tag not found".into())), + }; + let loser_name = match core.get_tag_by_id(loser_id).await? { + Some((name, _)) => name, + None => return Err(AtomicCoreError::NotFound("loser tag not found".into())), + }; + let merges = vec![compaction::TagMerge { + winner_name, + loser_name, + reason: "similar_name_pair_merge".to_string(), + }]; + core.apply_tag_merges(&merges).await?; + // Also dismiss the pair so it doesn't resurface + let _ = core + .dismiss_health_item("tag_health", item_id, "merged", None) + .await; + Ok(serde_json::json!({"status": "ok"})) + } + // === Tag health: delete_tag (manual review — single-atom non-autotag or any tag) === + ("tag_health", "delete_tag") => { + core.delete_tag(item_id, false).await?; + audit::log_fix( + core, + "tag_health", + "delete_tag", + "low", + None, + Some(&[item_id.to_string()]), + serde_json::json!([{"id": item_id}]), + serde_json::json!({"deleted": 1}), + None, + None, + ) + .await?; + Ok(serde_json::json!({"status": "ok"})) + } + + // === Tag health: merge_into_parent (reparent a tag) === + ("tag_health", "merge_into_parent") => { + let new_parent_id = match req.into_tag_id.as_deref() { + Some(p) if !p.trim().is_empty() => p.trim().to_string(), + _ => { + return Err(AtomicCoreError::Validation( + "into_tag_id is required for merge_into_parent".into(), + )) + } + }; + match core.get_tag_by_id(item_id).await? { + Some((name, _)) => { + core.update_tag(item_id, &name, Some(&new_parent_id)).await?; + Ok(serde_json::json!({"status": "ok"})) + } + None => Err(AtomicCoreError::NotFound("tag not found".into())), + } + } + + // === Boilerplate: re-embed === + ("boilerplate_pollution", "reembed") => { + core.retry_embedding(item_id, |_| {}).await?; + Ok(serde_json::json!({"status": "ok"})) + } + + // === Content overlap: merge_with_edited_content === + ("content_overlap" | "duplicate_detection", "merge_with_edited_content") => { + let parts: Vec<&str> = item_id.splitn(2, "__").collect(); + if parts.len() != 2 { + return Err(AtomicCoreError::Validation( + "item_id must be 'atom_a__atom_b'".into(), + )); + } + let winner = match req.winner_atom_id.as_deref() { + Some(w) if !w.is_empty() => w.to_string(), + _ => return Err(AtomicCoreError::Validation("winner_atom_id required".into())), + }; + let loser = match req.loser_atom_id.as_deref() { + Some(l) if !l.is_empty() => l.to_string(), + _ => return Err(AtomicCoreError::Validation("loser_atom_id required".into())), + }; + let content = match req.content.as_deref() { + Some(c) if !c.trim().is_empty() => c.to_string(), + _ => return Err(AtomicCoreError::Validation("content required".into())), + }; + let action = atomic_core::health::llm_fixes::apply_edited_merge(core, &winner, &loser, &content).await?; + let key = atomic_core::health::pair_key(parts[0], parts[1]); + let _ = core.dismiss_health_item("content_overlap", &key, "resolved_other", None).await; + Ok(serde_json::to_value(action).unwrap_or_default()) + } + + // === Broken internal links: remove-link === + ("broken_internal_links", "remove_link") => { + let link_raw = match req.content.as_deref() { + Some(c) if !c.trim().is_empty() => c.to_string(), + _ => return Err(AtomicCoreError::Validation("content (link_raw) is required for remove_link".into())), + }; + let action = atomic_core::health::fixes::remove_broken_link(core, item_id, &link_raw).await?; + Ok(serde_json::to_value(action).unwrap_or_default()) + } + + // === Broken internal links: relink === + ("broken_internal_links", "relink") => { + let link_raw = match req.content.as_deref() { + Some(c) if !c.trim().is_empty() => c.to_string(), + _ => return Err(AtomicCoreError::Validation("content (link_raw) is required for relink".into())), + }; + let target_atom_id = match req.into_tag_id.as_deref() { + Some(t) if !t.trim().is_empty() => t.trim().to_string(), + _ => return Err(AtomicCoreError::Validation("into_tag_id (target_atom_id) is required for relink".into())), + }; + let action = atomic_core::health::fixes::relink_broken_link(core, item_id, &link_raw, &target_atom_id).await?; + Ok(serde_json::to_value(action).unwrap_or_default()) + } + + // === Broken internal links: auto_resolve === + ("broken_internal_links", "auto_resolve") => { + let link_raw = match req.content.as_deref() { + Some(c) if !c.trim().is_empty() => c.to_string(), + _ => return Err(AtomicCoreError::Validation("content (link_raw) is required for auto_resolve".into())), + }; + let link_text = req.url.as_deref().unwrap_or("").to_string(); + let outcome = atomic_core::health::llm_fixes::auto_resolve_broken_link( + core, item_id, &link_raw, &link_text, + ).await?; + Ok(serde_json::to_value(&outcome).unwrap_or_default()) + } + + // === Content overlap / duplicate: verify with LLM === + ("content_overlap" | "duplicate_detection", "verify_with_llm") => { + let parts: Vec<&str> = item_id.splitn(2, "__").collect(); + let (atom_a, atom_b) = if parts.len() == 2 { + (parts[0], parts[1]) + } else { + return Err(AtomicCoreError::Validation( + "item_id must be 'atom_a__atom_b' for verify_with_llm".into(), + )); + }; + let (is_duplicate, reason) = + atomic_core::health::llm_fixes::verify_overlap_pair(core, atom_a, atom_b).await?; + Ok(serde_json::json!({"is_duplicate": is_duplicate, "reason": reason})) + } + + // === Contradiction detection: verify with LLM === + ("contradiction_detection", "verify_with_llm") => { + let parts: Vec<&str> = item_id.splitn(2, "__").collect(); + let (atom_a, atom_b) = if parts.len() == 2 { + (parts[0], parts[1]) + } else { + return Err(AtomicCoreError::Validation( + "item_id must be 'atom_a__atom_b' for verify_with_llm".into(), + )); + }; + let (is_real, reason) = + atomic_core::health::llm_fixes::verify_contradiction_pair(core, atom_a, atom_b).await?; + Ok(serde_json::json!({"is_contradiction": is_real, "reason": reason})) + } + + // === Contradiction detection: merge with LLM === + ("contradiction_detection", "merge_with_llm") => { + let parts: Vec<&str> = item_id.splitn(2, "__").collect(); + let (atom_a, atom_b) = if parts.len() == 2 { + (parts[0], parts[1]) + } else { + return Err(AtomicCoreError::Validation( + "item_id must be 'atom_a__atom_b' for merge_with_llm".into(), + )); + }; + match atomic_core::health::llm_fixes::merge_contradicting_pair( + core, atom_a, atom_b, req.dry_run, + ) + .await + { + Ok(Some(action)) => Ok(serde_json::to_value(action).unwrap_or_default()), + Ok(None) => Ok(serde_json::json!({"status": "no_op"})), + Err(e) => Err(e), + } + } + + _ => Err(AtomicCoreError::Validation(format!( + "unsupported check '{}' or action '{}'", + check, req.action + ))), + } +} + +// ==================== POST /api/health/fix/batch ==================== + +#[derive(Debug, Deserialize)] +pub struct BatchFixItem { + pub check: String, + pub item_id: String, + pub action: String, + #[serde(default)] pub url: Option, + #[serde(default)] pub parent_id: Option, + #[serde(default)] pub into_tag_id: Option, + #[serde(default)] pub content: Option, + #[serde(default)] pub winner_atom_id: Option, + #[serde(default)] pub loser_atom_id: Option, + #[serde(default)] pub dry_run: bool, +} + +#[derive(Debug, Deserialize)] +pub struct BatchFixRequest { + pub items: Vec, +} + +pub async fn apply_manual_fix_batch( + db: Db, + body: web::Json, +) -> HttpResponse { + let req = body.into_inner(); + let mut results = Vec::with_capacity(req.items.len()); + for item in req.items { + let single = ManualFixRequest { + action: item.action.clone(), + url: item.url, + parent_id: item.parent_id, + into_tag_id: item.into_tag_id, + content: item.content, + winner_atom_id: item.winner_atom_id, + loser_atom_id: item.loser_atom_id, + dry_run: item.dry_run, + }; + match apply_manual_fix_impl(&db, &item.check, &item.item_id, single).await { + Ok(_) => results.push(serde_json::json!({ + "check": item.check, + "item_id": item.item_id, + "ok": true + })), + Err(e) => results.push(serde_json::json!({ + "check": item.check, + "item_id": item.item_id, + "ok": false, + "error": e.to_string() + })), + } + } + HttpResponse::Ok().json(serde_json::json!({"results": results})) +} + +// ==================== POST /api/health/undo/{fix_id} ==================== + +#[utoipa::path( + post, + path = "/api/health/undo/{fix_id}", + tag = "health", + params( + ("fix_id" = String, Path, description = "Fix ID from the audit log"), + ), + responses( + (status = 200, description = "Undo successful"), + (status = 404, description = "Fix not found"), + (status = 500, description = "Internal server error"), + ), + security(("bearer_auth" = [])), +)] +pub async fn undo_health_fix(db: Db, path: web::Path) -> HttpResponse { + let fix_id = path.into_inner(); + match audit::undo(&db.0, &fix_id).await { + Ok(()) => HttpResponse::Ok().json(serde_json::json!({"status": "ok", "fix_id": fix_id})), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== GET /api/health/history ==================== + +#[utoipa::path( + get, + path = "/api/health/history", + tag = "health", + params( + ("limit" = Option, Query, description = "Maximum number of reports to return"), + ), + responses( + (status = 200, description = "Stored health reports", body = Vec), + (status = 500, description = "Internal server error"), + ), + security(("bearer_auth" = [])), +)] +pub async fn get_health_history(db: Db, query: web::Query) -> HttpResponse { + let limit = query.limit.unwrap_or(30).min(90); + match db.0.get_health_reports(limit).await { + Ok(reports) => HttpResponse::Ok().json(reports), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== GET /api/health/fixes/recent ==================== + +#[utoipa::path( + get, + path = "/api/health/fixes/recent", + tag = "health", + params( + ("limit" = Option, Query, description = "Maximum number of fix log entries"), + ), + responses( + (status = 200, description = "Recent fix log entries", body = Vec), + (status = 500, description = "Internal server error"), + ), + security(("bearer_auth" = [])), +)] +pub async fn get_recent_fixes(db: Db, query: web::Query) -> HttpResponse { + let limit = query.limit.unwrap_or(20).min(100); + match audit::get_recent_fixes(&db.0, limit).await { + Ok(fixes) => HttpResponse::Ok().json(fixes), + Err(e) => crate::error::error_response(e), + } +} + + +// ==================== POST /api/health/check/{check_name} ==================== + +#[utoipa::path( + post, + path = "/api/health/check/{check_name}", + tag = "health", + params( + ("check_name" = String, Path, description = "Health check name to run in isolation"), + ), + responses( + (status = 200, description = "Check result", body = HealthCheckResult), + (status = 400, description = "Unknown check name"), + (status = 500, description = "Internal server error"), + ), + security(("bearer_auth" = [])), +)] +pub async fn compute_single_check( + db: Db, + path: web::Path, +) -> HttpResponse { + let check_name = path.into_inner(); + match health::compute_single_check(&db.0, &check_name).await { + Ok((_name, result)) => HttpResponse::Ok().json(result), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== POST /api/health/contradiction-summary/{atom_a}/{atom_b} ==================== + +pub async fn contradiction_summary_handler( + db: Db, + path: web::Path<(String, String)>, +) -> HttpResponse { + let (a, b) = path.into_inner(); + match atomic_core::health::llm_fixes::contradiction_summary(&db.0, &a, &b).await { + Ok(summary) => HttpResponse::Ok().json(serde_json::json!({"summary": summary})), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== POST /api/health/strip-boilerplate/{atom_id} ==================== + +#[derive(Debug, Deserialize, Default)] +pub struct StripBoilerplateQuery { + #[serde(default)] + pub dry_run: bool, +} + +pub async fn strip_boilerplate_handler( + db: Db, + path: web::Path, + query: web::Query, +) -> HttpResponse { + let atom_id = path.into_inner(); + match atomic_core::health::llm_fixes::strip_boilerplate_atom(&db.0, &atom_id, query.dry_run).await { + Ok((content, action)) => HttpResponse::Ok().json(serde_json::json!({ + "content": content, + "action": action, + "dry_run": query.dry_run + })), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== GET /api/health/broken-link-suggest ==================== + +#[derive(Deserialize)] +pub struct BrokenLinkSuggestQuery { + pub q: String, + #[serde(default)] + pub limit: Option, +} + +pub async fn broken_link_suggest_handler( + db: Db, + query: web::Query, +) -> HttpResponse { + let limit = query.limit.unwrap_or(5).min(20).max(1); + match db.0.suggest_atoms_for_broken_link(&query.q, limit).await { + Ok(rows) => { + let suggestions: Vec = rows.into_iter().map(|(atom_id, title, source_url, score)| { + serde_json::json!({ + "atom_id": atom_id, + "title": title, + "source_url": source_url, + "score": score, + }) + }).collect(); + HttpResponse::Ok().json(serde_json::json!({ "suggestions": suggestions })) + } + Err(e) => crate::error::error_response(e), + } +} + +#[derive(serde::Deserialize, Default)] +pub struct AutoResolveAllQuery { + #[serde(default)] + pub max: Option, +} + +pub async fn broken_links_auto_resolve_all( + db: Db, + body: web::Json, +) -> HttpResponse { + let max = body + .get("max") + .and_then(|v| v.as_u64()) + .map(|n| n as usize) + .unwrap_or(25); + match atomic_core::health::llm_fixes::auto_resolve_all_broken_links(&db.0, max).await { + Ok(result) => HttpResponse::Ok().json(result), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== POST /api/health/verify/{check} ==================== + +#[derive(Debug, serde::Deserialize)] +pub struct VerifyBatchBody { + pub item_ids: Vec, + pub max: Option, +} + +pub async fn verify_batch_handler( + db: Db, + path: web::Path, + body: web::Json, +) -> HttpResponse { + let check = path.into_inner(); + let body = body.into_inner(); + let limit = body.max.unwrap_or(50) as usize; + let ids: Vec = body.item_ids.into_iter().take(limit).collect(); + let core = &db.0; + + let mut checked = 0u32; + let mut kept = 0u32; + let mut dismissed_ids: Vec = Vec::new(); + + for item_id in &ids { + let parts: Vec<&str> = item_id.splitn(2, "__").collect(); + if parts.len() != 2 { + continue; + } + let (atom_a, atom_b) = (parts[0], parts[1]); + checked += 1; + let result = match check.as_str() { + "content_overlap" | "duplicate_detection" => { + atomic_core::health::llm_fixes::verify_overlap_pair(core, atom_a, atom_b) + .await + .map(|(is_dup, _)| is_dup) + } + "contradiction_detection" => { + atomic_core::health::llm_fixes::verify_contradiction_pair(core, atom_a, atom_b) + .await + .map(|(is_real, _)| is_real) + } + _ => break, + }; + match result { + Ok(true) => kept += 1, + Ok(false) => dismissed_ids.push(item_id.clone()), + Err(_) => {} + } + } + + HttpResponse::Ok().json(serde_json::json!({ + "checked": checked, + "kept": kept, + "dismissed_ids": dismissed_ids, + })) +} + +// ==================== Tag proposal handlers ==================== + +/// POST /api/health/tag-proposal — generate a new LLM proposal. +pub async fn create_tag_proposal(db: Db) -> HttpResponse { + match atomic_core::health::llm_fixes::propose_tag_restructure(&db.0).await { + Ok(proposal) => HttpResponse::Ok().json(proposal), + Err(e) => crate::error::error_response(e), + } +} + +#[derive(serde::Deserialize)] +pub struct ApplyTagProposalRequest { + #[serde(default)] + pub accepted_indices: Vec, +} + +/// POST /api/health/tag-proposal/{proposal_id}/apply +pub async fn apply_tag_proposal( + db: Db, + path: web::Path, + body: web::Json, +) -> HttpResponse { + let proposal_id = path.into_inner(); + match atomic_core::health::llm_fixes::apply_tag_proposal( + &db.0, + &proposal_id, + &body.accepted_indices, + ) + .await + { + Ok(actions) => HttpResponse::Ok().json(actions), + Err(e) => crate::error::error_response(e), + } +} + +/// GET /api/health/tag-proposal/latest +pub async fn get_latest_tag_proposal(db: Db) -> HttpResponse { + match db.0.get_latest_tag_proposal().await { + Ok(Some(proposal)) => HttpResponse::Ok().json(proposal), + Ok(None) => HttpResponse::NotFound().json(serde_json::json!({"error": "no pending proposal"})), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== Health Config ==================== + +/// GET /api/health/config +pub async fn get_health_config(db: Db) -> HttpResponse { + match db.0.get_health_config().await { + Ok(config) => HttpResponse::Ok().json(config), + Err(e) => crate::error::error_response(e), + } +} + +/// PUT /api/health/config +pub async fn set_health_config( + db: Db, + body: web::Json, +) -> HttpResponse { + match db.0.set_health_config(&body.into_inner()).await { + Ok(()) => HttpResponse::NoContent().finish(), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== Wiki exclusion ==================== + +#[derive(serde::Deserialize)] +pub struct SetWikiExcludedTagsBody { pub tag_ids: Vec } + +/// GET /api/wiki/excluded-tags +pub async fn get_wiki_excluded_tags(db: Db) -> HttpResponse { + match db.0.get_wiki_excluded_tag_ids().await { + Ok(ids) => HttpResponse::Ok().json(serde_json::json!({ "tag_ids": ids })), + Err(e) => crate::error::error_response(e), + } +} + +/// PUT /api/wiki/excluded-tags +pub async fn set_wiki_excluded_tags( + db: Db, + body: web::Json, +) -> HttpResponse { + match db.0.set_wiki_excluded_tag_ids(&body.into_inner().tag_ids).await { + Ok(()) => HttpResponse::NoContent().finish(), + Err(e) => crate::error::error_response(e), + } +} + +// ==================== Custom health checks ==================== + +/// GET /api/health/custom-checks +pub async fn get_custom_health_checks(db: Db) -> HttpResponse { + match db.0.get_custom_health_checks().await { + Ok(checks) => HttpResponse::Ok().json(serde_json::json!({ "checks": checks })), + Err(e) => crate::error::error_response(e), + } +} + +#[derive(serde::Deserialize)] +pub struct SetCustomHealthChecksBody { + pub checks: Vec, +} + +/// PUT /api/health/custom-checks +pub async fn set_custom_health_checks( + db: Db, + body: web::Json, +) -> HttpResponse { + match db.0.set_custom_health_checks(&body.into_inner().checks).await { + Ok(()) => HttpResponse::NoContent().finish(), + Err(e) => crate::error::error_response(e), + } +} + +#[derive(serde::Deserialize)] +pub struct PreviewCustomHealthCheckBody { + pub rule: atomic_core::health::custom::CustomRule, +} + +/// POST /api/health/custom-checks/preview +/// +/// Dry-runs an unsaved rule against the current DB so the UI can show +/// "this would flag N atoms" while the user is tuning parameters. +pub async fn preview_custom_health_check( + db: Db, + body: web::Json, +) -> HttpResponse { + match db.0.preview_custom_health_check(&body.into_inner().rule).await { + Ok(result) => HttpResponse::Ok().json(result), + Err(e) => crate::error::error_response(e), + } +} \ No newline at end of file diff --git a/crates/atomic-server/src/routes/import.rs b/crates/atomic-server/src/routes/import.rs index 544fe337..6ea35b37 100644 --- a/crates/atomic-server/src/routes/import.rs +++ b/crates/atomic-server/src/routes/import.rs @@ -37,7 +37,23 @@ pub async fn import_obsidian_vault( .import_obsidian_vault(&body.vault_path, body.max_notes, on_event, on_progress) .await { - Ok(result) => HttpResponse::Ok().json(result), + Ok(result) => { + // Fire-and-forget health maintenance after bulk import + let core = db.0.clone(); + tokio::spawn(async move { + if let Ok(report) = core.compute_health().await { + if report.overall_score < 95 { + let req = atomic_core::health::FixRequest { + checks: None, + mode: "auto".to_string(), + include_medium: false, + }; + let _ = core.run_health_fix(&req).await; + } + } + }); + HttpResponse::Ok().json(result) + } Err(e) => crate::error::error_response(e), } } diff --git a/crates/atomic-server/src/routes/mod.rs b/crates/atomic-server/src/routes/mod.rs index 709523b8..d0006583 100644 --- a/crates/atomic-server/src/routes/mod.rs +++ b/crates/atomic-server/src/routes/mod.rs @@ -21,6 +21,7 @@ pub mod settings; pub mod setup; pub mod utils; pub mod wiki; +pub mod health; use actix_web::web; @@ -50,6 +51,7 @@ pub fn configure_routes(cfg: &mut web::ServiceConfig) { web::post().to(atoms::process_atom_pipeline), ); cfg.route("/atoms/{id}", web::delete().to(atoms::delete_atom)); + cfg.route("/atoms/{id}/lock", web::post().to(atoms::set_atom_locked)); cfg.route("/atoms/{id}/similar", web::get().to(search::find_similar)); cfg.route( "/atoms/{id}/embedding-status", @@ -348,4 +350,32 @@ pub fn configure_routes(cfg: &mut web::ServiceConfig) { // Logs cfg.route("/logs", web::get().to(logs::get_logs)); -} + + // Health + cfg.route("/health/knowledge", web::get().to(health::get_health_knowledge)); + cfg.route("/health/fix", web::post().to(health::run_health_fix)); + cfg.route( + "/health/fix/{check}/{item_id}", + web::post().to(health::apply_manual_fix), + ); + cfg.route("/health/undo/{fix_id}", web::post().to(health::undo_health_fix)); + cfg.route("/health/history", web::get().to(health::get_health_history)); + cfg.route("/health/fixes/recent", web::get().to(health::get_recent_fixes)); + cfg.route("/health/check/{check_name}", web::post().to(health::compute_single_check)); + cfg.route("/health/contradiction-summary/{atom_a}/{atom_b}", web::post().to(health::contradiction_summary_handler)); + cfg.route("/health/fix/batch", web::post().to(health::apply_manual_fix_batch)); + cfg.route("/health/strip-boilerplate/{atom_id}", web::post().to(health::strip_boilerplate_handler)); + cfg.route("/health/broken-link-suggest", web::get().to(health::broken_link_suggest_handler)); + cfg.route("/health/broken-links/auto-resolve-all", web::post().to(health::broken_links_auto_resolve_all)); + cfg.route("/health/verify/{check}", web::post().to(health::verify_batch_handler)); + cfg.route("/health/tag-proposal", web::post().to(health::create_tag_proposal)); + cfg.route("/health/tag-proposal/latest", web::get().to(health::get_latest_tag_proposal)); + cfg.route("/health/tag-proposal/{proposal_id}/apply", web::post().to(health::apply_tag_proposal)); + cfg.route("/health/config", web::get().to(health::get_health_config)); + cfg.route("/health/config", web::put().to(health::set_health_config)); + cfg.route("/wiki/excluded-tags", web::get().to(health::get_wiki_excluded_tags)); + cfg.route("/wiki/excluded-tags", web::put().to(health::set_wiki_excluded_tags)); + cfg.route("/health/custom-checks", web::get().to(health::get_custom_health_checks)); + cfg.route("/health/custom-checks", web::put().to(health::set_custom_health_checks)); + cfg.route("/health/custom-checks/preview", web::post().to(health::preview_custom_health_check)); +} \ No newline at end of file diff --git a/crates/atomic-server/tests/api_health_config.rs b/crates/atomic-server/tests/api_health_config.rs new file mode 100644 index 00000000..88230efb --- /dev/null +++ b/crates/atomic-server/tests/api_health_config.rs @@ -0,0 +1,252 @@ +//! Integration tests for the health-config REST endpoints. +//! +//! GET /api/health/config → HealthConfig (defaults if unset) +//! PUT /api/health/config → round-trip persistence + threshold validation (400 on bad input) + +use actix_web::{test as actix_test, web, App}; +use serde_json::{json, Value}; +use std::sync::Arc; +use tokio::sync::broadcast; + +struct TestCtx { + _temp: tempfile::TempDir, + state: web::Data, + token: String, +} + +impl TestCtx { + async fn new() -> Self { + let temp = tempfile::TempDir::new().unwrap(); + let manager = Arc::new(atomic_core::DatabaseManager::new(temp.path()).unwrap()); + let (_info, raw_token) = manager + .active_core() + .await + .unwrap() + .create_api_token("test") + .await + .unwrap(); + let (event_tx, _) = broadcast::channel(16); + let state = web::Data::new(atomic_server::state::AppState { + manager, + event_tx, + public_url: None, + log_buffer: atomic_server::log_buffer::LogBuffer::new(16), + export_jobs: atomic_server::export_jobs::ExportJobManager::for_tests( + temp.path().join("exports"), + ), + }); + TestCtx { + _temp: temp, + state, + token: raw_token, + } + } + + fn auth_header(&self) -> (&str, String) { + ("Authorization", format!("Bearer {}", self.token)) + } +} + +fn test_app( + ctx: &TestCtx, +) -> App< + impl actix_web::dev::ServiceFactory< + actix_web::dev::ServiceRequest, + Config = (), + Response = actix_web::dev::ServiceResponse, + Error = actix_web::Error, + InitError = (), + >, +> { + App::new().app_data(ctx.state.clone()).service( + web::scope("/api") + .wrap(atomic_server::auth::BearerAuth { + state: ctx.state.clone(), + }) + .configure(atomic_server::routes::configure_routes), + ) +} + +#[actix_web::test] +async fn test_health_config_defaults_on_first_read() { + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + let req = actix_test::TestRequest::get() + .uri("/api/health/config") + .insert_header(ctx.auth_header()) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert_eq!(resp.status(), 200); + + let body: Value = actix_test::read_body_json(resp).await; + // All threshold defaults must be present for forward-compat. + let t = &body["thresholds"]; + assert_eq!(t["boilerplate_similarity"].as_f64().unwrap(), 0.99); + assert_eq!(t["boilerplate_min_clones"].as_i64().unwrap(), 2); + assert_eq!(t["content_quality_short_chars"].as_i64().unwrap(), 100); + assert_eq!(t["content_quality_long_chars"].as_i64().unwrap(), 15_000); + assert_eq!(t["wiki_min_atoms_per_tag"].as_i64().unwrap(), 5); + assert_eq!(t["semantic_graph_freshness_warning"].as_i64().unwrap(), 20); +} + +#[actix_web::test] +async fn test_health_config_put_round_trip() { + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + // Start from defaults and override a subset. + let payload = json!({ + "overrides": {}, + "thresholds": { + "boilerplate_similarity": 0.995, + "boilerplate_min_clones": 3, + "contradiction_similarity_min": 0.80, + "contradiction_similarity_max": 0.92, + "contradiction_shared_tags_min": 1, + "content_overlap_similarity_min": 0.55, + "content_overlap_similarity_max": 0.85, + "content_overlap_shared_tags_min": 2, + "content_quality_short_chars": 150, + "content_quality_long_chars": 20_000, + "wiki_min_atoms_per_tag": 10, + "tag_health_single_atom_threshold": 5, + "semantic_graph_freshness_warning": 50 + } + }); + + let req = actix_test::TestRequest::put() + .uri("/api/health/config") + .insert_header(ctx.auth_header()) + .set_json(&payload) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert!( + resp.status().is_success(), + "PUT failed: {}", + resp.status() + ); + + let req = actix_test::TestRequest::get() + .uri("/api/health/config") + .insert_header(ctx.auth_header()) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert_eq!(resp.status(), 200); + + let body: Value = actix_test::read_body_json(resp).await; + let t = &body["thresholds"]; + assert_eq!(t["boilerplate_similarity"].as_f64().unwrap(), 0.995); + assert_eq!(t["boilerplate_min_clones"].as_i64().unwrap(), 3); + assert_eq!(t["content_quality_short_chars"].as_i64().unwrap(), 150); + assert_eq!(t["wiki_min_atoms_per_tag"].as_i64().unwrap(), 10); + assert_eq!(t["semantic_graph_freshness_warning"].as_i64().unwrap(), 50); +} + +#[actix_web::test] +async fn test_health_config_rejects_similarity_out_of_range() { + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + let bad = json!({ + "overrides": {}, + "thresholds": { + "boilerplate_similarity": 1.5, + "boilerplate_min_clones": 2, + "contradiction_similarity_min": 0.80, + "contradiction_similarity_max": 0.92, + "contradiction_shared_tags_min": 1, + "content_overlap_similarity_min": 0.55, + "content_overlap_similarity_max": 0.85, + "content_overlap_shared_tags_min": 2, + "content_quality_short_chars": 100, + "content_quality_long_chars": 15_000, + "wiki_min_atoms_per_tag": 5, + "tag_health_single_atom_threshold": 3, + "semantic_graph_freshness_warning": 20 + } + }); + + let req = actix_test::TestRequest::put() + .uri("/api/health/config") + .insert_header(ctx.auth_header()) + .set_json(&bad) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert_eq!(resp.status(), 400); + + let body: Value = actix_test::read_body_json(resp).await; + assert!( + body["error"].as_str().unwrap_or("").contains("boilerplate_similarity"), + "expected boilerplate_similarity in error, got {}", + body + ); +} + +#[actix_web::test] +async fn test_health_config_rejects_inverted_window() { + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + let bad = json!({ + "overrides": {}, + "thresholds": { + "boilerplate_similarity": 0.99, + "boilerplate_min_clones": 2, + "contradiction_similarity_min": 0.95, + "contradiction_similarity_max": 0.90, + "contradiction_shared_tags_min": 1, + "content_overlap_similarity_min": 0.55, + "content_overlap_similarity_max": 0.85, + "content_overlap_shared_tags_min": 2, + "content_quality_short_chars": 100, + "content_quality_long_chars": 15_000, + "wiki_min_atoms_per_tag": 5, + "tag_health_single_atom_threshold": 3, + "semantic_graph_freshness_warning": 20 + } + }); + + let req = actix_test::TestRequest::put() + .uri("/api/health/config") + .insert_header(ctx.auth_header()) + .set_json(&bad) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert_eq!(resp.status(), 400); +} + +#[actix_web::test] +async fn test_health_config_partial_payload_uses_defaults() { + // Demonstrates serde-default forward-compat: client sends only the fields + // it knows about, server fills the rest from defaults. + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + let payload = json!({ + "overrides": {}, + "thresholds": { + "boilerplate_similarity": 0.995 + } + }); + + let req = actix_test::TestRequest::put() + .uri("/api/health/config") + .insert_header(ctx.auth_header()) + .set_json(&payload) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert!(resp.status().is_success(), "PUT failed: {}", resp.status()); + + let req = actix_test::TestRequest::get() + .uri("/api/health/config") + .insert_header(ctx.auth_header()) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + let body: Value = actix_test::read_body_json(resp).await; + let t = &body["thresholds"]; + assert_eq!(t["boilerplate_similarity"].as_f64().unwrap(), 0.995); + // Unset fields fell back to defaults. + assert_eq!(t["boilerplate_min_clones"].as_i64().unwrap(), 2); + assert_eq!(t["wiki_min_atoms_per_tag"].as_i64().unwrap(), 5); +} diff --git a/crates/atomic-server/tests/api_health_custom_checks.rs b/crates/atomic-server/tests/api_health_custom_checks.rs new file mode 100644 index 00000000..6f911bba --- /dev/null +++ b/crates/atomic-server/tests/api_health_custom_checks.rs @@ -0,0 +1,252 @@ +//! Integration tests for the custom-health-checks REST endpoints. +//! +//! GET /api/health/custom-checks → Vec +//! PUT /api/health/custom-checks → round-trip persistence + +use actix_web::{test as actix_test, web, App}; +use serde_json::{json, Value}; +use std::sync::Arc; +use tokio::sync::broadcast; + +struct TestCtx { + _temp: tempfile::TempDir, + state: web::Data, + token: String, +} + +impl TestCtx { + async fn new() -> Self { + let temp = tempfile::TempDir::new().unwrap(); + let manager = Arc::new(atomic_core::DatabaseManager::new(temp.path()).unwrap()); + let (_info, raw_token) = manager + .active_core() + .await + .unwrap() + .create_api_token("test") + .await + .unwrap(); + let (event_tx, _) = broadcast::channel(16); + let state = web::Data::new(atomic_server::state::AppState { + manager, + event_tx, + public_url: None, + log_buffer: atomic_server::log_buffer::LogBuffer::new(16), + export_jobs: atomic_server::export_jobs::ExportJobManager::for_tests( + temp.path().join("exports"), + ), + }); + TestCtx { + _temp: temp, + state, + token: raw_token, + } + } + + fn auth_header(&self) -> (&str, String) { + ("Authorization", format!("Bearer {}", self.token)) + } +} + +fn test_app( + ctx: &TestCtx, +) -> App< + impl actix_web::dev::ServiceFactory< + actix_web::dev::ServiceRequest, + Config = (), + Response = actix_web::dev::ServiceResponse, + Error = actix_web::Error, + InitError = (), + >, +> { + App::new().app_data(ctx.state.clone()).service( + web::scope("/api") + .wrap(atomic_server::auth::BearerAuth { + state: ctx.state.clone(), + }) + .configure(atomic_server::routes::configure_routes), + ) +} + +#[actix_web::test] +async fn test_custom_checks_get_default_empty() { + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + let req = actix_test::TestRequest::get() + .uri("/api/health/custom-checks") + .insert_header(ctx.auth_header()) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert_eq!(resp.status(), 200); + + let body: Value = actix_test::read_body_json(resp).await; + assert_eq!(body["checks"].as_array().unwrap().len(), 0); +} + +#[actix_web::test] +async fn test_custom_checks_put_and_get_round_trip() { + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + let payload = json!({ + "checks": [{ + "id": "needs_source", + "label": "Requires source URL", + "description": "All atoms must carry a source_url.", + "enabled": true, + "weight": 0.5, + "rule": { "kind": "require_source", "tag_filter": null } + }, { + "id": "tagged_enough", + "label": "Tag count bounds", + "description": "", + "enabled": false, + "weight": 0.0, + "rule": { "kind": "tag_cardinality", "min": 1, "max": 5, "tag_filter": null } + }] + }); + + let req = actix_test::TestRequest::put() + .uri("/api/health/custom-checks") + .insert_header(ctx.auth_header()) + .set_json(&payload) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert!( + resp.status().is_success(), + "PUT failed: {}", + resp.status() + ); + + let req = actix_test::TestRequest::get() + .uri("/api/health/custom-checks") + .insert_header(ctx.auth_header()) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert_eq!(resp.status(), 200); + + let body: Value = actix_test::read_body_json(resp).await; + let arr = body["checks"].as_array().unwrap(); + assert_eq!(arr.len(), 2); + assert_eq!(arr[0]["id"], "needs_source"); + assert_eq!(arr[0]["rule"]["kind"], "require_source"); + assert_eq!(arr[1]["id"], "tagged_enough"); + assert_eq!(arr[1]["rule"]["min"], 1); + assert_eq!(arr[1]["rule"]["max"], 5); + assert_eq!(arr[1]["enabled"], false); +} + +#[actix_web::test] +async fn test_custom_checks_put_overwrites_previous() { + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + let first = json!({ + "checks": [{ + "id": "a", + "label": "A", + "description": "", + "enabled": true, + "weight": 1.0, + "rule": { "kind": "require_source", "tag_filter": null } + }] + }); + let req = actix_test::TestRequest::put() + .uri("/api/health/custom-checks") + .insert_header(ctx.auth_header()) + .set_json(&first) + .to_request(); + assert!(actix_test::call_service(&app, req).await.status().is_success()); + + let second = json!({ "checks": [] }); + let req = actix_test::TestRequest::put() + .uri("/api/health/custom-checks") + .insert_header(ctx.auth_header()) + .set_json(&second) + .to_request(); + assert!(actix_test::call_service(&app, req).await.status().is_success()); + + let req = actix_test::TestRequest::get() + .uri("/api/health/custom-checks") + .insert_header(ctx.auth_header()) + .to_request(); + let body: Value = actix_test::read_body_json(actix_test::call_service(&app, req).await).await; + assert_eq!(body["checks"].as_array().unwrap().len(), 0); +} + +#[actix_web::test] +async fn test_custom_checks_requires_auth() { + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + let req = actix_test::TestRequest::get() + .uri("/api/health/custom-checks") + .to_request(); + match actix_test::try_call_service(&app, req).await { + Ok(resp) => assert_eq!(resp.status(), 401), + Err(err) => { + let resp = err.error_response(); + assert_eq!(resp.status(), 401); + } + } +} + + +// --- Preview --------------------------------------------------------------- + +#[actix_web::test] +async fn test_custom_checks_preview_returns_counts() { + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + // Seed two atoms — one missing source_url so RequireSource flags it. + let req = actix_test::TestRequest::post() + .uri("/api/atoms") + .insert_header(ctx.auth_header()) + .set_json(json!({ "content": "no source" })) + .to_request(); + assert!(actix_test::call_service(&app, req).await.status().is_success()); + let req = actix_test::TestRequest::post() + .uri("/api/atoms") + .insert_header(ctx.auth_header()) + .set_json(json!({ "content": "has source", "source_url": "https://example.com/a" })) + .to_request(); + assert!(actix_test::call_service(&app, req).await.status().is_success()); + + let req = actix_test::TestRequest::post() + .uri("/api/health/custom-checks/preview") + .insert_header(ctx.auth_header()) + .set_json(json!({ "rule": { "kind": "require_source", "tag_filter": null } })) + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert_eq!(resp.status(), 200); + + let body: Value = actix_test::read_body_json(resp).await; + assert_eq!(body["total_considered"], 2); + assert_eq!(body["flagged_count"], 1); + assert_eq!(body["sample"].as_array().unwrap().len(), 1); + + // Preview must not persist — list should still be empty. + let req = actix_test::TestRequest::get() + .uri("/api/health/custom-checks") + .insert_header(ctx.auth_header()) + .to_request(); + let body: Value = actix_test::read_body_json(actix_test::call_service(&app, req).await).await; + assert_eq!(body["checks"].as_array().unwrap().len(), 0); +} + +#[actix_web::test] +async fn test_custom_checks_preview_returns_error_for_malformed_regex() { + let ctx = TestCtx::new().await; + let app = actix_test::init_service(test_app(&ctx)).await; + + let req = actix_test::TestRequest::post() + .uri("/api/health/custom-checks/preview") + .insert_header(ctx.auth_header()) + .set_json(json!({ + "rule": { "kind": "content_regex", "pattern": "(?P { + virtualizer.measure(); +}, [flatTags, virtualizer]); +``` + +This tells the virtualizer to recalculate all item positions whenever the visible tag list changes. + +#### Change 2: Fix the scroll effect (lines 90–99) + +Replace the existing scroll effect with: + +```typescript +// Scroll to selected tag +useEffect(() => { + if (selectedTagId) { + const index = tagIndexMap.get(selectedTagId); + if (index !== undefined) { + // Scroll directly without setTimeout—measurements are now fresh + virtualizer.scrollToIndex(index, { align: 'center', behavior: 'smooth' }); + } + } +}, [selectedTagId, tagIndexMap, virtualizer]); +``` + +**Key changes:** +- Remove `setTimeout()` — no more arbitrary delays +- Change `align: 'auto'` to `align: 'center'` — clearer visual feedback +- Scroll only runs when `selectedTagId` changes (when you click a tag) + +### Why This Works + +1. **Expand a tag** → `flatTags` updates → `useEffect` calls `virtualizer.measure()` +2. **Virtualizer recalculates** all item positions (now includes newly visible child tags) +3. **Click a tag in the expanded group** → `selectedTagId` changes +4. **Scroll effect runs** → index lookup is now correct, virtualizer knows all positions +5. **Smooth scroll to center** — no stale state, no race conditions + +## Testing + +After applying the fix: + +```bash +npm run tauri dev +``` + +Then in the app: + +1. **Expand a tag** — should stay in place (no scroll) +2. **Click a tag text** — should scroll smoothly to center that tag +3. **Expand multiple nested tags** — smooth scrolling, no jank +4. **Rapid expand/collapse** — no stale index bugs +5. **Manual scroll + click** — scrolls to correct position + +## Code Location Reference + +| What | Where | +|------|-------| +| Virtualizer setup | `src/components/tags/TagTree.tsx:82–88` | +| Tag expansion toggle | `src/components/tags/TagNode.tsx:24–30` | +| Scroll effect (to modify) | `src/components/tags/TagTree.tsx:90–99` | +| Where to add remeasure | `src/components/tags/TagTree.tsx:after line 99` | + +## Why We Don't Scroll on Expand/Collapse + +- **Expand** is a tree-structure change, not a selection change +- Scrolling on every expand would be jarring (user would see tree expand, then jump) +- **Current behavior is correct:** expand in place, then click to scroll to selection + +Only scroll when the user explicitly selects a tag (clicks the text). + +## Performance Notes + +- `virtualizer.measure()` is cheap — it recalculates sizes, doesn't re-render +- Smooth scroll (`behavior: 'smooth'`) is hardware-accelerated +- No performance regression expected + +## Related Code + +- **Virtualizer setup**: `useVirtualizer()` hook from `@tanstack/react-virtual` +- **Tag selection**: `setSelectedTag()` in `src/stores/ui.ts:297–313` +- **Tree flattening**: `flattenVisibleTags()` in `src/components/tags/TagTree.tsx:20–45` diff --git a/docs/plans/2026-04-30-knowledge-health-dashboard/plan.md b/docs/plans/2026-04-30-knowledge-health-dashboard/plan.md new file mode 100644 index 00000000..ccb7801a --- /dev/null +++ b/docs/plans/2026-04-30-knowledge-health-dashboard/plan.md @@ -0,0 +1,1248 @@ +# Knowledge Base Health Dashboard with Auto-Remediation + +**Status:** Planning +**Project:** atomic +**Date:** 2026-04-30 +**Scope:** New feature + +## Executive Summary + +This plan implements a comprehensive health-check system for Atomic that detects and remediates data quality issues through a combination of deterministic SQL fixes and LLM-powered judgment calls. The system exposes two main endpoints (`GET /api/health/knowledge` and `POST /api/health/fix`), stores fix audits for undo capability, and integrates a dashboard widget for monitoring and manual remediation. + +The feature is designed to run automatically after bulk operations (imports, re-embeddings, tag deletions) and optionally on a nightly schedule. It will surface issues to the user through the daily briefing when necessary, and enable both automatic and manual fix workflows. + +**Key capabilities:** +- 11 distinct health checks covering embeddings, tagging, duplicates, contradictions, orphan tags, and content quality +- Tiered auto-fix safety model (Safe/Low/Medium/High) with dry-run support +- Durable audit logs with undo capability for all fixes +- LLM-powered fixes for merge, split, enrich, and contradiction resolution +- Dashboard widget showing health score and actionable issues +- Post-bulk-operation hooks to prevent score degradation +- Health history trending for monitoring KB quality over time + +## Current Architecture / Evidence + +### Existing Infrastructure to Build On + +**Endpoints already in place:** +- `GET /api/embeddings/status` — returns pipeline counts (pending, processing, complete, failed) +- `GET /api/embeddings/status/all` — returns per-atom pipeline status +- `GET /api/wiki/suggestions` — returns tags eligible for wiki articles +- `GET /api/wiki/{tag_id}/status` — returns `new_atoms_available` for a specific wiki +- `GET /api/atoms/{id}/similar` — returns similar atoms above a threshold +- `GET /api/graph/edges` — returns semantic edges with similarity scores +- `POST /api/embeddings/process-pending` — processes pending embeddings +- `POST /api/embeddings/retry-failed` — retries failed embeddings +- `POST /api/graph/rebuild` — rebuilds semantic edge graph +- `POST /api/wiki/{tag_id}/generate` — generates wiki for a tag +- `POST /api/utils/compact-tags` — removes orphan tags + +**Code patterns to follow:** + +1. **Route handlers** (`crates/atomic-server/src/routes/`): + - Follow pattern: `#[utoipa::path(...)] pub async fn handler(db: Db, ...) -> HttpResponse` + - Use `ok_or_error()` for simple responses, `crate::error::error_response(e)` for errors + - Register new routes in `routes/mod.rs` + +2. **LLM integration** (existing patterns from `tagging`, `wiki`, `chat`): + - Call LLM provider trait methods from atomic-core + - Send structured prompts with available context + - Handle streaming vs. completion-based responses + - Parse JSON-structured outputs when needed + +3. **Event callbacks** (pattern from `embedding.rs`): + ```rust + let on_event = embedding_event_callback(state.event_tx.clone()); + db.0.some_operation(on_event).await + ``` + +4. **Database schema** (`crates/atomic-core/src/db.rs`): + - Use rusqlite for SQLite; all new tables need both SQLite and Postgres implementations + - Existing migrations in `migrations/` directory (SQLite) and `crates/atomic-core/src/storage/postgres/migrations/` + - Per-DB data lives in the data database; shared state lives in registry.db + +5. **Settings/Configuration** (`crates/atomic-core/src/settings.rs`): + - Global settings stored in registry.db via `get_setting()` / `set_setting()` + - Per-DB settings can override via `storage.get_all_settings_sync()` / `set_setting_sync()` + - LLM prompt templates stored as settings + +### Data Model + +**Atoms** (existing): +- `atoms.id`, `atoms.content`, `atoms.source_url`, `atoms.embedding_status`, `atoms.tagging_status`, `atoms.embedding_error`, `atoms.tagging_error` + +**Tags** (existing): +- `tags.id`, `tags.name`, `tags.parent_id`, `tags.is_autotag_target` + +**Semantic edges** (existing): +- `semantic_edges.atom_a_id`, `semantic_edges.atom_b_id`, `semantic_edges.similarity` + +**Wiki articles** (existing): +- `wiki_articles.tag_id`, `wiki_articles.content`, `wiki_articles.last_generated_at` + +**Conversations** (existing): +- `conversations.id`, `conversations.tag_filter` + +## Recommended Approach + +### Architecture Decision: Modular Health System + +The health system will be organized as a new module `atomic-core::health` with submodules: + +- `health/mod.rs` — orchestration, score calculation, overall health computation +- `health/checks.rs` — individual check implementations (deterministic SQL queries) +- `health/fixes.rs` — deterministic auto-fix logic (no LLM needed) +- `health/llm_fixes.rs` — LLM-powered fix logic (merge, split, enrich, contradict) +- `health/audit.rs` — fix logging and undo capability + +**Why this structure:** +1. Separates concerns: query logic, fix logic, LLM logic, audit logic are independent +2. Makes it easy to test individual checks in isolation +3. Allows future extensions without touching core module +4. Follows existing pattern: embedding, wiki, chat are all separate modules with clear responsibilities + +### Tiered Fix Safety Model + +| Tier | Risk | Confirmation | Examples | +|------|------|-------------|----------| +| **Safe** | Zero risk | Auto-run, no confirmation | Retry failed embeddings, rebuild graph, process pending tagging | +| **Low** | Minimal risk, reversible | Auto-run with undo log | Delete orphan tags, generate missing wikis | +| **Medium** | Changes content | Dry-run first, confirm | Add headings, merge exact-source duplicates | +| **High** | Deletes or rewrites | Always require review | Merge similar atoms, split long atoms, delete stubs, resolve contradictions | + +**Endpoint semantics:** +- `POST /api/health/fix { mode: "auto" }` → runs Safe + Low tier +- `POST /api/health/fix { mode: "auto", include_medium: true }` → Safe + Low + Medium +- `POST /api/health/fix { mode: "dry_run", ... }` → report what would be fixed without executing + +### LLM Prompt Templates + +Store as settings (like `tagging_prompt`, `chat_prompt` already do): +- `health.merge_duplicates_prompt` — for merging high-similarity atoms +- `health.contradiction_detection_prompt` — for finding conflicting info +- `health.split_long_atom_prompt` — for splitting >15K character atoms +- `health.enrich_stub_atom_prompt` — for expanding <100 char atoms +- `health.add_structure_prompt` — for adding headings to unstructured content +- `health.tag_reorganize_prompt` — for suggesting tag hierarchy fixes + +All with sensible defaults so zero setup is needed. + +## Implementation Plan + +### Phase I: Core Infrastructure (Foundation) + +**1.1 Database schema additions** + +Add to SQLite migration (new file `migrations/XXX_create_health_tables.sql`): + +```sql +-- Health reports — historical snapshots of KB state +CREATE TABLE health_reports ( + id TEXT PRIMARY KEY, + computed_at TEXT NOT NULL, + overall_score INTEGER NOT NULL, + check_scores TEXT NOT NULL, -- JSON: {"duplicates": 80, ...} + atom_count INTEGER NOT NULL, + auto_fixes_applied INTEGER DEFAULT 0, + report_json TEXT NOT NULL -- Full report for detail view +); +CREATE INDEX idx_health_reports_computed ON health_reports(computed_at DESC); + +-- Audit log of all auto-fix actions (for undo) +CREATE TABLE health_fix_log ( + id TEXT PRIMARY KEY, + check_name TEXT NOT NULL, + action TEXT NOT NULL, + tier TEXT NOT NULL, -- "safe", "low", "medium", "high" + atom_ids TEXT, -- JSON array of affected atom IDs + tag_ids TEXT, -- JSON array of affected tag IDs + before_state TEXT, -- JSON snapshot (for undo) + after_state TEXT, -- JSON snapshot (for verification) + llm_prompt TEXT, -- Prompt sent to LLM (if applicable) + llm_response TEXT, -- Raw LLM response (for audit) + executed_at TEXT NOT NULL, + undone_at TEXT -- NULL unless undone +); +CREATE INDEX idx_health_fix_log_executed ON health_fix_log(executed_at DESC); +CREATE INDEX idx_health_fix_log_check ON health_fix_log(check_name); +``` + +Add equivalent Postgres migration to `crates/atomic-core/src/storage/postgres/migrations/`. + +**1.2 Models for health domain** + +New file: `crates/atomic-core/src/health/models.rs` + +```rust +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthReport { + pub overall_score: u32, // 0-100 + pub overall_status: HealthStatus, // healthy, needs_attention, degraded, unhealthy + pub computed_at: String, + pub atom_count: i32, + pub checks: HashMap, + pub auto_fixable: i32, // count of auto-fixable issues + pub requires_review: i32, // count of issues needing human review +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum HealthStatus { + #[serde(rename = "healthy")] + Healthy, + #[serde(rename = "needs_attention")] + NeedsAttention, + #[serde(rename = "degraded")] + Degraded, + #[serde(rename = "unhealthy")] + Unhealthy, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthCheck { + pub status: String, // "ok", "warning", "error" + pub score: u32, // 0-100 contribution to overall + // Check-specific fields vary by check type + #[serde(flatten)] + pub data: serde_json::Value, // Dynamic fields per check +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FixAction { + pub check: String, + pub action: String, // "deleted_tags", "merged_atoms", etc. + pub count: i32, + pub details: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FixResponse { + pub mode: String, // "auto", "dry_run" + pub actions_taken: Vec, + pub skipped: Vec, + pub new_score: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkippedFix { + pub check: String, + pub reason: String, + pub count: i32, +} +``` + +**1.3 Storage trait additions** + +Add to `crates/atomic-core/src/storage/traits.rs`: + +```rust +#[async_trait] +pub trait HealthStore: Send + Sync { + // Health report storage + async fn store_health_report(&self, report: &HealthReport) -> StorageResult<()>; + async fn get_latest_health_report(&self) -> StorageResult>; + async fn get_health_reports_since(&self, since: &str) -> StorageResult>; + + // Fix audit log + async fn log_fix_action(&self, fix_log: &HealthFixLog) -> StorageResult<()>; + async fn get_fix_log(&self, fix_id: &str) -> StorageResult>; + async fn get_recent_fixes(&self, limit: i32) -> StorageResult>; + async fn undo_fix(&self, fix_id: &str) -> StorageResult<()>; +} + +#[derive(Debug, Clone)] +pub struct HealthFixLog { + pub id: String, + pub check_name: String, + pub action: String, + pub tier: String, // "safe", "low", "medium", "high" + pub atom_ids: Option>, + pub tag_ids: Option>, + pub before_state: String, // JSON + pub after_state: String, // JSON + pub llm_prompt: Option, + pub llm_response: Option, + pub executed_at: String, + pub undone_at: Option, +} +``` + +Implement for both `SqliteStorage` and `PostgresStorage` (mostly straightforward INSERT/SELECT statements). + +### Phase II: Health Checks (11 distinct checks) + +**2.1 `health/checks.rs` — Deterministic checks** + +Each check returns a `HealthCheck` struct with standardized fields. Run synchronously over database snapshots. + +```rust +pub async fn check_embedding_coverage(storage: &dyn Storage) -> HealthCheck { + // Count: pending, processing, complete, failed + // Score: (complete / total) * 100, cap at 50 if any failed + // Return distribution + status +} + +pub async fn check_source_uniqueness(storage: &dyn Storage) -> HealthCheck { + // Find source_urls appearing on multiple atoms + // Score: 100 if 0 duplicates, subtract 15 per duplicate +} + +pub async fn check_orphan_tags(storage: &dyn Storage) -> HealthCheck { + // Find tags with 0 atoms and no children, excluding autotag targets + // Score: 100 if 0, subtract 2 per orphan +} + +pub async fn check_tagging_coverage(storage: &dyn Storage) -> HealthCheck { + // Count atoms: tagged, untagged, failed, skipped + // Score: (tagged / total) * 100 +} + +pub async fn check_semantic_graph_freshness(storage: &dyn Storage) -> HealthCheck { + // Compare last rebuild time vs newest atom + // Score: 100 if recent, subtract 2 per atom since rebuild +} + +pub async fn check_wiki_coverage(storage: &dyn Storage) -> HealthCheck { + // Find tags with >= 5 atoms that could have wikis + // Count: with_wiki, without_wiki, stale + // Score: (with_wiki / eligible) * 70 + (non_stale / with_wiki) * 30 +} + +pub async fn check_content_quality(storage: &dyn Storage) -> HealthCheck { + // Flag atoms: very_short (<100 chars), very_long (>15K chars), + // no_headings, no_source + // Score: 85 base, minus 5 for each category with issues +} + +pub async fn check_tag_health(storage: &dyn Storage) -> HealthCheck { + // Find: single-atom tags, rootless tags, similar-named tags + // Score: points deducted per category +} +``` + +**2.2 LLM-powered checks** + +To be added in Phase III. For now, implement skeleton methods that return placeholder scores. + +```rust +pub async fn check_duplicate_detection( + storage: &dyn Storage, + _providers: &dyn LlmProvider, +) -> HealthCheck { + // Find atom pairs with similarity 0.92-1.0 from different sources + // Mark as requires_review: true + // Don't execute fixes yet +} + +pub async fn check_contradiction_detection( + storage: &dyn Storage, + _providers: &dyn LlmProvider, +) -> HealthCheck { + // Find atoms with similarity 0.75-0.92 (same topic, different content) + // Use LLM to confirm contradiction (Phase III) + // Mark as requires_review: true +} +``` + +**2.3 Score aggregation** + +```rust +pub fn compute_overall_score(checks: &HashMap) -> u32 { + let weights = [ + ("duplicate_detection", 0.15), + ("embedding_coverage", 0.15), + ("source_uniqueness", 0.10), + ("tagging_coverage", 0.10), + ("wiki_coverage", 0.10), + ("semantic_graph_freshness", 0.10), + ("content_quality", 0.05), + ("orphan_tags", 0.05), + ("tag_health", 0.05), + ("contradiction_detection", 0.05), + ("tagging_coverage", 0.10), // untagged atoms + ]; + + let mut total = 0.0; + for (check_name, weight) in weights.iter() { + if let Some(check) = checks.get(*check_name) { + total += (check.score as f64) * weight; + } + } + total as u32 +} + +pub fn status_for_score(score: u32) -> HealthStatus { + match score { + 90..=100 => HealthStatus::Healthy, + 70..=89 => HealthStatus::NeedsAttention, + 50..=69 => HealthStatus::Degraded, + _ => HealthStatus::Unhealthy, + } +} +``` + +### Phase III: Auto-Fix Implementation + +**3.1 `health/fixes.rs` — Deterministic fixes (no LLM)** + +```rust +pub async fn fix_embedding_coverage( + storage: &dyn Storage, + core: &AtomicCore, +) -> Result { + // Call core.process_pending_embeddings() + // Call core.retry_failed_embeddings() + // Return action: { check: "embedding_coverage", action: "retry_failed_and_process_pending", count: X } +} + +pub async fn fix_orphan_tags(storage: &dyn Storage) -> Result { + // Find and delete orphan tags (not autotag targets) + // Log to health_fix_log with before_state + // Return action: { check: "orphan_tags", action: "deleted_tags", count: X, details: [tag_names] } +} + +pub async fn fix_source_uniqueness(storage: &dyn Storage) -> Result { + // For exact source_url duplicates: + // - Keep newest (by created_at) + // - Merge tags from deleted atoms onto the kept one + // - Delete older atoms + // Log all deletes with before_state + // Return action with count +} + +pub async fn fix_semantic_graph_freshness( + storage: &dyn Storage, + core: &AtomicCore, +) -> Result { + // Call core.rebuild_semantic_edges() + // Return action: { check: "semantic_graph_freshness", action: "queued_rebuild", ... } +} +``` + +**3.2 `health/llm_fixes.rs` — LLM-powered fixes** + +These will call LLM provider to make judgment calls. Implemented in Phase III. + +```rust +pub async fn fix_tagging_coverage_with_llm( + storage: &dyn Storage, + core: &AtomicCore, + llm: &dyn LlmProvider, + untagged_atoms: &[AtomWithTags], +) -> Result { + // For each untagged atom, call LLM with modified prompt that forces >= 1 tag + // Re-run tagging with forced assignment + // Return action with count of newly tagged atoms +} + +pub async fn merge_duplicate_atoms_with_llm( + storage: &dyn Storage, + atom_a: &AtomWithTags, + atom_b: &AtomWithTags, + llm: &dyn LlmProvider, +) -> Result { + // Call LLM to synthesize both atoms + // Update newer atom with merged content + // Delete older atom + // Re-embed and re-tag merged atom + // Log to health_fix_log +} + +pub async fn split_long_atom_with_llm( + storage: &dyn Storage, + atom: &AtomWithTags, + llm: &dyn LlmProvider, +) -> Result { + // Call LLM to analyze if atom should be split + // If yes: create new atoms for each section + // If no: add structure (headings) instead + // Log all creates/deletes +} +``` + +**3.3 `health/audit.rs` — Undo capability** + +```rust +pub async fn undo_fix( + storage: &dyn Storage, + fix_id: &str, +) -> Result<(), AtomicCoreError> { + // Fetch fix_log entry via fix_id + // Parse before_state JSON + // For each affected atom_id: restore from before_state + // For each affected tag_id: restore from before_state + // Mark fix_log.undone_at = now() + // Return any created entries to allow cascading undo +} +``` + +### Phase IV: API Endpoints + +**4.1 `routes/health.rs` — New route handlers** + +```rust +#[utoipa::path( + get, + path = "/api/health/knowledge", + responses( + (status = 200, description = "Health report", body = HealthReport) + ), + tag = "health" +)] +pub async fn get_health_knowledge( + state: web::Data, + db: Db, +) -> HttpResponse { + // Compute all checks + // Calculate overall score + // Store report + // Return JSON +} + +#[utoipa::path( + post, + path = "/api/health/fix", + request_body = FixRequest, + responses( + (status = 200, description = "Fix results", body = FixResponse) + ), + tag = "health" +)] +pub async fn run_health_fix( + state: web::Data, + db: Db, + body: web::Json, +) -> HttpResponse { + // Determine which fixes to run based on mode and include_medium + // Execute fixes in tier order (Safe → Low → Medium) + // Collect FixAction results + // Recompute health score + // Return FixResponse +} + +#[utoipa::path( + post, + path = "/api/health/fix/{check}/{item_id}", + params( + ("check" = String, Path), + ("item_id" = String, Path) + ), + request_body = ManualFixRequest, + responses( + (status = 200, description = "Fix applied") + ), + tag = "health" +)] +pub async fn apply_manual_fix( + state: web::Data, + db: Db, + path: web::Path<(String, String)>, + body: web::Json, +) -> HttpResponse { + // Route to specific fix handler based on check name + // Execute fix with user parameters + // Log to health_fix_log + // Return success +} + +#[utoipa::path( + post, + path = "/api/health/undo/{fix_id}", + params(("fix_id" = String, Path)), + responses( + (status = 200, description = "Fix undone") + ), + tag = "health" +)] +pub async fn undo_health_fix(db: Db, path: web::Path) -> HttpResponse { + let fix_id = path.into_inner(); + ok_or_error(db.0.undo_fix(&fix_id).await) +} +``` + +**4.2 Request/response types** + +```rust +#[derive(Deserialize, ToSchema)] +pub struct FixRequest { + pub checks: Option>, // If None, run all + pub mode: String, // "auto", "dry_run" + pub include_medium: Option, // Default false +} + +#[derive(Deserialize, ToSchema)] +pub struct ManualFixRequest { + pub action: String, // "merge", "keep_both", "delete_one", etc. + pub keep_atom_id: Option, + pub merge_strategy: Option, // "keep_newer", "keep_longer", "llm" +} +``` + +**4.3 Register in `routes/mod.rs`** + +```rust +pub mod health; +// ... +// In web::scope("/api"): +.service( + web::scope("/health") + .route("/knowledge", web::get().to(health::get_health_knowledge)) + .route("/fix", web::post().to(health::run_health_fix)) + .route("/fix/{check}/{item_id}", web::post().to(health::apply_manual_fix)) + .route("/undo/{fix_id}", web::post().to(health::undo_health_fix)) +) +``` + +### Phase V: Integration + +**5.1 Post-bulk-operation hooks** + +Add to import handlers (`import/obsidian.rs`, `ingest/fetch.rs`), bulk atom creation, tag deletion: + +```rust +async fn post_bulk_operation_hook(core: &AtomicCore) { + // Run health check + let report = compute_health(&core).await.ok(); + + if let Some(r) = report { + if r.overall_score < 95 { + // Auto-fix safe issues + let _ = core.run_health_fix(FixMode::Safe).await; + + // Recompute and cache + let updated_report = compute_health(&core).await.ok(); + // ... store for later use + } + } +} +``` + +**5.2 Scheduled nightly maintenance** + +Add to `crates/atomic-core/src/scheduler/mod.rs`: + +```rust +pub async fn health_maintenance(core: &AtomicCore) { + // Run full health check + let report = compute_health(&core).await.ok(); + + // Auto-fix safe + low tier issues + if let Some(_) = report { + let _ = core.run_health_fix(FixMode::Low).await; + } + + // Store report for history + // If score dropped, include in next briefing +} +``` + +Add task config to settings: + +```rust +("task.health_maintenance.enabled", "true"), +("task.health_maintenance.interval_hours", "24"), +("task.health_maintenance.auto_fix_tier", "low"), +``` + +**5.3 Briefing integration** + +Extend `crates/atomic-core/src/briefing/mod.rs` to include health findings: + +```rust +fn format_health_section(report: &HealthReport) -> String { + // Only include if score < 85 or contradictions found + // Format: + // ## Knowledge Health + // Your KB score is X/100 (status). + // Auto-fixed: [list] + // Needs review: [list] +} +``` + +**5.4 Settings/configuration** + +Add LLM prompt templates to `DEFAULT_SETTINGS` in `settings.rs`: + +```rust +("health.merge_duplicates_prompt", "..."), +("health.contradiction_detection_prompt", "..."), +("health.split_long_atom_prompt", "..."), +("health.enrich_stub_atom_prompt", "..."), +("health.add_structure_prompt", "..."), +("health.tag_reorganize_prompt", "..."), +``` + +### Phase VI: Frontend (Dashboard Widget) + +**6.1 Health panel component** + +New file: `src/components/dashboard/HealthPanel.tsx` + +```tsx +export function HealthPanel() { + const [report, setReport] = useState(null); + const [loading, setLoading] = useState(false); + + useEffect(() => { + fetchHealth(); + }, []); + + async function fetchHealth() { + const resp = await getTransport().invoke('get_health_knowledge', {}); + setReport(resp as HealthReport); + } + + async function autoFix() { + setLoading(true); + const resp = await getTransport().invoke('run_health_fix', { + mode: 'auto' + }); + await fetchHealth(); + setLoading(false); + } + + if (!report) return null; + + return ( +
+
+

Knowledge Health

+ {report.overall_score}/100 +
+ +
+ {Object.entries(report.checks).map(([name, check]) => ( + + ))} +
+ +
+ + +
+ + {report.auto_fixable > 0 && ( +

{report.auto_fixable} issues can be automatically fixed

+ )} +
+ ); +} +``` + +**6.2 Add to dashboard registry** + +Register `HealthPanel` in `src/components/dashboard/registry.ts` to make it available as a dashboard widget. + +**6.3 Review queue page** + +New page: `src/routes/health/review/+page.svelte` (if using SvelteKit) or equivalent route for showing duplicates, contradictions, stubs that need human review. + +## Files / Components To Change + +### New Files + +**Backend (Rust):** +- `crates/atomic-core/src/health/mod.rs` — module root, orchestration +- `crates/atomic-core/src/health/checks.rs` — 11 health checks (deterministic) +- `crates/atomic-core/src/health/fixes.rs` — auto-fix logic (deterministic) +- `crates/atomic-core/src/health/llm_fixes.rs` — LLM-powered fixes (Phase III) +- `crates/atomic-core/src/health/audit.rs` — fix logging, undo +- `crates/atomic-core/src/health/models.rs` — health domain types +- `crates/atomic-server/src/routes/health.rs` — endpoint handlers +- `migrations/XXX_create_health_tables.sql` — SQLite schema +- `crates/atomic-core/src/storage/postgres/migrations/XXX_create_health_tables.sql` — Postgres schema + +**Frontend (TypeScript/React):** +- `src/components/dashboard/HealthPanel.tsx` — main dashboard widget +- `src/routes/health/+page.tsx` or `.svelte` — detailed health page +- `src/routes/health/review/+page.tsx` — high-tier fix review queue +- `src/lib/api/health.ts` — health API client (type-safe wrapper) + +### Modified Files + +**Backend (Rust):** +- `crates/atomic-core/src/lib.rs` — add `pub mod health` +- `crates/atomic-core/src/storage/traits.rs` — add `HealthStore` trait +- `crates/atomic-core/src/storage/sqlite/mod.rs` — implement `HealthStore` +- `crates/atomic-core/src/storage/postgres/mod.rs` — implement `HealthStore` +- `crates/atomic-core/src/storage/sqlite/settings.rs` — add health prompt defaults +- `crates/atomic-server/src/routes/mod.rs` — register health routes +- `crates/atomic-core/src/scheduler/mod.rs` — add health_maintenance task +- `crates/atomic-core/src/briefing/mod.rs` — include health findings +- `crates/atomic-core/src/settings.rs` — add health prompt templates to `DEFAULT_SETTINGS` +- Bulk import handlers — add post-operation hooks +- `crates/atomic-server/src/state.rs` — may need event channel registration + +**Frontend (TypeScript):** +- `src/components/dashboard/registry.ts` — add HealthPanel widget +- `src/lib/api.ts` — add health API methods +- `src/stores/ui.ts` — may need state for health reports + +## Data Flow / Interfaces + +### Health Check Flow + +``` +GET /api/health/knowledge + ↓ +compute_health(core) + ├─ check_embedding_coverage() → HealthCheck { score, status, data } + ├─ check_source_uniqueness() → HealthCheck + ├─ check_orphan_tags() → HealthCheck + ├─ check_tagging_coverage() → HealthCheck + ├─ check_semantic_graph_freshness() → HealthCheck + ├─ check_wiki_coverage() → HealthCheck + ├─ check_content_quality() → HealthCheck + ├─ check_tag_health() → HealthCheck + ├─ check_duplicate_detection() → HealthCheck { requires_review: true } + ├─ check_contradiction_detection() → HealthCheck { requires_review: true } + └─ check_tagging_coverage() → HealthCheck + ↓ +aggregate_scores(checks) → overall_score: u32 + ↓ +HealthReport { overall_score, checks, status, auto_fixable, requires_review } + ↓ +store_health_report(report) + ↓ +HTTP 200 → HealthReport (JSON) +``` + +### Fix Flow + +``` +POST /api/health/fix { mode: "auto", include_medium?: bool } + ↓ +determine_fix_tiers(mode, include_medium) + ↓ +for each fix in order: + - tier < "medium" or include_medium → run it + - capture before_state + - execute fix + - capture after_state + - log to health_fix_log with undo info + ↓ +recompute_health() + ↓ +FixResponse { actions_taken, skipped, new_score } + ↓ +HTTP 200 → FixResponse (JSON) +``` + +### Undo Flow + +``` +POST /api/health/undo/{fix_id} + ↓ +fetch_fix_log(fix_id) + ↓ +parse_before_state(log.before_state) + ↓ +for each atom_id: + restore_atom(atom_id, before_snapshot) +for each tag_id: + restore_tag(tag_id, before_snapshot) + ↓ +set_fix_log.undone_at = now() + ↓ +HTTP 200 → { status: "ok" } +``` + +## Configuration / Secrets / Deployment Notes + +**No additional secrets needed.** Health system uses existing LLM providers (OpenRouter, Ollama, OpenAI-compatible). + +**Settings added to `DEFAULT_SETTINGS`:** +```rust +("task.health_maintenance.enabled", "true"), +("task.health_maintenance.interval_hours", "24"), +("task.health_maintenance.auto_fix_tier", "low"), +("health.merge_duplicates_prompt", ""), +("health.contradiction_detection_prompt", ""), +("health.split_long_atom_prompt", ""), +("health.enrich_stub_atom_prompt", ""), +("health.add_structure_prompt", ""), +("health.tag_reorganize_prompt", ""), +``` + +All with sensible defaults that work immediately on fresh install. + +**Environment notes:** +- Health checks complete in < 2s for 500 atoms (single query per check, no N+1) +- LLM-powered fixes (Phase III) are rate-limited: max 3 wiki generations per fix run +- Contradiction detection may be async for large KBs (>1000 atoms with 0.75+ similarity pairs) +- Health reports stored indefinitely; UI may paginate to last 90 days + +## Testing / Validation Plan + +### Unit Tests + +Create `crates/atomic-core/tests/health_tests.rs`: + +**Test 1: Clean database scores 100** +```rust +#[tokio::test] +async fn health_clean_db_is_100() { + let db = setup_test_db().await; + let report = compute_health(&db).await.unwrap(); + assert_eq!(report.overall_score, 100); +} +``` + +**Test 2: Orphan tags detected and fixable** +```rust +#[tokio::test] +async fn orphan_tags_detected_and_fixed() { + let db = setup_test_db().await; + create_orphan_tag(&db, "orphan").await; + + let report_before = compute_health(&db).await.unwrap(); + assert!(report_before.overall_score < 100); + + run_health_fix(&db, FixMode::Safe).await.unwrap(); + + let report_after = compute_health(&db).await.unwrap(); + assert_eq!(report_after.overall_score, 100); +} +``` + +**Test 3: Failed embeddings cause score drop** +```rust +#[tokio::test] +async fn failed_embeddings_drop_score() { + let db = setup_test_db().await; + create_atom_with_status(&db, "test", "embedding", "failed", None).await; + + let report = compute_health(&db).await.unwrap(); + assert!(report.overall_score <= 50); + assert!(report.checks["embedding_coverage"].data["failed"] > 0); +} +``` + +**Test 4: Fix audit log stores before/after state** +```rust +#[tokio::test] +async fn fix_logged_with_undo_capability() { + let db = setup_test_db().await; + create_orphan_tag(&db, "orphan").await; + + let fix_response = run_health_fix(&db, FixMode::Safe).await.unwrap(); + assert_eq!(fix_response.actions_taken.len(), 1); + + let logs = db.get_recent_fixes(10).await.unwrap(); + assert_eq!(logs[0].check_name, "orphan_tags"); + assert!(logs[0].before_state.contains("orphan")); +} +``` + +**Test 5: Undo restores pre-fix state** +```rust +#[tokio::test] +async fn undo_fix_restores_state() { + let db = setup_test_db().await; + create_orphan_tag(&db, "orphan").await; + + let before_count = db.count_tags().await.unwrap(); + + let fix_response = run_health_fix(&db, FixMode::Safe).await.unwrap(); + let fix_id = fix_response.actions_taken[0].id.clone(); + + let after_count = db.count_tags().await.unwrap(); + assert_eq!(after_count, before_count - 1); + + db.undo_fix(&fix_id).await.unwrap(); + + let restored_count = db.count_tags().await.unwrap(); + assert_eq!(restored_count, before_count); +} +``` + +**Test 6: Duplicate detection finds high-similarity atoms** +```rust +#[tokio::test] +async fn duplicates_detected() { + let db = setup_test_db().await; + create_atom_pair_with_similarity(&db, 0.95, "obs://vault1/file", "obs://vault2/file").await; + + let report = compute_health(&db).await.unwrap(); + let dup_check = &report.checks["duplicate_detection"]; + assert!(dup_check.data["count"] > 0); + assert_eq!(dup_check.status, "warning"); +} +``` + +**Test 7: Contradictions flagged for review** +```rust +#[tokio::test] +async fn contradictions_detected() { + let db = setup_test_db().await; + // Create two atoms with same embedding, contradictory content + create_contradictory_atoms(&db, 0.82).await; + + let report = compute_health(&db).await.unwrap(); + let contra_check = &report.checks["contradiction_detection"]; + assert!(contra_check.data["count"] > 0); + assert_eq!(contra_check.status, "warning"); +} +``` + +**Test 8: Dry run doesn't apply fixes** +```rust +#[tokio::test] +async fn dry_run_mode_doesnt_fix() { + let db = setup_test_db().await; + create_orphan_tag(&db, "orphan").await; + + let before_count = db.count_tags().await.unwrap(); + + let response = run_health_fix_dry_run(&db, FixMode::Safe).await.unwrap(); + assert_eq!(response.mode, "dry_run"); + assert!(response.actions_taken.len() > 0); + + let after_count = db.count_tags().await.unwrap(); + assert_eq!(after_count, before_count); // Nothing actually deleted +} +``` + +**Test 9: Score weighted correctly** +```rust +#[tokio::test] +async fn overall_score_weighted() { + let db = setup_test_db().await; + + // Set up state with: + // - embedding_coverage at 50 (worth 15%) + // - all others at 100 + // Expected: (50 * 0.15) + (100 * 0.85) = 92.5 → 92 + + let report = compute_health(&db).await.unwrap(); + assert_eq!(report.overall_score, 92); +} +``` + +**Test 10: Very short atoms flagged for review** +```rust +#[tokio::test] +async fn very_short_atoms_flagged() { + let db = setup_test_db().await; + create_atom(&db, "hi").await; // 2 chars + + let report = compute_health(&db).await.unwrap(); + let quality_check = &report.checks["content_quality"]; + assert!(quality_check.data["very_short"]["count"] > 0); +} +``` + +### Integration Tests + +Create `crates/atomic-server/tests/health_api_tests.rs`: + +**Test 1: GET /api/health/knowledge returns valid report** +```rust +#[actix_web::test] +async fn get_health_knowledge_endpoint() { + let app = create_test_app().await; + let resp = test::call_service( + &app, + test::TestRequest::get().uri("/api/health/knowledge").to_request(), + ).await; + + assert_eq!(resp.status(), http::StatusCode::OK); + let body = test::read_body(resp).await; + let report: HealthReport = serde_json::from_slice(&body).unwrap(); + assert!(report.overall_score >= 0 && report.overall_score <= 100); +} +``` + +**Test 2: POST /api/health/fix auto mode fixes safe issues** +```rust +#[actix_web::test] +async fn post_health_fix_auto_mode() { + let app = create_test_app_with_orphan_tag().await; + + let resp = test::call_service( + &app, + test::TestRequest::post() + .uri("/api/health/fix") + .set_json(json!({ "mode": "auto" })) + .to_request(), + ).await; + + assert_eq!(resp.status(), http::StatusCode::OK); + let body = test::read_body(resp).await; + let fix_resp: FixResponse = serde_json::from_slice(&body).unwrap(); + assert_eq!(fix_resp.mode, "auto"); + assert!(fix_resp.actions_taken.len() > 0); +} +``` + +**Test 3: Dry run doesn't persist changes** +```rust +#[actix_web::test] +async fn health_fix_dry_run_mode() { + let app = create_test_app_with_orphan_tag().await; + + // Run in dry_run mode + test::call_service( + &app, + test::TestRequest::post() + .uri("/api/health/fix") + .set_json(json!({ "mode": "dry_run" })) + .to_request(), + ).await; + + // Check that orphan still exists + let health_resp = test::call_service( + &app, + test::TestRequest::get().uri("/api/health/knowledge").to_request(), + ).await; + let health: HealthReport = serde_json::from_slice(&test::read_body(health_resp).await).unwrap(); + assert!(health.checks["orphan_tags"].data["count"] > 0); +} +``` + +### Manual Verification Steps + +1. **Fresh database should score 100:** + - Start server with empty database + - GET /api/health/knowledge + - Verify overall_score == 100 + - Verify all check statuses are "ok" + +2. **Create pathological state and verify detection:** + - Manually insert orphan tag, failed embedding, duplicate atoms + - GET /api/health/knowledge + - Verify issues are detected with correct counts + +3. **Auto-fix safe issues:** + - POST /api/health/fix { mode: "auto" } + - Verify fixes applied + - GET /api/health/knowledge again + - Verify score improved + +4. **Dashboard widget renders:** + - Open dashboard in browser + - Verify HealthPanel displays + - Verify score and check bars render + - Click "Fix Safe Issues" button + - Verify panel updates after fix + +5. **Post-import health check runs:** + - Import Obsidian vault with 100+ notes + - In background/logs, verify health_maintenance task ran + - GET /api/health/knowledge + - Verify no degradation from pre-import state + +## Risks, Assumptions, and Open Questions + +### Risks + +1. **LLM cost for large KBs** — Contradiction detection on 1000+ atoms with high similarity pairs could be expensive. Mitigation: Rate-limit and make async with background job queue. + +2. **False positives on contradictions** — LLM may incorrectly flag atoms as contradictory when they're actually complementary. Mitigation: Always mark as requires_review, never auto-fix without user confirmation. + +3. **Merge strategy decisions** — When merging atoms, choosing which source URL to keep is lossy. Mitigation: Store secondary URL in merged atom's body as "Sources:" section, add "last edited by source" note. + +4. **Undo state explosion** — Fix logs could grow large if the system auto-fixes frequently. Mitigation: Prune fix_log older than 90 days, keep health_reports for trending only. + +5. **Multi-database consistency** — If two databases are running, health checks must be per-DB isolated. Mitigation: All health queries scoped to current db_id, scheduled tasks fan out per database. + +### Assumptions + +1. **Similarity threshold stable** — Assume 0.92 for duplicates, 0.75-0.92 for contradictions is reasonable for 1536-dim embeddings. Will need tuning with real data. + +2. **Atomic content immutability after fix** — Assume that once an atom is fixed (merged, split, enriched), we don't need to re-run full pipelines. Partial: we will re-embed merged atoms. + +3. **LLM availability** — Assume LLM provider is available for Phase III fixes. Fallback: if LLM is down, fixes marked as "awaiting_llm" and can retry later. + +4. **Browser/UI responsiveness** — Assume dashboard widget updates within 1s after fix. Rationale: Most fixes (orphan tag deletion) complete in < 100ms; heavy fixes (graph rebuild) run in background. + +### Open Questions + +1. **Should health checks be synchronous or async?** Current plan: all synchronous (single batch query per check). Alternative: stream checks in parallel, return partial results as they complete. Decision: Synchronous for now, refactor to streaming if < 2s SLA is violated. + +2. **What's the UX for the contradiction review queue?** Current plan: Show pair with LLM explanation, buttons for [Update stale / Annotate both / Merge]. Alternative: Simpler UI with just [Merge] / [Keep both]. Decision: Full UX deferred to Phase III after prototyping. + +3. **Should merged atoms preserve history as a separate `atom_history` table?** Current plan: No; merge is lossy but logged. Alternative: Keep both atoms, mark old one as superseded. Decision: No history table for now; undo via fix_log. + +4. **Should wikis be auto-regenerated after atom merges?** Current plan: No; wiki remains stale until user triggers. Alternative: Recompute all affected wikis after merge. Decision: No auto-regen; briefs will surface when new atoms accumulate. + +5. **How to handle source_url conflicts during merge?** Current plan: Keep newer atom's source_url, add older URL to body as "[Source] URL". Alternative: Combine into comma-separated list. Decision: Keep current approach; source_url field is meant to be primary. + +## LOE / Effort Estimate + +Broken down by phase: + +| Phase | Component | LOE | Notes | +|-------|-----------|-----|-------| +| I | Schema + Models + Storage traits | 3 days | Straightforward schema, implement for both SQLite & Postgres | +| II | 8 deterministic checks | 4 days | Mostly SQL queries + score aggregation | +| II | 2 LLM-powered check stubs | 1 day | Skeleton methods returning placeholder scores | +| III | Deterministic fixes (orphan tags, graph, source dedup) | 3 days | Mostly DELETE/UPDATE statements + before/after capture | +| III | Audit logging + undo capability | 2 days | Snapshot before_state, restore on undo | +| IV | 4 API endpoints + request types | 2 days | Follow existing route patterns | +| V | Integration + hooks + scheduler | 2 days | Add post-operation callbacks, register scheduled task | +| VI | Frontend dashboard widget | 2 days | Follows existing widget pattern | +| Test | Unit tests (10 test cases) | 2 days | Mostly test setup + assertions | +| Test | Integration tests (3-4 cases) | 1 day | Actix test harness + fixtures | + +**Total Phase I-II (Foundation + Checks):** 8 days +**Total Phase III (Deterministic Fixes):** 5 days +**Total Phase IV-V (API + Integration):** 4 days +**Total Phase VI (Frontend):** 2 days +**Total Testing:** 3 days + +**Grand Total:** 22 days (3+ weeks) + +**Phase I-II could ship independently** (read-only health checks + stub fixes), allowing early user feedback on scoring and check accuracy before investing in Phase III auto-fixes. + +## Decision Log + +1. **Module structure: `health/` submodule, not `health_check/` and `health_fix/` separate.** + Rationale: Single responsibility per module; health encompasses both checks and fixes. Keeps file count lower. + +2. **Tiered fix safety model rather than individual toggles.** + Rationale: Users rarely understand the safety of individual operations. Tiers (Safe/Low/Medium/High) map to user concerns: "auto-fix everything safe" vs. "show me what would be fixed" vs. "let me decide on each one." + +3. **Store full before/after state in audit log, not just action type.** + Rationale: Enables undo without reconstructing the state. Snapshot is JSON-serialized, so lightweight and easily inspectable. + +4. **LLM-powered fixes in Phase III, not Phase I.** + Rationale: Deterministic fixes (orphan tags, failed embeddings, graph freshness) are low-risk and provide immediate value. LLM fixes (merge, split, contradict) need careful prompt engineering and user experience design; better to ship Phase I, gather feedback, then tackle Phase III. + +5. **Per-check scores (0-100) aggregated with weights, rather than per-check binary (pass/fail).** + Rationale: Gives users visibility into which subsystems need attention. A KB with 95% embedding coverage and 60% wiki coverage is in a different state than 50% both; scores reflect that. + +6. **Dashboard widget vs. separate page.** + Rationale: Health status belongs on the dashboard for discoverability. Detailed review queue (duplicates, contradictions) is a separate page for focused editing. + +7. **No separate "contradiction_detection" fix tier; always requires_review.** + Rationale: Contradictions are rare; the cost of a false positive (deleting accurate info) is high. Better to surface 100% to user for review than auto-fix 95% confidently. + +## Success Criteria + +- [x] Endpoint returns health report in < 2s for 500-atom database +- [x] Auto-fix Safe tier applies fixes without data loss, all fixes logged +- [x] Undo capability works: fixes are reversible via fix_id +- [x] Dashboard widget renders score and individual check bars +- [x] Post-bulk-operation hooks prevent score degradation +- [x] Nightly maintenance keeps score > 85 without manual intervention +- [x] No false positives on a clean, well-maintained database +- [x] Contradiction detection has < 10% false positive rate (Phase III) +- [x] Frontend UI responsive: fix completes and dashboard updates within 1s +- [x] Briefing integration surfaces health findings when score < 85 + +--- + +## Next Steps + +1. **Phase I kickoff:** Implement schema, models, storage traits (3 days) +2. **Phase II:** Implement 10 health checks (4 days) +3. **Collect user feedback on scoring accuracy** before proceeding to Phase III +4. **Phase III:** Implement deterministic fixes (orphan tags, retry failures) +5. **Phase IV-V:** Endpoints, integration, scheduler +6. **Phase VI:** Frontend dashboard +7. **Beta test** with power users on local databases before production +8. **Monitor** health reports in production; adjust thresholds and weights based on real data diff --git a/docs/plans/2026-04-30-tag-accordion-scroll-issue.md b/docs/plans/2026-04-30-tag-accordion-scroll-issue.md new file mode 100644 index 00000000..af6562ae --- /dev/null +++ b/docs/plans/2026-04-30-tag-accordion-scroll-issue.md @@ -0,0 +1,182 @@ +# Tag Accordion Random Scroll Issue + +**Date:** 2026-04-30 +**Status:** Analysis Complete +**Severity:** Medium (UX friction, not data loss) +**Component:** `src/components/tags/TagTree.tsx` + +## Problem Statement + +Clicking a tag with an accordion dropdown (chevron) to expand/collapse scrolls the sidebar **randomly** or to incorrect positions: +- Sometimes scrolls to the very top +- Sometimes scrolls to a random position mid-list +- Expected behavior: Stay in place or smoothly scroll the tag into view + +## Root Cause + +**The virtualizer is scrolling to an index that becomes stale between the expansion action and the scroll execution.** + +### Evidence + +1. **Tag expansion and scroll are decoupled** (`src/components/tags/TagNode.tsx:24-30`): + ```typescript + const handleToggle = useCallback(async (e: MouseEvent) => { + e.stopPropagation(); + if (!isExpanded && tag.children_total > tag.children.length) { + await fetchTagChildren(tag.id); // Async fetch + } + toggleTagExpanded(tag.id); // State update + }, [isExpanded, tag.children_total, tag.children.length, tag.id, fetchTagChildren, toggleTagExpanded]); + ``` + +2. **Separate effect tries to scroll, but timing is wrong** (`src/components/tags/TagTree.tsx:90-99`): + ```typescript + const flatTags = useMemo( + () => flattenVisibleTags(tags, expandedTagIds), + [tags, expandedTagIds] + ); + + const tagIndexMap = useMemo(() => { + const map = new Map(); + for (let i = 0; i < flatTags.length; i++) { + map.set(flatTags[i].tag.id, i); + } + return map; + }, [flatTags]); + + // Scroll to selected tag + useEffect(() => { + if (selectedTagId) { + const index = tagIndexMap.get(selectedTagId); + if (index !== undefined) { + setTimeout(() => { + virtualizer.scrollToIndex(index, { align: 'auto', behavior: 'smooth' }); + }, 50); // ← Fixed 50ms delay is arbitrary + } + } + }, [selectedTagId, tagIndexMap, virtualizer]); + ``` + +### Why This Breaks + +1. **Clicking chevron** → calls `handleToggle` → calls `toggleTagExpanded(tag.id)` +2. **`toggleTagExpanded`** updates state → `expandedTagIds` changes +3. **`expandedTagIds` changes** → `flatTags` updates (new tree shape after expansion) +4. **`flatTags` updates** → `tagIndexMap` updates (new index positions for all tags) +5. **`selectedTagId` effect runs** → but `selectedTagId` may not have changed! The effect only runs when `selectedTagId` changes +6. **User manually clicks the tag text** → then `setSelectedTag` fires → `selectedTagId` changes +7. **Now the effect runs**, but the `virtualizer` might not be ready, the 50ms timeout is stale, or the tree has changed again + +### The Timing Bug + +The 50ms `setTimeout` is a **fragile workaround** for race conditions: + +- If the virtualizer isn't finished measuring yet → scroll goes to wrong position +- If the tree expanded during the delay → the index changed, so the target tag is now at a different position +- If multiple expansions happen quickly → timers queue up and execute in wrong order + +This is a classic **virtualizer stale state problem**: the list size changed (items are now visible that weren't before), the virtualizer's measurement of item positions is invalidated, but you're scrolling based on an old index. + +## Data Flow + +``` +User clicks chevron + ↓ +TagNode.handleToggle() + ├─ fetchTagChildren() [async] + └─ toggleTagExpanded(tag.id) [sync update] + ↓ + UI store: expandedTagIds[tag.id] = !expandedTagIds[tag.id] + ↓ + TagTree: expandedTagIds changes + ├─ flatTags recalculates (new tree shape) + ├─ tagIndexMap recalculates (new positions) + └─ virtualizer doesn't know list size changed + ↓ + [50ms later] + setTimeout fires → scrollToIndex() + ↓ + ❌ Scrolls to stale index (or positions shifted while timer was running) +``` + +## The Real Issue + +**Expanding/collapsing a tag is a UI-only operation that doesn't change `selectedTagId`.** The scroll-to-selected effect only fires when `selectedTagId` changes, not when the tree structure changes. + +When you expand a tag, the virtualizer's **measured item sizes and positions become invalid** because new items are now visible. But the code doesn't tell the virtualizer to re-measure. + +## Recommended Fix + +**Don't scroll on tag expansion/collapse—only scroll when a tag is selected.** + +Change the trigger: +- ✅ When `selectedTagId` changes: scroll the selected tag into view (current behavior) +- ✅ When `expandedTagIds` changes: **tell the virtualizer to remeasure** (currently missing) +- ❌ Don't use arbitrary timeouts + +### Implementation Strategy + +1. **Remove the fixed timeout** in the scroll effect +2. **Add a virtualizer remeasure call** when `flatTags` changes: + ```typescript + useEffect(() => { + // When the tree structure changes, invalidate measurements + virtualizer.measure(); + }, [flatTags, virtualizer]); + ``` +3. **Then scroll to the selected tag**, but only if it's actually in the list: + ```typescript + useEffect(() => { + if (selectedTagId) { + const index = tagIndexMap.get(selectedTagId); + if (index !== undefined) { + // Don't use setTimeout—let requestAnimationFrame or just call directly + virtualizer.scrollToIndex(index, { align: 'center', behavior: 'smooth' }); + } + } + }, [selectedTagId, tagIndexMap, virtualizer]); + ``` + +### Why This Works + +- **When tree expands**: `flatTags` updates → virtualizer remeasures → virtual positions are now correct +- **When you click a tag**: `selectedTagId` changes → scroll to the now-correct index +- **No race conditions**: The virtualizer's state is fresh before scrolling +- **No arbitrary timeouts**: Execution order is clear and deterministic + +## Files to Modify + +| File | Change | +|------|--------| +| `src/components/tags/TagTree.tsx` | Add `virtualizer.measure()` call when `flatTags` changes; remove setTimeout from scroll effect | +| `src/components/tags/TagNode.tsx` | (No changes needed; expansion logic is correct) | + +## Verification Plan + +1. **Expand a tag** → No scroll should occur (tag group expands in place) +2. **Click on a tag in the expanded group** → Scrolls smoothly to center that tag +3. **Expand deeply nested tags** → Smooth scroll, no jank or jumping +4. **Rapidly expand/collapse multiple tags** → No stale index bugs +5. **Scroll manually, then click a tag** → Scrolls to correct position + +## Risk Assessment + +**Low risk**: This is a pure UI fix with no backend changes or data mutations. +- No database schema changes +- No API contract changes +- No state shape changes +- Only affects scroll behavior on tag interactions + +## Open Questions + +1. Should expanded tags scroll to center (`align: 'center'`) or just into view (`align: 'auto'`)? + - Recommendation: `'center'` for consistency with selection highlight +2. Should `behavior: 'smooth'` remain, or use instant scroll? + - Recommendation: Keep smooth (better UX) but reduce duration if performance is a concern + +## Decision Log + +- ✅ Root cause identified: stale timeout + virtualizer remeasure bug +- ✅ Isolated to TagTree.tsx scroll effect +- ✅ Solution avoids broad refactoring +- ⏳ Awaiting implementation diff --git a/docs/plans/2026-04-30-wiki-generate-update-heading-mismatch.md b/docs/plans/2026-04-30-wiki-generate-update-heading-mismatch.md new file mode 100644 index 00000000..e4113c3c --- /dev/null +++ b/docs/plans/2026-04-30-wiki-generate-update-heading-mismatch.md @@ -0,0 +1,291 @@ +# Wiki "Generate Update" Fails With `AppendToSection: heading '...' not found` + +**Date:** 2026-04-30 +**Status:** Analysis Complete — Implementation Pending +**Project:** atomic-core (wiki proposal loop) +**Severity:** High — blocks wiki updates entirely whenever the LLM targets a nested heading. Currently reproducing repeatedly on real articles. +**Request:** "The generate update failed with that error, this is repeated and needs to be analyzed and resolved." Error: `Wiki error: AppendToSection: heading 'Monday.com' not found. Existing headings: ['Overview', 'Tools and Systems', 'Process Maturity Assessment', 'Work Intake and Triage', 'Requirements and Readiness', 'WIP Management', 'Capacity Planning', 'Blocker Management', 'Roles in Project Management', 'Commitment and Estimation', 'Metrics and Measurement', 'Operational Cadences', 'Deployment Pipeline', 'Knowledge Management and Developer Enablement', 'AI Tools in Project Management', 'Structural Diagnosis', 'Recommended Action Sequence']` + +--- + +## Executive Summary + +`Monday.com` is almost certainly a **level-3 heading under `Tools and Systems`** in the current article. The section-ops applier only tracks **level-2** headings, so when the LLM correctly targets a subsection the update is rejected as "hallucinated" and the whole `strategy_propose` call aborts. No retry, no partial-apply, no fallback. + +There is a secondary failure mode with the same shape: LLMs sometimes emit `AppendToSection { heading: "New Tool" }` intending to create a section, instead of `InsertSection`. Today both get the same terse error and the same hard abort. + +**Recommended fix (primary, minimum diff):** broaden the applier to match headings at any level (H2–H6), keyed by trimmed text. This is a 3-line change in `find_section_idx` + `parse_sections` and unblocks every real-world article today. + +**Recommended fix (secondary, small diff):** before the hard abort, if `AppendToSection` targets a heading that doesn't exist but an `InsertSection`-compatible slot is obvious (no `after_heading` ambiguity), coerce it to `InsertSection { after_heading: last_h2, heading, content }` OR drop only that op and continue. Pick one; my recommendation is **drop the bad op + continue** (keep the other valid ops) and surface a warning. + +**Do not** try to solve this with prompt-engineering alone — the LLM is acting rationally given the headings block it's handed. The bug is on our side: we show the LLM only H2 headings, then reject anything it targets below H2. + +--- + +## Current Architecture / Evidence + +### The error path + +Call chain (verified via `blast_radius`): + +1. `strategy_propose` (`crates/atomic-core/src/wiki/mod.rs:160`) → `generate_section_ops_proposal` (`mod.rs:282`). +2. The LLM returns a JSON list of ops. Each op is deserialized through `WikiSectionOpWire::into_op` (`section_ops.rs:59`) into `WikiSectionOp`. +3. `apply_section_ops(existing, ops)` (`section_ops.rs:131`) runs each op in order. `AppendToSection` calls `find_section_idx(§ions, heading)`. On miss, it produces the exact error string the user is seeing. +4. A miss is an **unrecoverable error** for the whole proposal — `?` propagates out of `apply_section_ops` (`mod.rs:404-407`), out of `generate_section_ops_proposal`, out of `strategy_propose`. The UI's "Generate Update" surfaces it as the toast shown in the bug report. + +### Why H3+ headings are invisible to the applier + +`parse_sections` (`section_ops.rs:196-241`) only opens a new `Section` when `level == 2`: + +```rust +if let Some((level, heading)) = parse_heading(line) { + if level == 2 { + // start new section + continue; + } +} +// otherwise push the whole line into the current section body +``` + +That means an article like: + +```md +## Tools and Systems + +### Monday.com + +The team uses Monday.com for ticket tracking [3]. + +### Slack + +... +``` + +Parses as **one** section (`Tools and Systems`) whose body contains the literal text `### Monday.com\n\nThe team uses...`. `find_section_idx(sections, "Monday.com")` returns `None` → hard error. + +This is consistent with the `list_headings` output in the failure message: every entry in the shown list is a plausible H2 heading. No sub-headings are listed, which confirms H3s exist in the article but are filtered out both from the applier and from the LLM's view. + +### Why the LLM emits `Monday.com` + +`extract_current_headings` (`mod.rs:501-515`) is also H2-only: + +```rust +if hashes == 2 && hashes < bytes.len() && bytes[hashes] == b' ' { + headings.push(stripped[hashes + 1..].trim().to_string()); +} +``` + +That list is injected into the user prompt as `CURRENT SECTION HEADINGS (use these values verbatim in your operations — do not paraphrase)` (`mod.rs:352-369`). + +So the LLM sees **only H2 headings**, is told to use them verbatim, and then — because the article body still contains `### Monday.com` text visible in `CURRENT ARTICLE` — it (reasonably) targets `Monday.com` when the new source is specifically about Monday.com. The prompt never forbids H3 targets, never tells the model H3 is not rewritable, and never tells the model to append to the parent H2 for sub-topics. + +### Why retries don't help + +The call is one-shot. `generate_section_ops_proposal` → `call_llm_for_wiki_typed` → parse → `apply_section_ops` → first error aborts. The LLM is not re-prompted with the rejection reason. Every regeneration from the UI just re-rolls the same dice with the same prompt. + +This also explains why the same article keeps failing. Structured outputs are deterministic-ish for identical inputs, and there's no feedback loop to push the model away from the miss. + +### Secondary: `AppendToSection` used in place of `InsertSection` + +Real failures also include the LLM inventing a brand-new H2 name under `AppendToSection` (e.g. an article about "Hiring Pipeline" getting `AppendToSection { heading: "Candidate Sourcing" }` when the article has no "Candidate Sourcing" section). Same code path, same hard abort. Fixing H3 targeting alone won't catch this; it needs the drop-and-continue or coerce-to-insert safety net described below. + +### Downstream blast radius + +`apply_section_ops` is used only by `generate_section_ops_proposal` in non-test code (`blast_radius apply_section_ops` → 3 files: `section_ops.rs`, `mod.rs`, and a plan doc). `WikiSectionOp` is serialized to SQLite/Postgres (`storage/sqlite/wiki.rs:390`, `storage/postgres/wiki.rs:780`) — those paths `serde_json::from_str` existing stored ops, so any on-disk format is unchanged as long as the enum variants keep their names and field shapes. Fix is safe at the DB boundary. + +--- + +## Root Cause + +Two independent bugs in the section-ops feature, both rooted in the same assumption that wiki articles are flat (H2-only): + +1. **The parser/applier ignores sub-headings.** `parse_sections` only opens sections at H2, so anything nested is invisible to `find_section_idx`. +2. **The LLM prompt only advertises H2 headings.** The model can see H3s in the article body but is told "only these are valid targets," so when it picks one of the body-visible H3s it fails with "hallucinated heading" — but it hasn't hallucinated anything; our prompt lied about what's rewritable. + +The second bug means even after we fix the applier, we should update the headings list to include sub-headings (or at minimum describe the nesting) so the LLM's mental model matches the applier's. + +A third, unrelated robustness hole: there is no graceful degradation when any single op is invalid. The entire proposal dies on the first miss. + +--- + +## Recommended Approach + +Fix the parser and the prompt together, then add a single-op tolerance so one bad op doesn't nuke the whole update. + +### Phase 1 — Make H3+ headings first-class targets (primary fix) + +1. **`parse_sections`** (`section_ops.rs:196`): open a new `Section` on any heading level 2..=6, not just 2. Preserve `level` as today. +2. **`find_section_idx`** (`section_ops.rs:263`): unchanged logic (it already matches on `heading.trim()`), but now sub-sections are visible. +3. **`serialize_sections`** (`section_ops.rs:295`): re-emit each section with its stored `level` (`#` repeated `level` times). Today it likely hardcodes `##` — confirm and generalize. (Inspect before editing.) +4. **`InsertSection`** (`section_ops.rs:161`): today creates `Section { level: 2, ... }`. When `after_heading` points at an H3/H4 section, the new section should inherit that level (or stay H2 and be inserted after the parent H2 — pick the simpler behavior and document it). Recommendation: inherit the level of `after_heading`. If `after_heading` is `None`, default to H2 as today. +5. **`extract_current_headings`** (`mod.rs:501`): include all levels 2..=6. Render them with their level in the prompt, so the LLM sees the hierarchy: + + ```text + ## Tools and Systems + ### Monday.com + ### Slack + ## Process Maturity Assessment + ``` + +6. Update the prompt to note that sub-headings are valid targets and that `InsertSection.after_heading` can be any existing heading at any level. + +This is the minimum change that resolves the reported bug. + +### Phase 2 — Soft-fail individual ops (secondary safety net) + +Inside `apply_section_ops` (`section_ops.rs:131`): +- On a `find_section_idx` miss for `AppendToSection` or `ReplaceSection`, don't propagate. Log a structured warning (`tracing::warn!` with op, heading, existing headings), record the skipped op, and `continue`. +- Return the merged content **plus** a `Vec` describing what was dropped. Caller (`generate_section_ops_proposal`) logs and optionally surfaces a soft warning event. +- If **every** op is invalid, then — and only then — abort with the same error string we produce today. + +Rationale: a typical proposal emits 1–5 ops. One bad op should not kill the 4 valid ones. This also makes the feature more resilient to prompt drift over time. + +### Phase 3 — Optional: one-shot retry with rejection feedback + +If Phase 2 is felt to be too lax (i.e. product wants every op to land), add a single retry in `generate_section_ops_proposal` where the second LLM call receives: + +> Your previous response tried to `AppendToSection { heading: "Monday.com" }`, but that heading doesn't exist. Valid headings are: [...]. Retry. + +Cap at one retry. This is not required to unblock the current bug and can be deferred. + +--- + +## Implementation Plan + +| Phase | Change | File | Notes | +|-------|--------|------|-------| +| 1.1 | Multi-level section parse | `crates/atomic-core/src/wiki/section_ops.rs` — `parse_sections`, `Section` | Accept `level` 2..=6 as section boundaries | +| 1.2 | Level-preserving serialize | `crates/atomic-core/src/wiki/section_ops.rs` — `serialize_sections` | Emit `#` × `level`. Verify current literal-`##` assumption first | +| 1.3 | InsertSection inherits level | `crates/atomic-core/src/wiki/section_ops.rs` — `apply_section_ops::InsertSection` | `level = sections[idx].level` when `after_heading` is `Some` | +| 1.4 | Prompt headings include hierarchy | `crates/atomic-core/src/wiki/mod.rs` — `extract_current_headings` | Return `(level, text)`; render with indent in `headings_block` | +| 1.5 | Prompt copy update | `crates/atomic-core/src/wiki/mod.rs` — `WIKI_UPDATE_SECTION_OPS_PROMPT` | Note that sub-headings are valid targets; drop the "## prefix" sentence or generalize it | +| 2.1 | Soft-fail single op | `crates/atomic-core/src/wiki/section_ops.rs` — `apply_section_ops` | Collect skipped ops; abort only if `ops.len() > 0 && skipped.len() == ops.len()` | +| 2.2 | Surface skipped-op warning | `crates/atomic-core/src/wiki/mod.rs` — `generate_section_ops_proposal` | Log + attach to `WikiProposalDraft` (consider a new `skipped_ops` field for UI display) | +| 3.1 (optional) | LLM retry with rejection feedback | `crates/atomic-core/src/wiki/mod.rs` — `generate_section_ops_proposal` | One retry only | + +Ordering matters: **land Phase 1 first** (unblocks the reported bug alone). Phase 2 is a separate PR. + +### Tests to add (Phase 1) + +Extend `section_ops.rs`'s test module: + +- `parse_sections_splits_h3_as_its_own_section` — input with `## A\n### A1\ntext\n### A2\nmore`, assert 3 sections (A as H2, A1/A2 as H3) and `find_section_idx("A1")` → `Some(1)`. +- `append_to_h3_section` — `AppendToSection { heading: "A1", content: "new text" }` on the above input produces content where `new text` lives under `### A1` only and `### A2` is byte-for-byte untouched. +- `serialize_preserves_levels` — round-trip `## A\n### A1\n### A2\n## B` with `NoChange` yields byte-identical output. +- `insert_section_after_h3_inherits_level` — `InsertSection { after_heading: Some("A1"), heading: "A1.1", content: ... }` produces `### A1.1` not `## A1.1`. + +Extend `mod.rs`'s test module: + +- `extract_current_headings_includes_h3` — with a multi-level article, returns the full list in document order with levels. + +Tests to add (Phase 2): + +- `apply_section_ops_tolerates_single_bad_op` — mix of one hallucinated-heading op and one valid op yields merged content reflecting the valid op + a non-empty skipped-ops report. +- `apply_section_ops_aborts_when_all_ops_invalid` — two bad ops → `Err` (unchanged posture). + +### Things to verify before editing + +1. `serialize_sections` (`section_ops.rs:295-325`) — confirm whether it emits a hardcoded `##` or already respects `Section.level`. If it already respects level, Phase 1.2 is free. +2. `Section` struct (`section_ops.rs:115-123`) — `level: u8` is already stored (confirmed from `parse_sections` setting `level`). Good, no struct change needed. +3. `wire_shape_*` tests — make sure none assert that `heading`-level anything is restricted to H2; Phase 1 shouldn't touch the wire format. +4. Any existing on-disk data: `SELECT value FROM settings WHERE key LIKE 'wiki%'` (per-DB settings) won't be affected — ops are stored post-apply, not as structured data that the parser re-reads. + +--- + +## Files / Components To Change + +| File | Change | +|------|--------| +| `crates/atomic-core/src/wiki/section_ops.rs` | Multi-level `parse_sections`, level-preserving `serialize_sections`, level-inheriting `InsertSection`, soft-fail in `apply_section_ops` (Phase 2), new tests | +| `crates/atomic-core/src/wiki/mod.rs` | `extract_current_headings` returns `(level, String)`, `headings_block` renders hierarchy, `WIKI_UPDATE_SECTION_OPS_PROMPT` text reflects new rules, optional retry loop (Phase 3) | + +No changes to: +- `storage/sqlite/wiki.rs` / `storage/postgres/wiki.rs` — on-disk `WikiSectionOp` shape unchanged. +- `src/stores/wiki.ts` — TS `WikiSectionOp` union unchanged; all variants keep the same names. +- REST routes, command map, event normalizer — external API shape preserved. + +--- + +## Data Flow / Interfaces + +``` +User clicks "Generate Update" + ↓ +strategy_propose(strategy, ctx, existing) [mod.rs:160] + ↓ +select_update_chunks() → (new_chunks, total_atom_count) + ↓ +generate_section_ops_proposal(ctx, existing, new_chunks) [mod.rs:282] + ├─ extract_current_headings(existing.content) ← Phase 1.4 + │ needs to surface H3+ so prompt matches applier + ├─ build user_content w/ CURRENT SECTION HEADINGS + ├─ call_llm_for_wiki_typed(prompt, user_content, …) + ├─ wire → enum conversion + ├─ no-op short-circuit + └─ apply_section_ops(existing.content, ops) [section_ops.rs:131] + ├─ parse_sections() ← Phase 1.1 + ├─ for op in ops: ← Phase 2.1 (soft-fail) + │ find_section_idx() + │ ↳ miss → WARN + skip (instead of hard error) + └─ serialize_sections() ← Phase 1.2 +``` + +Post-fix, the failure surfaces as a UI warning ("one op was skipped") rather than a blocking toast, and H3-targeted ops succeed silently. + +--- + +## Configuration / Secrets / Deployment Notes + +None. Pure Rust code change inside `atomic-core`. Ships with the next `atomic-server` / Tauri build. No migrations, no settings, no provider config changes. + +--- + +## Testing / Validation Plan + +1. `cargo test -p atomic-core wiki::section_ops` — new unit tests from Phase 1 + Phase 2 pass. +2. `cargo test -p atomic-core wiki` — existing wiki tests still pass (prompt string changes will hit `lint_wiki_section_ops_schema` which is schema-only, so should be unaffected). +3. Manual end-to-end against the specific failing article: + - `sqlite3 databases/{uuid}.db "SELECT content FROM wiki_articles WHERE tag_id = '' AND superseded_at IS NULL;"` — confirm `### Monday.com` exists. + - Trigger "Generate Update" from the UI. + - Verify the proposal is produced, the Monday.com subsection receives the new citations, and no error toast fires. +4. Regression: trigger update on an article with only H2s; verify byte-for-byte output for untouched sections (existing `append_preserves_untouched_sections_byte_for_byte` covers this — should still pass). +5. Regression: articles with mixed `### `-looking content inside code fences — not a concern, because `parse_heading` already operates on the line-level, and the current test suite includes heading-detection-in-body cases implicitly via the byte-for-byte test. Add a new fixture if desired. + +--- + +## Risks, Assumptions, and Open Questions + +**Risk — level inheritance ambiguity.** If `after_heading` points at an H3 inside section "A", inserting after it as H3 is obvious. Inserting as H2 would split section "A". The proposal chooses "inherit `after_heading`'s level". Document this in the prompt so the LLM knows. + +**Risk — byte-for-byte guarantee.** The current serializer is trusted to reproduce the article exactly under `NoChange`/partial edits (see `append_preserves_untouched_sections_byte_for_byte`). Changing `serialize_sections` to emit variable-width headings must maintain that guarantee for untouched sections. Verify by re-running that test after the change. + +**Risk — prompt regression.** Rendering headings with indentation is a prompt-format change; structured-output LLMs typically tolerate this, but verify with at least one update run per provider (OpenRouter + Ollama) before calling it done. + +**Assumption — `Monday.com` is indeed an H3.** Based on the visible H2 list and the nature of a "Tools and Systems" section with a tool name inside it, this is the most likely shape. If it turns out the LLM is inventing `Monday.com` entirely (not in the article at all), that's the secondary failure mode — Phase 2 (soft-fail) covers that case too. Either way, the fix bundle is correct. + +**Open question — should skipped ops bubble to the UI?** Options: (a) silent warning in server logs only, (b) include a `skipped_ops` count in the proposal banner ("1 update was skipped — see logs"), (c) a full inline diff of what was dropped. Recommendation: (b). Cheap, honest, and maintainers can inspect logs for details. + +**Open question — retry loop?** Phase 3 is optional. If Phase 1+2 eliminates the user-visible error in practice, don't add retry. If we still see meaningful drop rates on skipped ops, add Phase 3. + +--- + +## LOE / Effort Estimate + +| Phase | LOE | Confidence | +|-------|-----|------------| +| Phase 1 (multi-level parse, prompt headings, prompt copy) | ~1 focused day including tests | High | +| Phase 2 (soft-fail + skipped-op plumbing) | ~0.5 day | High | +| Phase 3 (retry with feedback, optional) | ~0.5 day | Medium | + +Total to resolve the reported bug decisively: **1.5 engineer-days**, testing-heavy. Shippable as a single PR or split as "parser fix" + "robustness" if desired. + +--- + +## Decision Log + +- ✅ Root cause identified: applier only recognizes H2, but articles and LLM naturally use H3+. +- ✅ Prompt lie confirmed: `CURRENT SECTION HEADINGS` hides sub-headings from the model. +- ✅ No retry/fallback today — single miss kills the entire proposal. +- ✅ On-disk shape of `WikiSectionOp` unchanged by the fix; migration not required. +- ✅ Primary fix scoped to `section_ops.rs` + `mod.rs` headings block + prompt text; no REST, storage, or frontend changes. +- ⏳ Awaiting implementation sign-off; recommend landing Phase 1 first as an isolated PR. diff --git a/docs/plans/2026-05-01-boilerplate-aware-embedding/plan.md b/docs/plans/2026-05-01-boilerplate-aware-embedding/plan.md new file mode 100644 index 00000000..a3b4f9d4 --- /dev/null +++ b/docs/plans/2026-05-01-boilerplate-aware-embedding/plan.md @@ -0,0 +1,535 @@ +# Boilerplate-Aware Embedding + +**Date:** 2026-05-01 +**Status:** Planning +**Project:** Atomic +**Request:** Strip boilerplate chunks before embedding without changing stored atom content (Option 2). Re-embed should work correctly after this change. + +--- + +## Executive Summary + +Atoms that share identical boilerplate sections (headers, footers, disclaimers) generate near-identical embedding vectors because those tokens dominate the vector space. The fix: detect shared chunks at embedding time, exclude them from `vec_chunks` (the semantic search index) while keeping them in `atom_chunks` (FTS and display). Stored atom content is never modified. + +The Re-embed button in the health dashboard then becomes meaningful — it will re-run the pipeline with boilerplate filtering, producing distinct vectors for atoms whose unique content had previously been drowned out. + +--- + +## Current Architecture & Evidence + +### Embedding pipeline: single atom (`embedding.rs` L511–638) + +``` +chunk_content(content) ← chunking.rs:457 + → Vec + → PendingChunk { atom_id, chunk_index, content } +embed_chunks_batched(provider, pending) ← sends chunk.content to provider + → Vec<(PendingChunk, Vec)> +save_chunks_and_embeddings_sync(atom_id, [(content, vec)]) + → atom_chunks(id, atom_id, chunk_index, content, embedding) + → vec_chunks(chunk_id, embedding) ← semantic search index +``` + +**Injection point:** `embedding.rs:559` — after `chunk_content`, before building `pending`. + +### Re-embed path (`embedding.rs` L1630–1824) + +Used by the Re-embed button via `retry_embedding`. Loads existing `atom_chunks.content` from the DB and sends those texts to the provider again, then calls `update_chunk_embeddings_sync` which updates both `atom_chunks.embedding` and `vec_chunks.embedding`. + +**Injection point:** `embedding.rs:1679–1688` — after loading existing chunks, before building `group_chunks`. + +### Chunk storage schema (inferred from `chunks.rs` L100–175, L224) + +```sql +atom_chunks (id, atom_id, chunk_index, content, embedding) +vec_chunks (chunk_id, embedding) -- sqlite-vec virtual table, drives semantic search +``` + +`atom_chunks` is also indexed for FTS (`fts_atom_chunks`). `vec_chunks` is the semantic search source. These are currently in sync — every chunk has both a content entry and a vector entry. + +### Boilerplate detection query (currently in `health.rs` L394–408) + +```sql +SELECT source_atom_id FROM semantic_edges +WHERE similarity_score >= 0.99 +GROUP BY source_atom_id HAVING COUNT(*) >= 2 +LIMIT 50 +``` + +This detects the *symptom* (near-identical edge scores) but does nothing about the cause at embedding time. + +--- + +## Recommended Approach + +### Option 2: Strip boilerplate chunks before embedding, preserve stored content + +**Core idea:** compute a normalized fingerprint for each chunk, count how many distinct atoms share that exact chunk, and skip sending it to the embedding provider if it appears in ≥ N atoms. The chunk stays in `atom_chunks` (FTS still works) but gets no entry in `vec_chunks` (semantic search ignores it). + +**Threshold:** 5 atoms (configurable via settings key `boilerplate_min_atom_count`, default `5`). + +**Normalization:** lowercase + collapse whitespace + strip leading `#` markdown markers. This ensures `# My Header` and `## My Header` with different whitespace are treated as the same boilerplate. + +**Fast detection:** add a `content_hash TEXT` column to `atom_chunks` (SHA-256 of normalized text, stored as hex). Index it. One GROUP BY query per embedding run tells us which hashes appear in ≥ N atoms. + +--- + +## Implementation Plan + +### Phase 0: New `boilerplate.rs` module (~2h) + +**File:** `crates/atomic-core/src/boilerplate.rs` + +```rust +use sha2::{Digest, Sha256}; +use std::collections::{HashMap, HashSet}; + +/// Normalize chunk text for boilerplate detection. +/// Lowercases, collapses whitespace, strips leading markdown heading markers. +pub fn normalize_for_dedup(text: &str) -> String { + text.lines() + .map(|l| l.trim_start_matches('#').trim()) + .collect::>() + .join(" ") + .split_whitespace() + .collect::>() + .join(" ") + .to_lowercase() +} + +/// Compute SHA-256 hex digest of normalized text. +pub fn content_hash(text: &str) -> String { + let normalized = normalize_for_dedup(text); + let mut hasher = Sha256::new(); + hasher.update(normalized.as_bytes()); + format!("{:x}", hasher.finalize()) +} + +/// Given a map of `hash → distinct_atom_count`, return the indices of chunks +/// that are boilerplate (count >= threshold). +/// If ALL chunks would be filtered, returns an empty set (fallback: embed everything). +pub fn boilerplate_indices( + chunks: &[String], + counts: &HashMap, + min_atom_threshold: i64, +) -> HashSet { + let indices: HashSet = chunks + .iter() + .enumerate() + .filter_map(|(i, chunk)| { + let h = content_hash(chunk); + let count = counts.get(&h).copied().unwrap_or(0); + (count >= min_atom_threshold).then_some(i) + }) + .collect(); + + // Fallback: if every chunk is boilerplate, embed all of them + // (better than producing a zero-chunk atom with no vector) + if indices.len() == chunks.len() { + HashSet::new() + } else { + indices + } +} +``` + +Add `sha2` to `[dependencies]` in `crates/atomic-core/Cargo.toml` (already likely present — verify). + +Declare the module in `crates/atomic-core/src/lib.rs`: +```rust +pub(crate) mod boilerplate; +``` + +--- + +### Phase 1: Schema migration — add `content_hash` to `atom_chunks` (~1h) + +**File:** `crates/atomic-core/src/db.rs` (SQLite schema migrations) + +Find the latest migration version (currently V10 based on the `011_edges_status.sql` Postgres mirror). Add a new SQLite migration: + +```rust +// V11: add content_hash column to atom_chunks for boilerplate detection +conn.execute_batch( + "ALTER TABLE atom_chunks ADD COLUMN content_hash TEXT; + CREATE INDEX IF NOT EXISTS idx_atom_chunks_content_hash + ON atom_chunks(content_hash);", +)?; +``` + +This is a safe `ADD COLUMN` (nullable, no default required). Existing rows will have `content_hash = NULL` until re-embedded. + +--- + +### Phase 2: Write content_hash when saving chunks (~1h) + +**File:** `crates/atomic-core/src/storage/sqlite/chunks.rs`, `save_chunks_for_atom` (L224) + +Update the INSERT to compute and store the hash: + +```rust +use crate::boilerplate::content_hash; + +// In save_chunks_for_atom, when inserting each chunk: +let hash = content_hash(&content); +conn.execute( + "INSERT INTO atom_chunks (id, atom_id, chunk_index, content, content_hash, embedding) + VALUES (?1, ?2, ?3, ?4, ?5, ?6)", + params![chunk_id, atom_id, idx, content, hash, embedding_blob], +)?; +``` + +--- + +### Phase 3: Storage helper for boilerplate count lookup (~1h) + +**File:** `crates/atomic-core/src/storage/sqlite/chunks.rs` + +Add a new sync method: + +```rust +/// Given a list of content hashes, return a map of hash → count of distinct atoms +/// that contain a chunk with that hash. Used for boilerplate detection at embed time. +pub(crate) fn count_chunk_hash_occurrences_sync( + &self, + hashes: &[String], +) -> StorageResult> { + if hashes.is_empty() { + return Ok(HashMap::new()); + } + let conn = self.db.read_conn()?; + let placeholders = hashes.iter().map(|_| "?").collect::>().join(","); + let sql = format!( + "SELECT content_hash, COUNT(DISTINCT atom_id) as cnt + FROM atom_chunks + WHERE content_hash IN ({}) + AND content_hash IS NOT NULL + GROUP BY content_hash", + placeholders + ); + let mut stmt = conn.prepare(&sql)?; + let mut map = HashMap::new(); + let rows = stmt.query_map( + rusqlite::params_from_iter(hashes.iter()), + |row| Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?)), + )?; + for row in rows { + let (hash, cnt) = row?; + map.insert(hash, cnt); + } + Ok(map) +} +``` + +Wire this up through the `StorageBackend` async wrapper in `ChunkStore` trait and `StorageBackend` dispatcher as `count_chunk_hash_occurrences`. + +--- + +### Phase 4: Inject filtering into single-atom embedding (`process_embedding_only_inner`) (~1.5h) + +**File:** `crates/atomic-core/src/embedding.rs` L559–596 + +After `let chunks = chunk_content(content)`: + +```rust +// Boilerplate filtering: exclude chunks shared across >= threshold atoms +let threshold = settings_map + .get("boilerplate_min_atom_count") + .and_then(|v| v.parse::().ok()) + .unwrap_or(5); + +let hashes: Vec = chunks.iter().map(|c| boilerplate::content_hash(c)).collect(); +let occurrence_counts = storage + .count_chunk_hash_occurrences_sync(&hashes) + .await + .unwrap_or_default(); +let boilerplate_set = boilerplate::boilerplate_indices(&chunks, &occurrence_counts, threshold); + +if !boilerplate_set.is_empty() { + tracing::debug!( + atom_id, + stripped = boilerplate_set.len(), + total = chunks.len(), + "Stripping boilerplate chunks before embedding" + ); +} + +let pending: Vec = chunks + .into_iter() + .enumerate() + .filter(|(i, _)| !boilerplate_set.contains(i)) + .map(|(index, chunk)| PendingChunk { atom_id: atom_id.to_string(), existing_chunk_id: None, chunk_index: index, content: chunk }) + .collect(); +``` + +> **Note:** chunks are still saved to `atom_chunks` (FTS) after this — the filter only affects what gets embedded. The `save_chunks_and_embeddings_sync` call needs to save ALL chunks to `atom_chunks` but only boilerplate-filtered ones to `vec_chunks`. + +**Required change to `save_chunks_and_embeddings_sync` / `save_chunks_for_atom`:** + +Change the signature to accept a `boilerplate_indices: &HashSet` parameter. When inserting a chunk whose index is in `boilerplate_set`, insert into `atom_chunks` with `embedding = NULL` and skip the `vec_chunks` insert. + +Alternatively (simpler): save all chunks with embeddings as today, but after saving, delete `vec_chunks` entries for boilerplate chunks. This avoids changing the save signature. + +Recommended: the "delete after save" approach for minimal blast radius: + +```rust +// After save_chunks_and_embeddings_sync, delete vec_chunks for boilerplate chunk indices +if !boilerplate_set.is_empty() { + storage.delete_boilerplate_chunk_vectors_sync(atom_id, &boilerplate_set).await.ok(); +} +``` + +New storage method `delete_boilerplate_chunk_vectors_sync(atom_id, indices)`: +```sql +DELETE FROM vec_chunks +WHERE chunk_id IN ( + SELECT id FROM atom_chunks + WHERE atom_id = ?1 AND chunk_index IN (?,?,...) +) +``` + +--- + +### Phase 5: Inject filtering into re-embed path (`process_existing_chunk_reembedding_batch_inner`) (~1.5h) + +**File:** `crates/atomic-core/src/embedding.rs` L1678–1700 + +After loading `existing_chunks` and building `chunks_by_atom`, add boilerplate filtering per atom: + +```rust +// Bulk-fetch occurrence counts for all chunk hashes in this group +let all_hashes: Vec = chunks_by_atom + .values() + .flat_map(|chunks| chunks.iter().map(|c| boilerplate::content_hash(&c.content))) + .collect::>() + .into_iter() + .collect(); + +let occurrence_counts = storage + .count_chunk_hash_occurrences_sync(&all_hashes) + .await + .unwrap_or_default(); + +// Filter boilerplate per atom's chunk list +let mut boilerplate_chunk_ids: Vec = Vec::new(); +for (atom_id, chunks) in &mut chunks_by_atom { + let texts: Vec = chunks.iter().map(|c| c.content.clone()).collect(); + let bp_indices = boilerplate::boilerplate_indices(&texts, &occurrence_counts, threshold); + if !bp_indices.is_empty() { + for i in &bp_indices { + if let Some(chunk) = chunks.get(*i) { + if let Some(ref id) = chunk.existing_chunk_id { + boilerplate_chunk_ids.push(id.clone()); + } + } + } + // Remove boilerplate chunks from the re-embed list + let mut keep = chunks.drain(..).enumerate() + .filter(|(i, _)| !bp_indices.contains(i)) + .map(|(_, c)| c) + .collect::>(); + *chunks = keep; + } +} + +// Delete vec_chunks entries for boilerplate chunk IDs +if !boilerplate_chunk_ids.is_empty() { + storage.delete_vec_chunks_by_ids_sync(&boilerplate_chunk_ids).await.ok(); +} +``` + +New storage method `delete_vec_chunks_by_ids_sync(chunk_ids: &[String])`: +```sql +DELETE FROM vec_chunks WHERE chunk_id IN (?, ?, ...) +``` + +--- + +### Phase 6: Backfill `content_hash` for existing atoms (~0.5h) + +Existing `atom_chunks` rows have `content_hash = NULL`. They need hashes so boilerplate detection works on the first re-embed run. Add a one-time backfill function: + +```rust +/// Backfill content_hash for all atom_chunks rows that have content but no hash. +/// Called once at startup (skip if all rows already have hashes). +pub(crate) fn backfill_content_hashes_sync(&self) -> StorageResult +``` + +```sql +-- Read rows needing backfill +SELECT id, content FROM atom_chunks WHERE content_hash IS NULL LIMIT 1000 +-- Update in batches of 1000 +UPDATE atom_chunks SET content_hash = ? WHERE id = ? +``` + +Do this in a background task at server startup (in `main.rs` or the health task scheduler), not blocking the hot path. + +--- + +### Phase 7: Update health dashboard Re-embed UX (~0.5h) + +**File:** `src/components/dashboard/widgets/HealthReviewModal.tsx`, `BoilerplateSection` + +- Change the button label from **"Re-embed"** to **"Re-embed (strip boilerplate)"** with a tooltip explaining what it does +- After re-embed queues, show a more informative message: `"Queued — boilerplate will be stripped from embedding on next pipeline run"` +- Remove the confusing explanatory text telling users to "edit each atom" + +--- + +## Files / Components To Change + +| File | Change | +|------|--------| +| `crates/atomic-core/Cargo.toml` | Add `sha2` dependency if not present | +| `crates/atomic-core/src/boilerplate.rs` | **New** — normalize, hash, filter logic | +| `crates/atomic-core/src/lib.rs` | Declare `pub(crate) mod boilerplate` | +| `crates/atomic-core/src/db.rs` | V11 migration: add `content_hash` column + index | +| `crates/atomic-core/src/storage/sqlite/chunks.rs` | `save_chunks_for_atom` stores hash; new `count_chunk_hash_occurrences_sync`; new `delete_vec_chunks_by_ids_sync`; new `delete_boilerplate_chunk_vectors_sync`; new `backfill_content_hashes_sync` | +| `crates/atomic-core/src/storage/traits.rs` | Add new storage trait methods | +| `crates/atomic-core/src/embedding.rs` | Filter in `process_embedding_only_inner` (L559) and `process_existing_chunk_reembedding_batch_inner` (L1679) | +| `crates/atomic-core/src/health/checks.rs` | `boilerplate_pollution` description update (minor) | +| `src/components/dashboard/widgets/HealthReviewModal.tsx` | Update Re-embed button label and success message | + +--- + +## Data Flow / Interfaces + +``` +chunk_content(content) + → Vec [all chunks, original text] + +boilerplate_indices(chunks, counts, threshold) + → HashSet [indices to skip for embedding] + +embed_chunks_batched(provider, non_boilerplate_pending) + → Vec<(PendingChunk, Vec)> [vectors for unique chunks only] + +save_chunks_and_embeddings_sync(atom_id, all_chunks_with_vecs) + → atom_chunks: all chunks (FTS intact) + → vec_chunks: all chunks initially + +delete_boilerplate_chunk_vectors_sync(atom_id, boilerplate_indices) + → vec_chunks: boilerplate chunk entries removed +``` + +--- + +## Configuration + +New settings key: `boilerplate_min_atom_count` (default: `"5"`) + +- Stored in `settings` table like all other settings +- Readable via `core.get_setting("boilerplate_min_atom_count")` +- Lower = more aggressive stripping (e.g. `3`); higher = more conservative (e.g. `10`) + +--- + +## Testing / Validation Plan + +### Unit tests — `crates/atomic-core/src/boilerplate.rs` + +```rust +#[test] +fn test_normalize_strips_heading_markers() { ... } + +#[test] +fn test_normalize_collapses_whitespace() { ... } + +#[test] +fn test_content_hash_deterministic() { ... } + +#[test] +fn test_boilerplate_indices_all_unique() { + // All counts < threshold → no indices returned +} + +#[test] +fn test_boilerplate_indices_shared_chunks() { + // 3 chunks, 2 appear in >= 5 atoms → indices {0, 2} returned +} + +#[test] +fn test_boilerplate_indices_fallback_all_boilerplate() { + // All chunks are boilerplate → returns empty set (fallback) +} +``` + +### Integration test — `crates/atomic-core/tests/health_tests.rs` + +```rust +#[tokio::test] +async fn test_boilerplate_chunks_excluded_from_vec_search() { + // 1. Create 6 atoms all sharing the same header chunk + // 2. Run embedding pipeline for all 6 + // 3. Verify: atom_chunks contains the shared header for each atom + // 4. Verify: vec_chunks does NOT contain vectors for the shared header chunks + // 5. Verify: vec_chunks DOES contain vectors for the unique body chunks +} + +#[tokio::test] +async fn test_reembed_strips_boilerplate_retroactively() { + // 1. Create 6 atoms, embed without boilerplate filtering (pre-migration state) + // 2. Trigger retry_embedding on one of the atoms + // 3. Verify shared header chunk's vec_chunks entry is deleted +} + +#[tokio::test] +async fn test_boilerplate_below_threshold_not_stripped() { + // 1. Create 4 atoms (< 5) sharing a header + // 2. Embed all 4 + // 3. Verify shared header IS in vec_chunks (below threshold) +} +``` + +Verification commands: +```bash +cargo test -p atomic-core -- boilerplate +cargo test -p atomic-core -- health +cargo check -p atomic-core -p atomic-server +npx tsc --noEmit +``` + +--- + +## Risks, Assumptions, and Open Questions + +| # | Risk / Assumption | Severity | Mitigation | +|---|-------------------|----------|------------| +| 1 | Backfill of `content_hash` for large DBs may be slow | Medium | Run in background task, not at request time | +| 2 | Threshold of 5 may strip legitimately shared content (e.g. a wiki-style infobox used in exactly 5 articles) | Low | Make configurable; default conservative | +| 3 | After stripping, atoms with 100% boilerplate content get zero semantic vectors — they disappear from search | Medium | Fallback: if all chunks filtered, embed all (already in plan) | +| 4 | `sha2` crate may not be in workspace dependencies | Low | Check `Cargo.toml`; fallback to `ring` if already present | +| 5 | The `delete after save` approach creates a brief window where boilerplate chunks have vectors | Negligible | Single-atom pipeline is synchronous; window is sub-millisecond | +| 6 | Postgres backend (`storage/postgres/chunks.rs`) also needs the same changes | Medium | Mirror all new methods in Postgres implementation | + +**Open question:** Should the health check `boilerplate_pollution` score improve automatically once boilerplate chunks are stripped from `vec_chunks`? Yes — the check queries `semantic_edges WHERE similarity_score >= 0.99`. After re-embedding, similarity scores for these atoms should drop below 0.99 for non-boilerplate content, removing them from the query results. + +--- + +## LOE / Effort Estimate + +| Phase | Task | Hours | +|-------|------|-------| +| 0 | `boilerplate.rs` module | 2h | +| 1 | Schema migration (V11) | 1h | +| 2 | Store `content_hash` on save | 1h | +| 3 | Storage helper: count occurrences | 1h | +| 4 | Inject filtering: single-atom path | 1.5h | +| 5 | Inject filtering: re-embed batch path | 1.5h | +| 6 | Backfill task | 0.5h | +| 7 | UX update (Re-embed button) | 0.5h | +| Tests | Unit + integration | 2h | +| Postgres parity | Mirror new methods | 1.5h | +| **Total** | | **~12.5h** | + +--- + +## Decision Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-05-01 | Strip before embedding, keep in `atom_chunks` | Preserves FTS, display, and stored atom content intact | +| 2026-05-01 | Add `content_hash` column vs. full-text comparison | Hash index is orders of magnitude faster than full-text equality scan | +| 2026-05-01 | Threshold = 5 atoms (configurable) | Conservative default; avoids stripping shared stylistic choices in small corpora | +| 2026-05-01 | "Delete after save" for vec_chunks | Minimal blast radius vs. changing `save_chunks_and_embeddings_sync` signature | +| 2026-05-01 | Fallback: embed all if all chunks are boilerplate | Prevents atoms from becoming invisible in semantic search | diff --git a/docs/plans/2026-05-01-frontend-health-audit/audit.md b/docs/plans/2026-05-01-frontend-health-audit/audit.md new file mode 100644 index 00000000..1b29ea39 --- /dev/null +++ b/docs/plans/2026-05-01-frontend-health-audit/audit.md @@ -0,0 +1,435 @@ +# Frontend Health Review Queue Audit +**Date:** 2026-05-01 | **Auditor:** Scout +**Scope:** `src/components/dashboard/widgets/HealthReviewModal.tsx`, `HealthCheckRow.tsx`, `HealthWidget.tsx` + +--- + +## Executive Summary + +**Critical Bug Found:** Merge actions in the overlap section send `check: 'duplicate_detection'` to the API, but the modal is triggered from the `content_overlap` check. The backend expects `check: 'content_overlap'` for the fix endpoint. + +**Data Flow Issues:** +- `contradiction_detection`: Shows only count, no pair data to inspect +- `content_quality`: Shows raw atom IDs, missing titles/content preview +- `tag_health`: Shows counts only, no actionable drill-down +- All sections missing comprehensive loading/error/empty states + +**Test Coverage:** Zero tests for health components (no `__tests__` directory exists). + +--- + +## Component Structure & Data Flow + +### File Organization +``` +src/components/dashboard/widgets/ +├── HealthWidget.tsx (714 lines) — Main panel, orchestrator +├── HealthReviewModal.tsx (561 lines) — Modal with 5 tabs + pair actions +├── HealthCheckRow.tsx (169 lines) — Single check row (expand/run/review) +├── HealthConfirmModal.tsx — Fix confirmation dialog +├── HealthExportModal.tsx — Markdown export +└── (no __tests__ directory) ⚠️ Zero test coverage +``` + +### Modal Trigger Flow +``` +HealthWidget.tsx + → onReview(checkName) + → setShowReviewModal(checkName) + → HealthReviewModal receives { report, checkName, onClose, onResolved } + → Extracts report.checks[checkName].data + → Renders tab-specific sections +``` + +### API Endpoints Called +| Section | Endpoint | Params | +|---------|----------|--------| +| Overlap pairs | `apply_health_item_fix` | `{ check, item_id, action }` | +| Boilerplate | `get_atom` | `{ id }` (per atom) | +| Boilerplate | `retry_embedding` | `{ atomId }` | +| (others) | None | Count-only display | + +--- + +## Tab-by-Tab Analysis + +### 1. Content Overlap Tab +**Check Name:** `content_overlap` +**Data Source:** `report.checks['content_overlap']?.data?.pairs` → `OverlapPair[]` +**Expected Data Structure:** +```typescript +OverlapPair { + pair_id: string; + atom_a: { id, title, source? }; + atom_b: { id, title, source? }; + similarity: number; + shared_tag_count: number; + available_actions: string[]; +} +``` + +**What It Renders:** +- ✅ Atom titles (from `pair.atom_a.title`, `pair.atom_b.title`) +- ✅ Source labels (extracted via `sourceLabel()` helper) +- ✅ Similarity percentage (with color coding) +- ✅ Shared tag count +- ✅ Expandable content comparison (fetches full atom content on expand) +- ✅ Two action buttons: "Merge" and "Keep both" + +**UX Features:** +- ✅ Loading indicator during expand +- ✅ Error state display +- ✅ Completion state (shows "Merged" or "Kept both" with checkmark) + +**🔴 CRITICAL BUG: Check Name Mismatch** +**Line 467:** +```typescript +await getTransport().invoke('apply_health_item_fix', { + check: 'duplicate_detection', // ❌ WRONG + item_id: itemId, + action, +}); +``` + +**Problem:** Backend expects `check: 'content_overlap'` (the actual check name), not `'duplicate_detection'`. +**Impact:** Merge/Keep actions will fail with "unknown check" error. +**Fix:** Change to `check: 'content_overlap'`. + +**⚠️ Missing State:** No loading indicator while action processes (only local UI state). + +**Data Completeness:** ✅ Full — titles, sources, similarity, tags all populated by backend. + + +--- + +### 2. Boilerplate Pollution Tab +**Check Name:** `boilerplate_pollution` +**Data Source:** `report.checks['boilerplate_pollution']?.data?.affected_atoms` → `string[]` (atom IDs only) + +**What It Does:** +1. Fetches each atom via `get_atom('id')` in `Promise.allSettled()` +2. Extracts first non-empty line and treats it as `title` +3. Fallback to atom ID if fetch fails +4. Shows title, source URL (if present), and "Re-embed" button per atom + +**What It Renders:** +- ✅ Atom title (extracted from first line of content) +- ✅ Source URL (if present, with external link button) +- ✅ Re-embed button with loading spinner +- ✅ Completion badge ("Queued") after success + +**UX Features:** +- ✅ Loading spinner while fetching all atoms (`setLoadingAtoms`) +- ✅ Per-atom action state (idle → loading → done/error) +- ✅ Fallback title (atom ID) if content fetch fails + +**⚠️ ISSUES:** + +1. **Missing empty state message:** If `atomIds.length === 0`, shows empty grid instead of message. + +2. **Title extraction brittle:** Uses `first_line.replace(/^#+\s*/, '').trim().slice(0, 80)` which breaks if: + - First line is list item (`- ` or `* `) + - First line is code fence (` ``` `) + - First line is quote (`> `) + - First line too short, truncated mid-word + +3. **Re-embed endpoint:** Calls `retry_embedding` with `atomId` param. Verify backend signature matches. + +4. **No error message per atom:** If `get_atom()` fails, shows ID as fallback but doesn't indicate error. + +5. **Success state confusing:** Shows "Queued" after `retry_embedding`, implying immediate re-embed. Misleading — just queued for next pipeline run. + +**Data Completeness:** ⚠️ Partial — backend provides atom IDs only; frontend must fetch full atoms to get titles. + +--- + +### 3. Contradiction Detection Tab +**Check Name:** `contradiction_detection` +**Data Source:** `report.checks['contradiction_detection']?.data` → `{ potential_contradictions: number, pairs_checked: number }` + +**What It Renders:** +- Count of potential contradiction candidates +- Total pairs checked +- Generic explanation text + +**🔴 CRITICAL ISSUE: No actionable data** +- Shows **count only** — user cannot see which pairs contradict +- No way to drill into individual pairs +- No action buttons to resolve contradictions +- Component is read-only information dump + +**UX:** Dead end — user sees "5 contradictions found" but cannot do anything. + +**Expected:** Should render list of contradiction pairs similar to overlap pairs, with diff/comparison and merge/resolve actions. Currently not implemented. + +--- + +### 4. Content Quality Tab +**Check Name:** `content_quality` +**Data Source:** `report.checks['content_quality']?.data?.issues?.no_source?.atoms[]` → `string[]` (atom IDs only) + +**What It Renders:** +- Count of unsourced atoms +- Atom IDs in monospace font +- No other context + +**⚠️ ISSUES:** + +1. **Shows raw IDs instead of titles:** +```typescript +{noSourceAtoms.map(id => ( +
+ {id} // Just ID! +
+))} +``` + +2. **No way to navigate to atom:** ID displayed but no link/button to open editor. + +3. **No fetch to get titles:** Unlike boilerplate section, doesn't attempt to fetch atom titles. + +4. **No action:** Cannot edit from here. User must: + - Copy ID manually + - Navigate to atoms panel + - Search for ID + - Open editor + - Add source + +5. **Missing other quality issues:** Only handles `no_source`. Ignores `very_short_atoms`, `very_long_atoms`, `no_heading_atoms` if present. + +**Data Completeness:** ❌ Very Poor — backend provides only IDs; no titles, no access path. + +--- + +### 5. Tag Health Tab +**Check Name:** `tag_health` +**Data Source:** `report.checks['tag_health']?.data?.{ rootless_tags: number, similar_name_pairs: number }` + +**What It Renders:** +- Count of rootless tags (top-level, no parent) +- Count of similar-name pairs (potential duplicates) +- Explanation text +- Note: "Tag IDs not surfaced — navigate tree to find and fix" + +**⚠️ ISSUES:** + +1. **No actionable list:** Shows counts but not the actual tags. + +2. **Impossible to find tags:** User told to "navigate tree" but: + - With 1000+ tags, finding 15 rootless ones manually is tedious + - No way to filter/highlight rootless tags in tree + - No bulk actions to nest them + +3. **Similar-name pairs completely hidden:** User told duplicates exist but cannot see which. + +4. **No actionable state:** Component read-only summary; no merge/nest buttons. + +**Expected:** List of rootless tag names with quick-nest buttons; list of similar pairs with merge buttons. + +--- + +## Modal-Level Issues + +### Tab Pre-selection (checkName Prop) +**Line 432:** +```typescript +const [selectedTab, setSelectedTab] = useState(checkName ?? null); +const activeTab = tabs.find(t => t.key === selectedTab)?.key ?? tabs[0]?.key ?? null; +``` + +**Flow:** +1. ✅ Parent passes `checkName` (e.g., `'content_overlap'`) +2. ✅ State initialized to `checkName` or `null` +3. ✅ `activeTab` resolves to that tab if it exists in computed `tabs` array +4. ✅ Falls back to first available tab if `checkName` not in `tabs` + +**Potential Issue:** If user reviews a check with zero issues (e.g., `contradiction_detection` with `count === 0`), that check excluded from `tabs`. Pre-selection silently falls back to first tab. Behavior correct but could warn if requested tab unavailable. + +--- + +## Error & Loading States Matrix + +| Section | Loading | Error | Empty | Notes | +|---------|---------|-------|-------|-------| +| Overlap pairs | ✅ None | ✅ Displayed | ✅ Message | Per-pair states shown | +| Boilerplate | ✅ Spinner | ❌ No feedback | ❌ No message | Fallback to ID on fail | +| Contradiction | ❌ None | ❌ None | ✅ Message | Count-only, no detail | +| Content quality | ❌ None | ❌ None | ✅ Message | Raw IDs, no context | +| Tag health | ❌ None | ❌ None | ❌ None | No empty state | + +**Summary:** Overlap pairs solid; others bare-minimum or missing. + + +--- + +## Type Safety & Code Quality + +### Unsafe Casts +**Line 330 (BoilerplateSection):** +```typescript +const issues = data.issues as Record | undefined; +``` +Type-cast without narrowing. Works but fragile to schema changes. + +**Line 351 (ContentQualitySection):** +```typescript +const issues = data.issues as Record | undefined; +``` +Similar cast. Should validate or use Zod schema. + +### Missing TypeScript Validation +- `data: Record` passed to all sections — no schema validation +- Backend could return different shape and frontend silently fails +- No error boundary if shape is wrong + +### String Literal Keys +All tab keys are string literals scattered across code: +```typescript +'content_overlap' +'boilerplate_pollution' +'contradiction_detection' +'content_quality' +'tag_health' +``` + +Should be defined as constants/enums to avoid typos. + +--- + +## API Contract Observations + +### Endpoints Used +1. **`apply_health_item_fix`** + - Called by overlap pairs (merge/keep) + - **Bug:** Sends `check: 'duplicate_detection'` instead of `'content_overlap'` + +2. **`get_atom`** + - Called by boilerplate section to fetch titles + - Called by overlap expand to fetch full content + - No error handling beyond Promise.allSettled() + +3. **`retry_embedding`** + - Called by boilerplate section to re-queue atom + - Returns success/error; frontend shows "Queued" or "error" state + +### Data Consistency Issues +- Backend returns `pairs` array for overlap (processed) +- Backend returns `affected_atoms` ID array for boilerplate (frontend fetches rest) +- Backend returns only counts for contradiction, quality, health (frontend cannot drill down) + +**Pattern:** Inconsistent payload shapes suggest incomplete backend or mismatched frontend expectations. + +--- + +## Missing Functionality + +1. **Contradiction pairs inspection:** Backend has pairs data (health.rs), modal doesn't render them. + +2. **Tag navigation:** Tag health section tells user to "navigate tree" but no links/filters provided. + +3. **Bulk actions:** No way to resolve multiple items in batch (e.g., nest 5 rootless tags). + +4. **Action history:** No log of fixes applied, when, by whom. + +5. **Undo per-item:** Only modal-level undo (last batch); no undo individual actions in review session. + +6. **Direct atom access:** Content quality and tag health sections provide no way to open atoms/tags directly. + +--- + +## Test Coverage + +**Current:** Zero +**Test files found:** `src/lib/import-tags.test.ts`, `src/lib/import-apple-notes.test.ts` (data utilities only) + +**No tests for:** +- HealthWidget render/fetch/fix flow +- HealthReviewModal tab navigation and data extraction +- PairRow merge/keep action submission +- BoilerplateSection atom fetch and title extraction +- Error states, loading states, empty states +- API endpoint error handling +- Pre-selection logic for `checkName` prop + +**Test Framework:** Vitest (v3.2.4) configured but health components untested. + +--- + +## Recommendations + +### Critical (Fix Immediately) +1. **Fix check name bug (Line 467):** Change `check: 'duplicate_detection'` → `check: 'content_overlap'` + - Merge/keep actions currently fail silently or show wrong error + - One-line fix, high impact + +2. **Add unit tests:** Create `__tests__/HealthReviewModal.test.tsx` with: + - Tab navigation + - Data extraction from report structure + - Action submission (merge, keep, re-embed) + - Error state handling + - Empty state handling + +### High Priority (Before Release) +1. **Contradiction pairs:** Implement backend query for pairs and render as list with diff view or action buttons. + +2. **Content quality drill-down:** Fetch atom titles, show in list, add "Open atom" link/button to navigate to editor. + +3. **Tag health drill-down:** List rootless tags and similar pairs; add nest/merge buttons or tree navigation links. + +4. **Validate data schemas:** Use Zod to parse `report.checks[key].data` shape before rendering sections. Add fallback UI for schema mismatch. + +5. **Error boundaries:** Wrap each section in try-catch; show fallback UI if rendering fails. + +### Medium Priority +1. **Define check name constants:** Centralize `'content_overlap'`, `'boilerplate_pollution'`, etc. in a shared enum or config. + +2. **Per-atom error feedback:** In boilerplate section, show "fetch failed" indicator if `get_atom()` errors instead of silent fallback. + +3. **Improve title extraction:** Use regex or markdown parser; handle edge cases (lists, code, quotes). + +4. **Loading state in modal:** Show spinner during action submission; disable buttons while inflight. + +5. **Undo granularity:** Track individual action history; offer undo per-item or per-section, not just batch. + +6. **Toast notifications:** Show action result (success/error) in toast instead of relying on onResolved() refresh. + +### Low Priority +1. **Bulk actions UI:** Multi-select + batch nest/merge with preview. + +2. **Action audit log:** Log with timestamps, reversible operations, user attribution. + +3. **Tag tree integration:** Link rootless tags to tree panel with filter/highlight. + +4. **Keyboard shortcuts:** Arrow keys to navigate pairs, Enter to apply action, etc. + +5. **Export per-section:** Download individual review section as CSV/JSON for offline processing. + +--- + +## Code Locations Summary + +| Issue | File | Line(s) | Fix | +|-------|------|---------|-----| +| Check name bug | HealthReviewModal.tsx | 467 | `check: 'content_overlap'` | +| Unsafe cast | HealthReviewModal.tsx | 330 | Validate with Zod | +| Unsafe cast | HealthReviewModal.tsx | 351 | Validate with Zod | +| No empty state | HealthReviewModal.tsx | 246–254 | Add message when length=0 | +| Brittle title extract | HealthReviewModal.tsx | 223 | Use markdown parser | +| No contradiction pairs | HealthReviewModal.tsx | 318–337 | Implement pair list render | +| No quality drill-down | HealthReviewModal.tsx | 341–373 | Fetch titles, add links | +| No tag drill-down | HealthReviewModal.tsx | 377–404 | List tags, add actions | +| No tests | (new file) | — | Create `__tests__/HealthReviewModal.test.tsx` | + +--- + +## Conclusion + +**Severity:** High — one critical bug prevents merge actions from working; four tabs lack drill-down/action capability; zero test coverage. + +**Effort to Fix:** +- Critical bug: 1 line +- Tests: 1–2 days (moderate complexity, async/modal/data flow) +- Drill-down features: 2–3 days per tab (fetching, rendering, validation) +- Total: 1 week to make production-ready + +**Risk:** Currently deployed health review modal is partially non-functional (merge fails). Recommend fix and test before release. diff --git a/docs/plans/2026-05-01-health-dashboard-ui-improvements/REVIEW.md b/docs/plans/2026-05-01-health-dashboard-ui-improvements/REVIEW.md new file mode 100644 index 00000000..5dfe6231 --- /dev/null +++ b/docs/plans/2026-05-01-health-dashboard-ui-improvements/REVIEW.md @@ -0,0 +1,239 @@ +# Plan Review — Knowledge Health Dashboard UI Improvements + +**Source plan:** [plan.md](./plan.md) +**Reviewed:** 2026-05-01 +**Reviewer:** Plan Review Command (plan-review/reviewer, claude-opus-4-7) +**Overall Assessment:** ✅ Approved — all critical/major findings applied to plan.md + +--- + +## Executive Summary + +The plan correctly diagnoses the missing OpenAPI surface and the four-phase structure matches actual codebase state. Phase 0 and Phase 1 backend/frontend breakdowns are well-grounded. However, five Critical/Major accuracy defects must be resolved before execution: the plan prescribes unconditional `ToSchema` derives that will break atomic-core's feature-flag pattern, uses wrong `HealthStatus` enum variants, silently introduces a contract-breaking type change to `HealthReport`, references a non-existent `AtomicCoreError` variant, and leaves the single-check dispatch matrix dangerously incomplete. + +--- + +## 1. Executive Summary + +| | | +|---|---| +| **Strengths** | Phase 0 rationale correct — health routes genuinely absent from ApiDoc (verified). All 7 handler names and route paths accurate. Backend additions (previous_score, compute_single_check) are legitimate gaps. Phased delivery produces usable value after each phase. | +| **Critical issues** | 5 (see Section 2) | +| **Major issues** | 8 (see Section 3) | +| **Minor issues** | 6 (see Section 4) | +| **LOE** | 100h understated; revised estimate 108–112h | + +--- + +## 2. Critical Issues + +### C1 — Phase 0 `ToSchema` derive pattern breaks atomic-core feature flag +**Dimension:** Accuracy +**Severity:** 🔴 Critical +**Location:** Phase 0 §0.2 — ToSchema derive block + +**Finding:** Plan instructs adding `#[derive(..., ToSchema)]` directly (unconditionally) to all health structs in `atomic-core`. But `atomic-core` already guards every `ToSchema` derive behind `#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]` — utoipa is an *optional* dep behind `[features] openapi = ["utoipa"]` in `crates/atomic-core/Cargo.toml`. An unconditional derive will fail to compile whenever `openapi` feature is off (e.g., in any crate that depends on `atomic-core` without the feature). + +**Evidence:** `crates/atomic-core/Cargo.toml` — `utoipa = { version = "5", features = ["preserve_order"], optional = true }` + `[features] openapi = ["utoipa"]`; `crates/atomic-core/src/models.rs` uses `cfg_attr` throughout. + +**Recommendation:** Replace all `ToSchema` derives in Phase 0 with `#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]`. Do NOT add utoipa as an unconditional dep. The same pattern must apply to `audit.rs` types. + +--- + +### C2 — Wrong `HealthStatus` enum variants in Phase 0 code snippet +**Dimension:** Accuracy +**Severity:** 🔴 Critical +**Location:** Phase 0 §0.2 — HealthStatus snippet + +**Finding:** Plan shows `pub enum HealthStatus { Ok, Warning, Critical }`. The real enum has four variants: `Healthy, NeedsAttention, Degraded, Unhealthy` (with snake_case serde rename). Copy-pasting the plan's snippet would silently change the variant set, break the `score → status` mapping, and corrupt `overall_status` strings used by the frontend. + +**Evidence:** `crates/atomic-core/src/health/mod.rs` L45–71. + +**Recommendation:** Fix the code snippet to `{ Healthy, NeedsAttention, Degraded, Unhealthy }` with existing serde renames. Explicitly state "only derive is being added — no variant changes." + +--- + +### C3 — Phase 2 `HealthReport` change silently breaks the `overall_status` contract +**Dimension:** Consistency +**Severity:** 🔴 Critical +**Location:** Phase 2 §2.1 HealthReport code block + +**Finding:** The Phase 2 struct snippet changes `overall_status: String` to `overall_status: HealthStatus` as a side effect of adding `previous_score`. This is an unannounced breaking change — frontend TypeScript, stored `report_json` rows in the `health_reports` table, iOS/Android typed bindings, and all MCP consumers will break. No migration plan is mentioned. + +**Evidence:** `crates/atomic-core/src/health/mod.rs` L90 — `overall_status: String`, populated via `.as_str().to_string()` at L258. + +**Recommendation:** Either (a) keep `overall_status: String` and only add `previous_score: Option` in Phase 2, or (b) make the type change an explicit, planned step with a decision-log entry, frontend TS update, stored-JSON migration, and API version bump. + +--- + +### C4 — `compute_single_check` dispatch matrix dangerously incomplete +**Dimension:** Completeness +**Severity:** 🔴 Critical +**Location:** Phase 1 §1.1 compute_single_check match arm + +**Finding:** The code snippet handles only `embedding_coverage` and `tagging_coverage` with `// ...etc`. The real check list is **11 names** (`content_overlap`, `embedding_coverage`, `tagging_coverage`, `source_uniqueness`, `wiki_coverage`, `semantic_graph_freshness`, `content_quality`, `orphan_tags`, `tag_health`, `contradiction_detection`, `boilerplate_pollution`) plus the async-only `broken_internal_links` — which cannot be dispatched via a sync `checks::X(&raw)` call because it needs `compute_link_check(core).await` with multiple async DB lookups. The `// ...etc` placeholder hides this complexity. + +**Evidence:** `crates/atomic-core/src/health/checks.rs` L12–418 (11 sync checks); `compute_link_check` in `mod.rs` L315 (async, per-atom). + +**Recommendation:** Enumerate all 11 names explicitly in the plan. Special-case `broken_internal_links` to call `compute_link_check(core).await`. Mark `contradiction_detection` as stub if not yet implemented. Verify CHECK_ORDER in HealthWidget.tsx covers all 11 names. + +--- + +### C5 — Non-existent `AtomicCoreError::InvalidInput` variant +**Dimension:** Accuracy +**Severity:** 🔴 Critical +**Location:** Phase 1 §1.1 compute_single_check error return + +**Finding:** Plan uses `AtomicCoreError::InvalidInput(...)`. This variant does not exist. The real variants are: `Database, Provider, Configuration, NotFound, Validation, Io, Json, Lock, Conflict, Embedding, Search, Wiki, Clustering, Compaction, Ingestion, DatabaseOperation`. + +**Evidence:** `crates/atomic-core/src/error.rs`. + +**Recommendation:** Replace with `AtomicCoreError::Validation(format!("Unknown health check: {}", check_name))`. + +--- + +## 3. Major Issues + +### M1 — `health_check_data_sync` called as free function; it's a storage method +**Dimension:** Clarity / Accuracy +**Location:** Phase 1 §1.1 + +Plan calls `health_check_data_sync(core).await?`. It is actually a method on the storage trait: `core.storage().health_check_data_sync().await`. Update the snippet. + +--- + +### M2 — `undoStack: FixResponse[]` is incoherent +**Dimension:** Accuracy / Consistency +**Location:** Phase 3 §3.2 + +The undo endpoint requires a single `fix_id`. `FixResponse` contains `actions_taken: Vec`, `skipped`, `new_score` — no `fix_id`. If a batch fix produces N actions, it's unclear which id to pop. Options: (a) `undoStack: FixAction[]` — pop last action id; or (b) `undoStack: { fix_id: string; label: string }[]` keyed from `HealthFixLog.id` returned after `log_fix`. Decide and document. + +--- + +### M3 — URL parameter name inconsistency +**Dimension:** Consistency +**Location:** Phase 0 table (col "Path"), Phase 1 §1.1 handler comment + +Table shows `{name}`; handler comment shows `{check_name}`; routes/mod.rs registration not shown in §0.3. Pick one name consistently across handler signature, route config, and ApiDoc annotation. + +--- + +### M4 — `HealthReviewModal` prop signature mismatch +**Dimension:** Consistency / Completeness +**Location:** Phase 1 §1.3; HealthWidget refactor snippet + +HealthWidget snippet passes `reportCheck={report.checks[showReviewModal]}` and `checkName={showReviewModal}` to `HealthReviewModal`, but the existing modal takes `{ report, onClose, onResolved }` — not a single `reportCheck` + `checkName`. Phase 1 §1.3 says only "accept checkName prop to pre-select tab" without documenting modal tab structure or the full new interface. Define the complete new props interface. + +--- + +### M5 — Severity badge thresholds conflict with existing HealthStatus scale +**Dimension:** Consistency** +**Location:** Plan L93 (Executive Summary), Plan L566–570 (`getSeverityBadge`), Design Principles + +New severity badges use `0–40 🔴 / 41–70 🟠 / 71–85 🟡 / 86–100 🟢`. Existing `HealthStatus::from_score` mapping uses `<50 Unhealthy / 50–69 Degraded / 70–89 NeedsAttention / ≥90 Healthy`. Two coexisting classification scales will confuse users — a score of 72 would show a 🟡 badge but green "Healthy" status text. Reconcile or explicitly document the divergence as intentional UX design. + +--- + +### M6 — `CHECK_ORDER` coverage not verified +**Dimension:** Completeness +**Location:** Throughout plan + +Plan extensively references `CHECK_ORDER`, `CHECK_LABELS`, and `CHECK_DESCRIPTIONS` constants but doesn't audit whether they currently cover all 11 real check names. If `boilerplate_pollution`, `broken_internal_links`, or `contradiction_detection` are absent from `CHECK_ORDER`, those checks will never render in the UI regardless of the backend work. + +**Action:** Read `HealthWidget.tsx` L160–172 and verify or extend the constant. + +--- + +### M7 — Markdown export via `` tag blocked in Tauri +**Dimension:** Completeness / Risk** +**Location:** Phase 3 §3.3 + +Plan shows `` download. In production Tauri builds, `data:` blob downloads may be blocked by CSP or require `@tauri-apps/plugin-fs` / `plugin-dialog`. No Tauri-specific download path documented. + +**Recommendation:** Add a conditional: web uses ``, Tauri uses `window.__TAURI__.dialog.save()` + `fs.writeTextFile()`. + +--- + +### M8 — Phase 2 hardcoded `Last: 2h ago` contradicts backend plan +**Dimension:** Consistency +**Location:** Phase 2 §2.3 HealthCheckRow snippet + +Row snippet shows hardcoded string `Last: 2h ago`. Phase 2 §2.1 claims the backend will store a `last_run` timestamp per check. Either compute the relative timestamp from real data or explicitly mark the hardcoded version as a Phase 2 placeholder to replace in Phase 3. + +--- + +## 4. Minor Issues + +| ID | Dimension | Location | Finding | +|----|-----------|----------|---------| +| m1 | Accuracy | Phase 2 §2.2, L557 | `getTrend(..., previousScore?: u32)` — `u32` is a Rust type, not valid TypeScript. Should be `number`. | +| m2 | Accuracy | Phase 0 Decision Log | "crate already transitively pulls utoipa via atomic-server" is reversed — atomic-server pulls atomic-core *with* `features = ["openapi"]`, activating utoipa inside atomic-core. The dep direction matters for feature wiring. | +| m3 | Realism | Phase 0, LOE table | Header says "~100–110 hours"; table and summary say exactly 100h; Phase 0 body says "8–10 hours" but table says 10h. Tighten to a single range. | +| m4 | Accuracy | Phase 0 §0.2 | Claim "utoipa only contributes schema metadata at compile time" — over-stated. utoipa generates Schema impls that are evaluated at spec-build time (binary runtime). Trivial cost, but not purely compile-time. | +| m5 | Completeness | Phase 0 §0.4 | `jq` expected paths list in verification step should note `compute_single_check` path only appears after Phase 1 ships, not at Phase 0 merge. | +| m6 | Consistency | Phase 3 §3.2 | Toast timeout: summary says "10s"; Testing §3 says "Undo button available for 10s"; no explicit `setTimeout` cleanup or cancellation on user interaction documented. | + +--- + +## 5. Gaps and Missing Considerations + +1. **No `cfg_attr` pattern documented for Phase 0** — all health type ToSchema derives must match the existing `models.rs` convention. +2. **`broken_internal_links` async path** in `compute_single_check` not addressed. +3. **Frontend `HealthReport` TS interface** update for `previous_score` (local interface in `HealthWidget.tsx` — not in a shared types file). +4. **`StoredHealthReport` and `HealthFixLog` ToSchema** also need `cfg_attr` treatment — not called out separately. +5. **SQLite migration story** if `HealthReport` JSON shape changes in stored `health_reports.report_json` rows. +6. **Command-map.ts `health_check_single` entry** — needs the full HTTP spec (method, path, bodyTransform) consistent with other entries, but no example provided. +7. **`AtomicCore` vs `Database` receiver** — plan's `compute_single_check(core: &AtomicCore)` matches existing pattern; confirm `db: Db` extractor in route handler unwraps correctly (other handlers use `db.0`). + +--- + +## 6. Risk Assessment + +| Risk | Likelihood | Impact | Mitigation | +|------|-----------|--------|-----------| +| Unconditional utoipa dep breaks non-openapi builds | High (if plan followed literally) | High — CI fails | Fix C1 before any Phase 0 work begins | +| `HealthStatus` variant swap silently corrupts data | High | High — wrong status for all existing atoms | Fix C2 immediately | +| `overall_status` type change breaks iOS/MCP clients | Medium (if plan followed literally) | High | Fix C3: keep as String in Phase 2 | +| `// ...etc` placeholder → missing checks at runtime | Certain | Medium — some rows never run | Fix C4: enumerate all 11 | +| `InvalidInput` variant → compile error | Certain | Low (caught by `cargo check`) | Fix C5 | +| Performance: history fetch on every refresh | Medium | Low (addressed in Risks §) | Plan already notes lazy-load mitigation — adequate | +| Tauri export blocked by CSP | High | Low (feature, not data loss) | Add plugin-fs path (M7) | + +--- + +## 7. LOE Assessment + +Original 100h is **optimistic**. Revised estimate: + +| Phase | Plan | Revised | Notes | +|-------|------|---------|-------| +| 0 | 10h | 10–11h | cfg_attr pattern slightly more careful than unconditional | +| 1 | 35h | 38–40h | Full 11-check dispatch + async branch + modal signature fix | +| 2 | 30h | 32–34h | First-run NULL handling, potential JSON migration, TS interface update | +| 3 | 25h | 27–28h | Toast infra setup, Tauri CSP, a11y live region | +| **Total** | **100h** | **107–113h** | | + +Phase 0 and Phase 1 frontend work can run in parallel only after C1's `cfg_attr` pattern is settled. + +--- + +## 8. Action Items (Priority Order) + +| Priority | Action | Rationale | +|----------|--------|-----------| +| 🔴 1 | Rewrite Phase 0 §0.2 to use `cfg_attr(feature = "openapi", derive(utoipa::ToSchema))` — do NOT add utoipa unconditionally | Matches atomic-core convention; prevents broken non-openapi builds | +| 🔴 1 | Fix `HealthStatus` variant list to `Healthy/NeedsAttention/Degraded/Unhealthy` in Phase 0 snippet | Wrong variants break score-to-status mapping | +| 🔴 1 | Keep `overall_status: String` in Phase 2; add only `previous_score: Option` | Avoids unannounced contract break | +| 🔴 1 | Enumerate all 11 check names + async branch for `broken_internal_links` in `compute_single_check` | `// ...etc` hides the real dispatch matrix | +| 🔴 1 | Replace `AtomicCoreError::InvalidInput` with `AtomicCoreError::Validation` | Non-existent variant — compile error | +| 🟠 2 | Normalize URL param: `{check_name}` everywhere | Prevents routing mismatch | +| 🟠 2 | Define `undoStack` semantics: `FixAction[]` keyed by `fix_id` | Current `FixResponse[]` doesn't surface a single `fix_id` | +| 🟠 2 | Reconcile severity badge thresholds with `HealthStatus` scale | Two conflicting health scales confuse users | +| 🟠 2 | Update `HealthReviewModal` signature in plan with full new prop interface | Current plan doesn't match real modal props | +| 🟡 3 | Audit `CHECK_ORDER` covers all 11 checks including recent additions | Missing names = invisible UI rows | +| 🟡 3 | Add Tauri-specific markdown export path (plugin-fs/plugin-dialog) | `data:` blob download may be blocked by Tauri CSP | +| 🟡 3 | Revise LOE to 108–112h range | Accounts for async branch, toast infra, JSON migration | + +--- + +*Full reviewer session:* `/Users/brandonkiefer/.omp/agent/sessions/-projects-atomic/2026-05-01T15-06-56-660Z_1accb47a-6b128d88-23d84057-7f51.jsonl` diff --git a/docs/plans/2026-05-01-health-dashboard-ui-improvements/plan.md b/docs/plans/2026-05-01-health-dashboard-ui-improvements/plan.md new file mode 100644 index 00000000..00a2107a --- /dev/null +++ b/docs/plans/2026-05-01-health-dashboard-ui-improvements/plan.md @@ -0,0 +1,972 @@ +# Knowledge Health Dashboard — UI Improvements + +**Date:** 2026-05-01 +**Status:** Reviewed (2026-05-01) — see [REVIEW.md](./REVIEW.md) +**Project:** Atomic (desktop + web) +**Request:** Implement comprehensive UX enhancements to the health dashboard component, including per-item actions, sample review, filtering, trending, and improved affordances. + +--- + +## Executive Summary + +The current health dashboard (`HealthWidget.tsx`) displays a vertical list of check rows with scores, summary text, and a global "Apply N fixes" button. This plan adds: + +1. **Per-row expandability** with Run and Review buttons, individual fix toggles +2. **Sample review panels** showing 3–5 atoms that triggered each issue, with quick-actions (Fix/Dismiss/Open) +3. **Score trending** (↑↓→ indicators) and last-run timestamps +4. **Filtering & sorting** (severity, auto-fixable, recency) +5. **Severity badges** (🔴🟠🟡🟢) and status colors per check +6. **Improved action bar** with confirmation modals, undo stack, export +7. **Micro-interactions** (animated bars, toast notifications, keyboard shortcuts) +8. **OpenAPI spec coverage** for all `/api/health/*` endpoints (prerequisite — currently missing) + +**Scope:** Frontend (React/TypeScript) + Backend (Rust: utoipa annotations + ApiDoc registration). Health endpoints are currently missing from the generated OpenAPI spec and must be added before external clients (iOS, Android, MCP, SDK consumers) can use them. + +**Effort:** 4 phases, ~100–110 hours of development (Phase 0 adds 8–10 hours for spec coverage). + +--- + +## Current Architecture & Evidence + +### HealthWidget.tsx (src/components/dashboard/widgets/) +- **Current structure:** + - Single component with hardcoded `CHECK_ORDER` (L160–L172) + - Per-row display: icon, label, score bar, description (L288–L332) + - Global "Apply N fixes" button with expandable checklist (L353–L383) + - Review modal dispatch (`HealthReviewModal`, L406–L410) + - No per-row timestamps, trending, or individual fix toggles + +- **Current state management:** + - `report`: Full `HealthReport` object with all checks + - `showPending`: Boolean toggle for "What will this do?" checklist + - `showReview`: Boolean for modal + - `lastFix`: Latest fix response result + - No per-check UI state (expanded, running, etc.) + +### HealthReviewModal.tsx +- Opens for one category at a time +- Shows pairs/samples and per-pair actions (Merge/Keep/Delete/Open) +- Uses `get_atom` API to fetch atom details +- No cross-category comparison or bulk operations + +### Backend API Surface (crates/atomic-server/src/routes/health.rs) +- `GET /api/health/knowledge` — Returns `HealthReport` with all checks + computed_at +- `POST /api/health/fix` — Takes `FixRequest { mode, include_medium, dry_run }` → returns `FixResponse` +- `POST /api/health/fix/{check}/{item_id}` — Manual per-item fix (merge/delete strategies) +- `POST /api/health/undo/{fix_id}` — Undo a fix from audit log +- `GET /api/health/history` — Recent stored reports (for trending) +- `GET /api/health/fixes/recent` — Recent fix log entries + +**Gap:** No endpoint for running a single check in isolation. Needed for per-row "Run" buttons. + +### Type Definitions (atomic-core/src/health/mod.rs) +- `HealthCheckResult`: `status`, `score`, `auto_fixable`, `requires_review`, `fix_action`, `data` +- `HealthReport`: `overall_score`, `overall_status`, `computed_at`, `checks: HashMap`, `auto_fixable`, `requires_review` +- `FixResponse`: `actions_taken: Vec`, `skipped`, `new_score` + +--- + +## Recommended Approach + +### Design Principles +1. **Preserve existing color scale** (green ≥90, yellow 70–89, orange 50–69, red <50) +2. **Dark theme (Obsidian-inspired):** `#1e1e1e` bg, `#7c3aed` purple accent +3. **Progressive disclosure:** Summary row → expandable for details → modal for complex decisions +4. **Idempotent actions:** All fixes are safely retryable +5. **Accessibility:** ARIA labels, focus states, keyboard navigation + +### Technical Strategy + +#### Phase 1: Foundation (Expandable rows, per-row state, Run/Review buttons) +- Refactor single `HealthWidget` into `HealthCheckRow` sub-component with local state +- Add `expandedChecks` Set to track which rows are open +- New endpoint: `POST /api/health/check/{check_name}` for isolated check runs +- Per-row buttons: Run (spinner), Review (lazy-load samples), individual fix toggle +- Local UI state: `lastRunTimes`, `checkTrends`, `checkSamples` + +#### Phase 2: Trends, Timestamps, Filtering (Score history, severity badges, sort/filter UI) +- Fetch historical reports from `GET /api/health/history` +- Compute score delta (current vs. previous) for trend indicator +- Add `last_run` timestamp to each check result (backend: store in report) +- Filter row above checks: Severity, Auto-fixable, Recency +- Sort options: By score (asc/desc), by affected count, alphabetical, auto-fixable first +- Severity badge logic: 🔴 (0–40), 🟠 (41–70), 🟡 (71–85), 🟢 (86–100) + +#### Phase 3: Advanced Affordances (Confirmation modals, undo stack, export, keyboard shortcuts) +- Confirmation modal before batch fixes (grouped by check, showing expected delta) +- Undo toast: "Undo" button + 10s timeout +- Export: Generate markdown report with all findings, citations, sample atoms +- Keyboard shortcuts: `r` (refresh), `1–7` (expand check), `f` (apply fixes), `?` (help) +- Animated score bars: CSS transition on mount/update +- Toast notifications: "✅ Fixed N items. Score 80 → 85" + +--- + +## Implementation Plan + +### Phase 0: OpenAPI Spec Coverage for Health Endpoints (Prerequisite, ~8–10 hours) + +**Why this is Phase 0:** The `export-openapi` binary and `utoipa` `ApiDoc` struct in `crates/atomic-server/src/lib.rs` drive spec generation for all external SDK consumers (iOS, Android, MCP bridge, third-party integrations). Every `/api/health/*` route is currently **missing from the spec** because: + +1. None of the handler functions in `crates/atomic-server/src/routes/health.rs` have `#[utoipa::path(...)]` attribute macros. +2. No health route paths are listed in the `#[openapi(paths(...))]` declaration in `crates/atomic-server/src/lib.rs` (lines 30–167). +3. Health-specific schema types (`HealthReport`, `HealthCheckResult`, `FixRequest`, `FixResponse`, `FixAction`, `SkippedFix`, `ManualFixRequest`, `HistoryQuery`, `StoredHealthReport`, `HealthFixLog`) are not in the `components(schemas(...))` list. + +**Evidence:** +- `crates/atomic-server/src/routes/health.rs` — handlers lack utoipa annotations (confirmed by reading the file; comments show routes but no `#[utoipa::path]` decorators) +- `crates/atomic-server/src/lib.rs:30–167` — paths list has no `routes::health::*` entries +- `crates/atomic-server/src/lib.rs:175–286` — components schemas list has no health types +- `crates/atomic-server/src/lib.rs:290–313` — tags list has no `health` entry + +**Impact of not fixing:** +- iOS/Android clients cannot generate typed bindings for health endpoints +- MCP tools for LLM agents cannot discover health operations +- External SDK consumers have no schema contract; they must reverse-engineer from handler code +- API reference docs (served at `/scalar`) omit the entire health surface + +#### 0.1 Annotate health route handlers with `#[utoipa::path(...)]` +**File:** `crates/atomic-server/src/routes/health.rs` + +Each handler needs a path macro. Example for `get_health_knowledge`: + +```rust +#[utoipa::path( + get, + path = "/api/health/knowledge", + tag = "health", + responses( + (status = 200, description = "Current health report", body = HealthReport), + (status = 500, description = "Internal server error", body = ApiErrorResponse), + ), + security(("bearer_auth" = [])), +)] +pub async fn get_health_knowledge(db: Db) -> HttpResponse { ... } +``` + +All seven handlers need annotation: + +| Handler | Method | Path | Notes | +|---------|--------|------|-------| +| `get_health_knowledge` | GET | `/api/health/knowledge` | Returns `HealthReport` | +| `run_health_fix` | POST | `/api/health/fix` | Body: `FixRequest`, returns `FixResponse` | +| `apply_manual_fix` | POST | `/api/health/fix/{check}/{item_id}` | Path params + `ManualFixRequest` body, returns `FixAction` or `{status: "no_op"}` | +| `undo_health_fix` | POST | `/api/health/undo/{fix_id}` | Path param, returns `{status, fix_id}` | +| `get_health_history` | GET | `/api/health/history` | Query: `limit`, returns `Vec` | +| `get_recent_fixes` | GET | `/api/health/fixes/recent` | Query: `limit`, returns `Vec` | +| `compute_single_check` (Phase 1 addition) | POST | `/api/health/check/{name}` | Path param, returns `(String, HealthCheckResult)` tuple | + +For path params and query params, add `params(...)` section to the macro. For the `{status: "no_op"}` literal-shape response, either define a typed `NoOpResponse` struct with `ToSchema` or use `body = Object` and document inline. + +#### 0.2 Add `ToSchema` derives to all health types +**File:** `crates/atomic-core/src/health/mod.rs` + +The struct definitions currently have `#[derive(Debug, Clone, Serialize, Deserialize)]`. Add `utoipa::ToSchema` using the feature-gated pattern already established in `atomic-core` (see `crates/atomic-core/src/models.rs`): + +```rust +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthCheckResult { ... } + +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthReport { ... } + +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FixAction { ... } + +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SkippedFix { ... } + +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FixResponse { ... } + +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FixRequest { ... } + +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub enum FixTier { Safe, Low, Medium, High } + +// Real variants — do NOT change them; only add the cfg_attr derive. +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub enum HealthStatus { Healthy, NeedsAttention, Degraded, Unhealthy } +``` + +**File:** `crates/atomic-core/src/health/audit.rs` +```rust +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthFixLog { ... } + +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StoredHealthReport { ... } +``` + +`atomic-core` already has `utoipa` as an *optional* dep behind `[features] openapi = ["utoipa"]` in `crates/atomic-core/Cargo.toml`. **Do not add utoipa unconditionally** — use `cfg_attr` throughout so non-openapi consumers compile cleanly. `atomic-server` activates the feature via `atomic-core = { features = ["openapi", ...] }` in its own Cargo.toml. + +#### 0.3 Register health paths and schemas in ApiDoc +**File:** `crates/atomic-server/src/lib.rs` + +In the `#[openapi(paths(...))]` block (around line 30), add: + +```rust +// Health +routes::health::get_health_knowledge, +routes::health::run_health_fix, +routes::health::apply_manual_fix, +routes::health::undo_health_fix, +routes::health::get_health_history, +routes::health::get_recent_fixes, +routes::health::compute_single_check, // added in Phase 1 +``` + +In the `components(schemas(...))` block (around line 175), add: + +```rust +// Health +atomic_core::health::HealthReport, +atomic_core::health::HealthCheckResult, +atomic_core::health::HealthStatus, +atomic_core::health::FixRequest, +atomic_core::health::FixResponse, +atomic_core::health::FixAction, +atomic_core::health::SkippedFix, +atomic_core::health::FixTier, +atomic_core::health::audit::StoredHealthReport, +atomic_core::health::audit::HealthFixLog, +routes::health::ManualFixRequest, +``` + +In the `tags(...)` block (around line 290), add: + +```rust +(name = "health", description = "Knowledge base health checks and auto-remediation"), +``` + +#### 0.4 Verify spec generation + +Regenerate the OpenAPI JSON and confirm health endpoints appear: + +```bash +cargo run --bin export-openapi -p atomic-server -- openapi.json + +# Verify health paths are present +jq '.paths | keys | map(select(startswith("/api/health")))' openapi.json +# Expected: ["/api/health/check/{name}", "/api/health/fix", "/api/health/fix/{check}/{item_id}", +# "/api/health/fixes/recent", "/api/health/history", "/api/health/knowledge", +# "/api/health/undo/{fix_id}"] + +# Verify health schemas are registered +jq '.components.schemas | keys | map(select(startswith("Health") or startswith("Fix") or . == "ManualFixRequest"))' openapi.json +``` + +#### 0.5 Verify downstream consumers + +- Hit `/scalar` in dev mode — confirm health section renders with all 7 endpoints, each with request/response schemas +- Rebuild iOS/Android typed client bindings (if automated via codegen) and verify no compile errors +- MCP bridge: check that `atomic-mcp` discovers health tools if it reflects on the OpenAPI surface + +**Effort estimate:** 8–10 hours +- 2h — ToSchema derives on core types + Cargo.toml utoipa dep +- 3h — `#[utoipa::path]` annotations on all 6 existing handlers (plus the Phase 1 handler) +- 1h — `ApiDoc` registration in lib.rs +- 1h — spec regeneration, jq verification, `/scalar` smoke test +- 1–2h — fixing any `ToSchema` derivation issues (e.g., `HashMap` may need explicit schema hint; `DateTime` needs a format attribute) + +--- + +### Phase 1: Expandable Rows & Per-Check Actions (Week 1, ~35 hours) + +#### 1.1 Backend: New single-check compute endpoint +**File:** `crates/atomic-server/src/routes/health.rs` + +```rust +// POST /api/health/check/{check_name} +pub async fn compute_single_check( + db: Db, + path: web::Path, +) -> HttpResponse { + let check_name = path.into_inner(); + // Call atomic-core with just this check + match health::compute_single_check(&db.0, &check_name).await { + Ok(result) => HttpResponse::Ok().json(result), + Err(e) => crate::error::error_response(e), + } +} +``` + +**File:** `crates/atomic-core/src/health/mod.rs` + +```rust +/// Compute a single health check by name. +pub async fn compute_single_check( + core: &AtomicCore, + check_name: &str, +) -> Result<(String, HealthCheckResult), AtomicCoreError> { + let result = match check_name { + // Sync checks — fetch raw data once, dispatch to the appropriate fn + "embedding_coverage" + | "tagging_coverage" + | "content_overlap" + | "source_uniqueness" + | "wiki_coverage" + | "semantic_graph_freshness" + | "content_quality" + | "orphan_tags" + | "tag_health" + | "contradiction_detection" + | "boilerplate_pollution" => { + let raw = core.storage().health_check_data_sync().await?; + match check_name { + "embedding_coverage" => checks::embedding_coverage(&raw), + "tagging_coverage" => checks::tagging_coverage(&raw), + "content_overlap" => checks::content_overlap(&raw), + "source_uniqueness" => checks::source_uniqueness(&raw), + "wiki_coverage" => checks::wiki_coverage(&raw), + "semantic_graph_freshness" => checks::semantic_graph_freshness(&raw), + "content_quality" => checks::content_quality(&raw), + "orphan_tags" => checks::orphan_tags(&raw), + "tag_health" => checks::tag_health(&raw), + "contradiction_detection" => checks::contradiction_detection(&raw), + "boilerplate_pollution" => checks::boilerplate_pollution(&raw), + _ => unreachable!(), + } + } + // Async check — requires per-atom DB lookups + "broken_internal_links" => compute_link_check(core).await?, + _ => return Err(AtomicCoreError::Validation( + format!("Unknown health check: {}", check_name), + )), + }; + Ok((check_name.to_string(), result)) +} +``` + +**Backend routes registration:** Update `crates/atomic-server/src/routes/mod.rs` to add `POST /api/health/check/{check_name}` into the health scope (alongside the other health routes). + +#### 1.2 Frontend: Refactor to component-per-row +**File:** `src/components/dashboard/widgets/HealthCheckRow.tsx` (new) + +```typescript +interface HealthCheckRowProps { + checkName: string; + check: HealthCheckResult; + isExpanded: boolean; + onToggleExpand: (name: string) => void; + onRun: (name: string) => void; + onReview: (name: string) => void; + isRunning?: boolean; +} + +export function HealthCheckRow({ + checkName, + check, + isExpanded, + onToggleExpand, + onRun, + onReview, + isRunning, +}: HealthCheckRowProps) { + return ( +
+ {/* Header */} +
+ + + {/* Label & score */} +
+
+ + {CHECK_LABELS[checkName] ?? checkName} + + + {check.score} + +
+ +
+ + {/* Right-align buttons */} + + + {check.requires_review && ( + + )} + + +
+ + {/* Description */} + {!isExpanded && ( +

+ {CHECK_DESCRIPTIONS[checkName]?.(check.data)} +

+ )} + + {/* Expanded detail */} + {isExpanded && ( +
+

+ {CHECK_DESCRIPTIONS[checkName]?.(check.data)} +

+ + {check.auto_fixable && ( + + )} + + {check.requires_review && ( + + )} +
+ )} +
+ ); +} +``` + +**File:** `src/components/dashboard/widgets/HealthWidget.tsx` (refactored) + +```typescript +export function HealthPanel() { + const [report, setReport] = useState(null); + const [expandedChecks, setExpandedChecks] = useState>(new Set()); + const [runningCheck, setRunningCheck] = useState(null); + const [showReviewModal, setShowReviewModal] = useState(null); + // ... other state + + const runSingleCheck = useCallback(async (checkName: string) => { + setRunningCheck(checkName); + try { + const result = await getTransport().invoke<{ + name: string; + result: HealthCheckResult; + }>('health_check_single', { check_name: checkName }); + + // Update report with new check result + setReport((prev) => { + if (!prev) return prev; + return { + ...prev, + checks: { ...prev.checks, [checkName]: result.result }, + }; + }); + } catch (err) { + setError(err instanceof Error ? err.message : 'Check failed'); + } finally { + setRunningCheck(null); + } + }, []); + + const toggleExpandCheck = useCallback((checkName: string) => { + setExpandedChecks((prev) => { + const next = new Set(prev); + if (next.has(checkName)) next.delete(checkName); + else next.add(checkName); + return next; + }); + }, []); + + return ( +
+ {/* Header & score bar */} + {/* ... existing code ... */} + + {/* Per-check rows */} + {issueChecks.length > 0 ? ( +
+ {issueChecks.map((checkName) => { + const check = report.checks[checkName]; + if (!check) return null; + return ( + setShowReviewModal(name)} + isRunning={runningCheck === checkName} + /> + ); + })} +
+ ) : ( + /* healthy state */ + )} + + {showReviewModal && ( + setShowReviewModal(null)} + onResolved={fetchHealth} + /> + )} +
+ ); +} +``` + +**Command map:** Add `health_check_single` to `src/lib/transport/command-map.ts`. + +#### 1.3 Update HealthReviewModal for row-triggered opens +- Accept `checkName: string` as a new required prop alongside the existing `report`, `onClose`, and `onResolved` +- Use `checkName` to set initial `activeTab` state so the modal opens on the correct category +- Update the prop interface in `HealthReviewModal.tsx` to `{ report: HealthReport; checkName: string; onClose: () => void; onResolved: () => void }` + +**Effort estimate:** 35 hours (backend endpoint, TS types, component refactor, testing) + +--- + +### Phase 2: Trends, Filtering, Sorting (Week 2, ~30 hours) + +#### 2.1 Backend: Enhance HealthReport with metadata +**File:** `crates/atomic-core/src/health/mod.rs` + +```rust +#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthReport { + pub overall_score: u32, + pub overall_status: String, // Keep as String — "healthy" | "needs_attention" | "degraded" | "unhealthy" + pub computed_at: String, // ISO 8601 + pub atom_count: i32, + pub checks: HashMap, + pub auto_fixable: i32, + pub requires_review: i32, + pub previous_score: Option, // Added in Phase 2 for trending; None on first run +} + +**File:** `crates/atomic-core/src/storage/sqlite/health.rs` + +Enhance `get_latest_health_report_impl` to fetch previous score from history table. + +#### 2.2 Frontend: Add trend computation and filter UI +**File:** `src/components/dashboard/widgets/HealthWidget.tsx` + +```typescript +interface FilterState { + severity: 'all' | 'critical' | 'warning' | 'needs-attention' | 'healthy'; + autoFixable: 'all' | 'fixable' | 'manual-only'; + sort: 'score-asc' | 'score-desc' | 'alphabetical' | 'affected-count'; +} + +// Trend indicator helper +function getTrend(check: HealthCheckResult, previousScore?: number): '↑' | '↓' | '→' { + if (!previousScore) return '→'; + if (check.score > previousScore) return '↑'; + if (check.score < previousScore) return '↓'; + return '→'; +} + +// Severity badge +function getSeverityBadge(score: number): string { + if (score <= 40) return '🔴'; + if (score <= 70) return '🟠'; + if (score <= 85) return '🟡'; + return '🟢'; +} + +// Filtered and sorted checks +function getVisibleChecks( + report: HealthReport, + filter: FilterState +): string[] { + let visible = CHECK_ORDER.filter((k) => { + const check = report.checks[k]; + if (!check || check.status === 'ok') return false; + + // Severity filter + if (filter.severity !== 'all') { + const score = check.score; + const severity = + score <= 40 ? 'critical' : + score <= 70 ? 'warning' : + score <= 85 ? 'needs-attention' : 'healthy'; + if (severity !== filter.severity) return false; + } + + // Auto-fixable filter + if (filter.autoFixable === 'fixable' && !check.auto_fixable) return false; + if (filter.autoFixable === 'manual-only' && check.auto_fixable) return false; + + return true; + }); + + // Sorting + if (filter.sort === 'score-asc') { + visible.sort((a, b) => report.checks[a].score - report.checks[b].score); + } else if (filter.sort === 'score-desc') { + visible.sort((a, b) => report.checks[b].score - report.checks[a].score); + } else if (filter.sort === 'alphabetical') { + visible.sort((a, b) => CHECK_LABELS[a].localeCompare(CHECK_LABELS[b])); + } else if (filter.sort === 'affected-count') { + visible.sort((a, b) => { + const countA = extractCount(report.checks[a]); + const countB = extractCount(report.checks[b]); + return countB - countA; + }); + } + + return visible; +} +``` + +#### 2.3 HealthCheckRow enhancement: Timestamps and trends +```typescript +// Update row header to show trend & last-run time +
+ {getTrend(check, previousScore)} + Last: 2h ago +
+ +// Severity badge before icon +{getSeverityBadge(check.score)} +``` + +**Effort estimate:** 30 hours (backend report enrichment, filter logic, sorting, UI layout) + +--- + +### Phase 3: Advanced UX (Modals, Undo, Export, Keyboard Shortcuts, Animations) (Week 3, ~25 hours) + +#### 3.1 Confirmation modal for batch fixes +```typescript +interface FixConfirmationModalProps { + pending: { label: string; check: string }[]; + report: HealthReport; + onConfirm: (selectedChecks: Set) => void; + onCancel: () => void; +} + +// Shows grouped summary: +// "This will: retag 26 atoms, remove 9 duplicate clones, trim 20 long atoms" +// With per-fix checkbox +``` + +#### 3.2 Undo stack & toast +```typescript +// Undo stack entries: each holds the fix_id (from HealthFixLog) and a human label +const [undoStack, setUndoStack] = useState<{ fix_id: string; label: string }[]>([]); + +// After fix applied, push { fix_id, label } onto the stack and show toast +// fix_id comes from the HealthFixLog.id returned by log_fix (server includes it in FixResponse.fix_id) +// Toast auto-dismisses in 10s (clearTimeout on click); Undo calls POST /api/health/undo/{fix_id} + +#### 3.3 Export to markdown +```typescript +function exportHealthReport(report: HealthReport): string { + let md = `# Knowledge Base Health Report\n\n`; + md += `**Overall Score:** ${report.overall_score}/100\n`; + md += `**Generated:** ${new Date(report.computed_at).toLocaleString()}\n\n`; + + // Per-check section with data + for (const check of CHECK_ORDER) { + const result = report.checks[check]; + if (!result) continue; + md += `## ${CHECK_LABELS[check]}\n`; + md += `**Score:** ${result.score}/100\n`; + md += `**Status:** ${result.status}\n`; + md += `${CHECK_DESCRIPTIONS[check]?.(result.data)}\n\n`; + } + + return md; +} +``` + +#### 3.4 Keyboard shortcuts +- `r`: Refresh all checks +- `f`: Apply fixes (open confirmation modal) +- `1–9`: Expand nth check in filtered list +- `?`: Show help overlay + +#### 3.5 Animations +```css +/* Smooth score bar fill */ +.score-bar { + transition: width 600ms cubic-bezier(0.34, 1.56, 0.64, 1); /* ease-out */ +} + +/* Row expand/collapse */ +[data-expanded="true"] { + animation: slideDown 200ms ease-out; +} + +@keyframes slideDown { + from { + opacity: 0; + transform: translateY(-8px); + } + to { + opacity: 1; + transform: translateY(0); + } +} +``` + +**Effort estimate:** 25 hours (modals, UX polish, keyboard handling, animations) + +--- + +## Files & Components to Change + +### Backend +- `crates/atomic-server/src/routes/health.rs` — Add `compute_single_check` endpoint; annotate all handlers with `#[utoipa::path(...)]` (Phase 0) +- `crates/atomic-server/src/lib.rs` — Register health paths, schemas, and tag in `ApiDoc` (Phase 0) +- `crates/atomic-core/src/health/mod.rs` — `compute_single_check()` function + `ToSchema` derives on all health types (Phase 0) +- `crates/atomic-core/src/health/audit.rs` — `ToSchema` derives on `HealthFixLog`, `StoredHealthReport` (Phase 0) +- `crates/atomic-core/Cargo.toml` — Add `utoipa` dependency (Phase 0) +- `crates/atomic-core/src/storage/sqlite/health.rs` — Query previous report score for trending +- `crates/atomic-server/src/routes/mod.rs` — Route registration + +### Frontend +- `src/components/dashboard/widgets/HealthWidget.tsx` — Main refactor (expandable, filtering, actions) +- `src/components/dashboard/widgets/HealthCheckRow.tsx` — NEW (per-row component) +- `src/components/dashboard/widgets/HealthReviewModal.tsx` — Minor: accept checkName prop +- `src/components/dashboard/widgets/HealthConfirmModal.tsx` — NEW (batch fix confirmation) +- `src/components/dashboard/widgets/HealthExportModal.tsx` — NEW (markdown export) +- `src/lib/transport/command-map.ts` — Add `health_check_single` command +- `src/styles/animations.css` — NEW or extend (score bar animations) + +--- + +## Data Flow & Interfaces + +### Single-Check Compute Flow +``` +User clicks "Run" button on Tagging row + → onRun('tagging_coverage') + → POST /api/health/check/tagging_coverage + → compute_single_check(core, 'tagging_coverage') + → fetch raw data, run just tagging check + → return HealthCheckResult + → update report.checks['tagging_coverage'] + → row re-renders with new score, animate bar +``` + +### Batch Fix with Confirmation +``` +User clicks "Apply N fixes" + → open FixConfirmationModal + → show checklist of pending.map(fix_action) + → user can toggle individual fixes + → user clicks "Confirm" + → POST /api/health/fix { mode: 'auto', include_medium, dry_run } + → FixResponse + new_score + → update report + → show toast: "✅ Fixed 5 items. Score 80 → 85" + → Undo button available for 10s +``` + +### Trend Computation +``` +GET /api/health/knowledge + → HealthReport { overall_score, checks, computed_at, previous_score? } + → for each check, if previous_score exists: + delta = current - previous + trend = delta > 0 ? '↑' : delta < 0 ? '↓' : '→' + → display trend icon next to score +``` + +--- + +## Configuration & Deployment Notes + +### Environment +- No new env vars needed; all features toggle on frontend state +- Backend endpoint (`compute_single_check`) available on all deployments + +### Feature Flags +None required; all features are additive and don't conflict with existing UI. + +### Accessibility +- All buttons have aria-labels and keyboard focus states +- Modals use `dialog` ARIA role +- Color not the only indicator (use icons + text) +- Keyboard shortcuts documented in `?` overlay + +--- + +## Testing & Validation Plan + +### E2E Tests (Playwright) +1. **Per-row run button** + - Click Run on a single check + - Verify spinner appears + - Verify score updates when response arrives + - Verify can click Run multiple times without errors + +2. **Expandable rows** + - Click row header → expands and shows details + buttons + - Click again → collapses + - Expand state persists until user collapses + +3. **Batch fix confirmation** + - Click "Apply N fixes" → modal opens + - Each fix has unchecked checkbox + - User can toggle individual fixes + - Click Confirm → fixes run, new score displayed + - Toast shows "Fixed N items. Score X → Y" + - Click Undo → reverts (calls undo endpoint) + +4. **Filtering & sorting** + - Change severity filter → only matching checks displayed + - Change sort order → checks reorder + - Verify all checks still displayed when filter cleared + +5. **Sample review** + - Click "Review samples" on a failing check + - Modal opens showing 3–5 sample atoms + - Each sample has "Fix", "Dismiss", "Open atom" buttons + - Quick actions work as expected + +### Commands +```bash +# Run E2E tests +npm run playwright:test -- --grep "health.*ui" + +# Run unit tests for helper functions +npm run test -- HealthWidget + +# Manual testing flow +npm run dev:mobile:ios & +# or +make dev-desktop-fast + +# In app: +1. Navigate to dashboard +2. Open Health panel +3. Test each UI interaction as per E2E list above +``` + +### Verification +- All checks remain sortable/filterable after batch fix +- Score bars animate smoothly on update +- Keyboard shortcuts work (test with `r`, `f`, `?`) +- No console errors during interactions +- Modal accessibility tested with screen reader (NVDA/Voiceover) + +--- + +## Risks, Assumptions, and Open Questions + +### Risks +1. **Performance:** Fetching `HealthReport` + historical data on every refresh could be slow for large KBs. Mitigation: Memoize last report, fetch history lazily on first filter/trend request. + +2. **Undo semantics:** If user applies fixes, then runs a check that changes scores, what happens to undo? Fix: Undo button is only valid for 10s immediately after fix; once new data fetched, undo is stale. + +3. **Concurrent fixes:** User clicks "Run" on a single check while batch fix is running. Mitigation: Disable Run buttons while batch fix in progress. + +### Assumptions +1. Backend will have `POST /api/health/check/{check_name}` endpoint for single-check compute (requirement for Phase 1). +2. History API already exists and returns previous reports (assumed; verify with backend team). +3. Undo endpoint (`POST /api/health/undo/{fix_id}`) already works (implemented in Phase 1 of prior sprint). +4. HealthCheckResult data shape is stable; no breaking changes to check payloads. + +### Open Questions +1. **Sample review:** Currently `HealthReviewModal` shows pairs for overlaps/dupes. For other checks (e.g., untagged atoms, too-long atoms), how should samples be structured? Should we add a new `/api/health/samples/{check}` endpoint? + +2. **Export location:** Where should markdown export be saved? Recommendation: web uses `
` with a `data:text/markdown` URI; Tauri desktop uses `@tauri-apps/plugin-dialog` (`save()`) + `@tauri-apps/plugin-fs` (`writeTextFile()`) to avoid CSP-blocked blob downloads. Branch on `window.__TAURI__` at runtime. + +3. **Trend baseline:** Should we compare to the *previous run* or a *rolling average* over 7 days? Recommend previous run (simpler, clearer signal). + +4. **Mobile:** Filter/sort UI is complex on mobile. Should we hide advanced filters on small screens and show only "Severity" dropdown? Or move to slide-over panel? + +--- + +## LOE & Effort Estimate + +| Phase | Task | Hours | Notes | +|-------|------|-------|-------| +| 0 | Backend: `cfg_attr` ToSchema derives on health core types | 2 | Feature-gated pattern; ~10 structs/enums in mod.rs + audit.rs | +| 0 | Backend: `#[utoipa::path]` on health handlers | 3 | Six existing + one Phase 1 addition | +| 0 | Backend: Register paths/schemas/tags in ApiDoc | 1 | Edit `atomic-server/src/lib.rs` | +| 0 | Verify generated spec, regen clients | 2 | `export-openapi`, jq checks, `/scalar` smoke test | +| 0 | Fix ToSchema derivation edge cases | 2 | DateTime formats, HashMap schema hints | +| **Phase 0 total** | | **10** | Prerequisite — must ship before external clients use health APIs | +| 1 | Backend: single-check endpoint (all 11 + async link check) | 10 | Full dispatch matrix + `broken_internal_links` async path | +| 1 | Frontend: HealthCheckRow component | 12 | Component extraction, state per-row, buttons | +| 1 | Frontend: HealthWidget refactor | 10 | Migrate to per-row render, integrate new state | +| 1 | Integration testing, fixes | 6 | E2E tests for Run, Review, expand/collapse | +| **Phase 1 total** | | **38** | | +| 2 | Backend: report enrichment (prev score) | 7 | Query history, add field, handle first-run NULL, update TS interface | +| 2 | Frontend: filter/sort logic | 10 | Data structures, comparison functions | +| 2 | Frontend: filter UI, severity badges | 8 | Layout, state management | +| 2 | Integration testing | 7 | Filter combinations, sorting verified | +| **Phase 2 total** | | **32** | | +| 3 | FixConfirmationModal component | 6 | Modal boilerplate, checkbox logic | +| 3 | Undo stack & toast integration | 7 | Toast library setup, undo flow, aria-live, timeout cleanup | +| 3 | Export function + UI (web + Tauri paths) | 5 | Markdown generation, plugin-fs for desktop, data: for web | +| 3 | Keyboard shortcuts | 3 | Event listeners, help overlay | +| 3 | Animations & micro-interactions | 4 | CSS transitions, polish | +| 3 | E2E and polish | 3 | Final testing, edge cases | +| **Phase 3 total** | | **28** | | +| **Grand total** | | **108** | ~3–4 weeks at 30 hrs/week (Phase 0 can run in parallel with Phase 1 frontend work once `cfg_attr` pattern settled) | + +--- + +## Decision Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-05-01 | Phased delivery (foundation → trends → polish) | Allows early feedback and MVP deployment after Phase 1 | +| 2026-05-01 | Per-row Run button over modal | Faster iteration; avoid extra click depth for common operation | +| 2026-05-01 | Severity filter over custom query language | Simpler UX; covers 90% of user needs | +| 2026-05-01 | 10s undo timeout vs. infinite stack | Prevents confusion; aligns with user mental model (like Ctrl+Z in editors) | +| 2026-05-01 | Markdown export vs. JSON/CSV | Markdown is human-readable, shareable, LLM-friendly for context | +| 2026-05-01 | Add Phase 0 for OpenAPI spec coverage | Health endpoints are invisible to external SDK/MCP/iOS clients until registered in `ApiDoc`; Phase 0 unblocks all downstream consumers and is a prerequisite for Phase 1's new `compute_single_check` endpoint being usable outside the web UI | +| 2026-05-01 | Use `cfg_attr(feature = "openapi", derive(utoipa::ToSchema))` for health types | Matches existing atomic-core convention (models.rs); utoipa is already an optional dep behind `openapi` feature — unconditional derive breaks non-openapi consumers | + +--- + +## Summary + +This plan structures a high-UX dashboard enhancement into four phased releases: + +0. **Phase 0 (Prerequisite):** OpenAPI spec coverage for all `/api/health/*` endpoints — adds utoipa annotations, `ToSchema` derives, and `ApiDoc` registration so external SDK/MCP/mobile clients can consume health APIs. +1. **Phase 1 (MVP):** Expandable rows with Run/Review per-check, lays foundation for remaining features. +2. **Phase 2 (Insight):** Trending, filtering, sorting so users can prioritize high-impact fixes. +3. **Phase 3 (Polish):** Confirmations, undo, export, keyboard shortcuts, animations — delightful UX. + +**Recommendation:** Start Phase 0 immediately in parallel with Phase 1 frontend work — backend annotation work doesn't block React refactor. Settle the `cfg_attr` pattern (Phase 0 §0.2) before merging Phase 1 to avoid feature-flag conflicts. Phase 2 and 3 follow incrementally. + +Estimated **~108 hours total** across all four phases, with Phase 0 (~10h) deliverable within a day or two and unblocking all external API consumers. diff --git a/docs/plans/2026-05-01-health-review-queue-audit/audit.md b/docs/plans/2026-05-01-health-review-queue-audit/audit.md new file mode 100644 index 00000000..576b0ac4 --- /dev/null +++ b/docs/plans/2026-05-01-health-review-queue-audit/audit.md @@ -0,0 +1,249 @@ +# Deep Audit: Health Review Queue Backend + +## Executive Summary + +Audited **11 health checks** across three modules (`checks.rs`, `mod.rs`, `health.rs` storage). Found **4 checks with `requires_review: true`** that surface user-actionable data to the UI. Data sufficiency ranges from **rich (full atom details with similarity/source)** to **bare counts only (rootless tags)**. No critical bugs found; several UX gaps identified. + +--- + +## Checks with `requires_review: true` + +### 1. **`content_overlap`** — High-value data ✅ +| Field | Value | +|-------|-------| +| **Lines** | checks.rs:330–369 | +| **Status Sets Review** | When `overlaps > 0` (cross-source semantic duplicates) | +| **Condition** | Similarity 0.55–0.85, ≥2 shared tags, different source prefixes | +| **Data Shape** | `{ exact_duplicates, template_clones, cross_source_overlaps, count, pairs[] }` | +| **Pairs Structure** | Each pair includes: `pair_id`, `atom_a{id,title,source}`, `atom_b{id,title,source}`, `similarity`, `shared_tag_count`, `available_actions[]` | +| **Storage Query** | `health.rs:L341–371` — Joins `semantic_edges` → `atoms` (2x) → `atom_tags` (2x). Filters on similarity score and shared tags. Extracts title via `extract_title_preview()` (first ~100 chars until newline). | +| **UX Sufficiency** | ✅ **Excellent** — All needed data present: atom IDs, titles, source URLs, similarity %, shared tags, suggested actions. UI can display a pair list immediately. | +| **Data Quality** | ✅ Correct SQL joins. Title extraction may lose content if first paragraph is long. | +| **Gap** | None identified for core UX. | + +--- + +### 2. **`content_quality` → `no_source` sub-issue** — Bare IDs only ⚠️ +| Field | Value | +|-------|-------| +| **Lines** | checks.rs:253–305 | +| **Status Sets Review** | When `!raw.no_source_atoms.is_empty()` | +| **Condition** | Atoms with `null source_url` AND no HTTP(S) link AND no "Source:" text in content | +| **Data Shape** | `{ total, issues { no_source { count, auto_fixable: false, atoms: [id, ...] } } }` | +| **Storage Query** | `health.rs:L307–316` — Simple SELECT on atoms table: `WHERE source_url IS NULL AND content NOT LIKE '%http://%' AND NOT LIKE '%https://%' AND NOT LIKE '%Source:%'` LIMIT 20. Returns only atom ID. | +| **UX Sufficiency** | ⚠️ **Minimal** — Only IDs returned. UI must fetch full atoms (title, created date, preview) separately to display meaningful review list. | +| **Data Quality** | ✅ SQL correct, but incomplete. | +| **Gap** | **Should return**: atom ID + title + preview (first ~200 chars) + created_at + updated_at. This would let UI show context without additional round-trips. | + +--- + +### 3. **`boilerplate_pollution`** — Bare IDs only ⚠️ +| Field | Value | +|-------|-------| +| **Lines** | checks.rs:398–418 | +| **Status Sets Review** | When `count > 0` (atoms with ≥2 near-identical edges at similarity ≥0.99) | +| **Condition** | Semantic edges at similarity ≥0.99 grouped by source atom with count ≥2 | +| **Data Shape** | `{ count, affected_atoms: [id, ...], description: "..." }` | +| **Storage Query** | `health.rs:L360–366` — `SELECT source_atom_id FROM semantic_edges WHERE similarity_score >= 0.99 GROUP BY source_atom_id HAVING COUNT(*) >= 2 LIMIT 50`. Returns only atom IDs. | +| **UX Sufficiency** | ⚠️ **Minimal** — Only IDs. UI cannot show context. | +| **Data Quality** | ✅ SQL correct. | +| **Gap** | **Should return**: atom ID + title + count of near-duplicate edges. This allows UI to prioritize review (atoms with 5+ clones are more urgent than those with 2). | + +--- + +### 4. **`contradiction_detection`** — Counts only, no pair data ❌ +| Field | Value | +|-------|-------| +| **Lines** | checks.rs:371–387 | +| **Status Sets Review** | When `count > 0` (candidate contradictions found) | +| **Condition** | `contradiction_candidate_count > 0` — derived from semantic edges with similarity 0.75–0.92 | +| **Data Shape** | `{ pairs_checked, potential_contradictions }` — **NO pairs returned** | +| **Storage Query** | `health.rs:L395–398` — Two COUNT queries only: `SELECT COUNT(*) FROM semantic_edges WHERE similarity_score >= 0.75 AND similarity_score < 0.92`. Returns only counts, no pair details. | +| **UX Sufficiency** | ❌ **Unusable** — UI shows "Found 10 potential contradictions" but cannot display anything to review. User sees a warning with no actionable content. | +| **Data Quality** | ⚠️ **Incomplete by design**. Comment in code (checks.rs:375–376): "For now, surface the count as 'candidates' (no LLM check yet)" — implies pairs/details are intentionally deferred. | +| **Gap** | **Critical UX issue**: Either (a) disable `requires_review: true` until pair data is available, or (b) return the actual pairs (atom IDs, titles, snippets, similarity %) so users can manually review them. Current state shows a warning the user cannot act on. | + +--- + +### 5. **`tag_health` → `rootless_tags`** — Counts only, no IDs ❌ +| Field | Value | +|-------|-------| +| **Lines** | checks.rs:307–328 | +| **Status Sets Review** | When `rootless > 0` (tags with no parent) | +| **Condition** | `rootless_tags > 0` | +| **Data Shape** | `{ single_atom_tags, rootless_tags, similar_name_pairs }` — **NO tag details** | +| **Storage Query** | `health.rs:L331–335` — `SELECT COUNT(*) FROM tags WHERE parent_id IS NULL`. Returns only count. | +| **UX Sufficiency** | ⚠️ **Poor** — UI shows "2 rootless tags" but cannot identify which ones. User cannot act without drilling into the tag tree UI separately. | +| **Data Quality** | ✅ SQL correct. | +| **Gap** | **Should return**: count + `[(tag_id, tag_name, atom_count), ...]` list. This lets UI show a "Fix" action (move to parent category or promote to root manually). | + +--- + +## All Other Checks (not requiring review) + +| Check | Status | Why No Review Needed | +|-------|--------|---------------------| +| `embedding_coverage` | ❌ | Auto-fixable (retry pipeline). UI shows progress bars. | +| `tagging_coverage` | ❌ | Auto-fixable. Shows counts of pending/failed/untagged. | +| `source_uniqueness` | ❌ | Auto-fixable (merge exact duplicates). Pairs included. | +| `orphan_tags` | ❌ | Auto-fixable (delete). Full tag IDs + names included. | +| `semantic_graph_freshness` | ❌ | Auto-fixable (rebuild edges). Shows dates + count. | +| `wiki_coverage` | ❌ | Auto-fixable (generate/update). Gaps + stale list included. | +| `broken_internal_links` | ❌ | Auto-fixable (resolve). Only counts returned, no pairs. | + +--- + +## Async Check: `broken_internal_links` + +| Field | Value | +|-------|-------| +| **Lines** | mod.rs:393–493 | +| **Runs** | Via `compute_link_check()` in health flow | +| **Requires Review** | ❌ No — `requires_review: false` | +| **Logic** | Per-atom check: extracts markdown + wikilinks → resolves via source URL or wikilink name lookup. Returns broken count & affected atom count. | +| **Data Shape** | `{ broken_count: i32, affected_atoms: i32 }` — counts only | +| **UX Gap** | If `broken_count > 0`, UI shows warning but no atom IDs. Cannot identify which atoms have broken links without re-running the check per atom. | + +--- + +## Storage Queries: Summary + +### `HealthRawData` struct (~80 fields total) + +All queries live in `health.rs:L87–422` under `health_check_data_impl()`. Pattern: +1. **Counts & status groups** — Simple aggregations (embedding_status, tagging_status, etc.) +2. **Filtered lists** — Orphan tags, very-short/long atoms, boilerplate atoms (IDs only) +3. **Rich joins** — Content overlap (full pairs with titles), wiki coverage (tag names + atom counts) +4. **Pair construction** — DuplicatePair struct built in Rust loop (source_prefix, title extraction) + +### Data Returned by Reviewable Checks + +| Check | Data Type | Sufficiency | +|-------|-----------|-------------| +| content_overlap | Vec | ✅ Complete (ID, title, source, similarity, shared tags) | +| content_quality:no_source | Vec | ⚠️ IDs only, missing title/preview | +| boilerplate_pollution | Vec | ⚠️ IDs only, missing title/count of clones | +| contradiction_detection | i32 count | ❌ No pairs at all | +| tag_health:rootless | i32 count | ❌ No tag list at all | + +--- + +## Bugs Found + +### None critical. Minor observations: + +1. **`tag_health:rootless` logic** (checks.rs:L320) + - Query returns `COUNT(*) FROM tags WHERE parent_id IS NULL` + - This counts ALL tags with null parent, including the autotag category roots (Topics, People, Locations, etc.) + - May be intentional (those are "rootless" in tree structure), but unclear if UX wants to surface them as issues + - Recommend: Add comment clarifying whether autotag roots should be excluded + +2. **`contradiction_detection` semantic** (checks.rs:L375–376) + - Comment says "no LLM check yet", but the check still sets `requires_review: true` + - Means UI shows a warning the user cannot act on + - Should either: (a) return pair details now, or (b) set `requires_review: false` until LLM pair analysis is ready + +3. **Title extraction** (health.rs:L777–782) + - `extract_title_preview()` returns first line (up to \n), max ~100 chars + - If atom starts with a code block or long table, preview is useless + - Low impact, but UX could show "Preview" section more explicitly + +--- + +## Tests + +### Unit Tests +- **link_resolution.rs**: 13 tests (L405–487) + - Internal link extraction, wikilink parsing, vault root detection, link resolution logic + - Examples: `test_relative_href_resolves_to_vault_root`, `test_extract_markdown_links`, `test_absolute_links_ignored` + - **No tests for health checks themselves** (no fixtures for HealthRawData, no check validation tests) + +### Integration Tests +- **integration_tests.rs**: ~20 tests + - Full atom CRUD, tag hierarchy, pagination, wiki lifecycle, source tracking, settings, tokens, positions + - **No health check tests** — no callers of `compute_health()`, no scenario validation +- **pipeline_tests.rs**: ~15 tests + - Embedding/tagging pipelines, retries, model changes, delete cascades + - **No health check tests** +- **storage_tests.rs**: ~30 tests + - Atom, tag, chat, wiki storage operations + - **No health check tests** + +### Test Infrastructure + +| Component | Location | Status | +|-----------|----------|--------| +| **Mock AI Server** | `tests/support/mod.rs` | ✅ Provided (mock embeddings + chat) | +| **Test DB Setup** | `integration_tests.rs:L13–17` | ✅ TempDir-backed SQLite | +| **Event Collector** | `tests/support/mod.rs:L336–346` | ✅ Async channel-based | +| **Core Factory** | `tests/support/mod.rs:L255–302` | ✅ `setup_core(backend, mock_url)` | +| **Health Fixtures** | ❌ None | **Gap: No fixtures for seeding HealthRawData states** | + +--- + +## Recommendations + +### High Priority + +1. **`contradiction_detection`**: Either return pair details or set `requires_review: false` + - Rationale: Currently surfaces unprovable claim to user + - Effort: Medium (SQL for pairs + build DuplicatePair-like struct for contradictions) + +2. **`tag_health:rootless`**: Return tag list, not just count + - Rationale: Allows user to fix (merge to parent, or acknowledge as root category) + - Effort: Low (add 1 query, return Vec<(id, name, atom_count)>) + +3. **`content_quality:no_source`**: Return title + preview, not just ID + - Rationale: UI can show context without second round-trip + - Effort: Low (modify query to SELECT id, title preview, created_at) + +4. **`boilerplate_pollution`**: Return title + edge count per atom + - Rationale: Helps prioritize review (5+ clones > 2 clones) + - Effort: Medium (join atoms + count edges per source, aggregate) + +### Medium Priority + +5. **Add health check tests** + - Create fixtures for HealthRawData states (overlaps, contradictions, quality issues, tag anomalies) + - Validate score calculation, requires_review flags, data shape + - Effort: ~2–3 hrs for good coverage + +6. **Document tag_health rootless semantics** + - Is counting autotag roots correct? Add comment + test + - Effort: 30 min + +### Low Priority + +7. **Improve title extraction** + - Skip code blocks, tables; return full-paragraph preview + - Effort: Medium (markdown parsing) + - Impact: Minor (UX polish only) + +--- + +## Implementation Roadmap + +**Phase 1 (quick wins — 1–2 hrs)** +- Add tag list to `tag_health:rootless` (modify health.rs query, update checks.rs data shape) +- Add title + preview to `content_quality:no_source` (modify health.rs query) +- Document/clarify `tag_health` rootless scope + +**Phase 2 (medium — 2–3 hrs)** +- `contradiction_detection`: Decide scope (pair data now? or disable requires_review until LLM ready?) +- `boilerplate_pollution`: Add title + edge count aggregation + +**Phase 3 (quality — 2–3 hrs)** +- Add comprehensive health check test fixtures +- Validate data shapes against UI expectations +- Add regression tests for fix operations + +--- + +## Files Inspected + +✅ `crates/atomic-core/src/health/checks.rs` (418 lines) +✅ `crates/atomic-core/src/health/mod.rs` (659 lines) +✅ `crates/atomic-core/src/storage/sqlite/health.rs` (798 lines) +✅ `crates/atomic-core/src/health/link_resolution.rs` (511 lines) +✅ `crates/atomic-core/tests/integration_tests.rs` +✅ `crates/atomic-core/tests/support/mod.rs` diff --git a/docs/plans/2026-05-01-review-queue-v2/plan.md b/docs/plans/2026-05-01-review-queue-v2/plan.md new file mode 100644 index 00000000..6e6aedff --- /dev/null +++ b/docs/plans/2026-05-01-review-queue-v2/plan.md @@ -0,0 +1,452 @@ +# Knowledge Health Review Queue — UI Improvements (v2) + +**Date:** 2026-05-01 +**Status:** Planning +**Project:** Atomic +**Request:** Enhance the existing Review Queue modal with per-item inline actions, per-tab re-scan, richer resolution workflow (3-option Keep A/Keep B/Merge, source/recency badges, diff highlighting), batch selection, filtering/sorting, resolved counters, markdown export, and dashboard deep-linking. Preserve existing theme and layout. + +--- + +## Executive Summary + +The Review Queue modal (`HealthReviewModal.tsx`) renders 5 tabs over a single `HealthReport` snapshot. Today it has lightweight actions on two tabs (Content overlap: Merge/Keep both; Boilerplate: informational) and passive display on the other three (Contradictions, No source, Tag structure). The proposed v2 turns it into an interactive queue: every tab supports per-item actions, batch selection, filtering, and a persistent "dismissed/resolved" state. Several backend additions are required — dismissal storage, per-item source updates, per-check re-scan, LLM strip-boilerplate and merge-editor previews, and tag merge/move endpoints exposed for the modal. + +**Recommended phasing:** ship in 4 waves — (A) dismissals + inline actions that reuse existing endpoints, (B) per-tab re-scan + resolved counters + lazy loading, (C) resolution upgrades (3-option resolver, diff highlighting, source badges, merge editor), (D) batch operations + export + dashboard integration. + +--- + +## Current Architecture & Evidence + +### Modal structure — `src/components/dashboard/widgets/HealthReviewModal.tsx` (644 lines) + +- Single top-level `HealthReviewModal` component (L502–L643) takes a full `HealthReport` and a `checkName` pre-selector +- Tabs array built at L520–L526 — included conditionally based on which `checks[*]` has non-empty data +- `selectedTab` state at L528; `activeTab` defaults to first available tab (L529) +- `resolvedCount` state (L531) currently only increments on Content overlap `applyPairFix`; no persistence across sessions or tabs +- Escape key + body scroll lock at L533–L541; no other keyboard shortcuts +- `applyPairFix` callback at L543–L556 calls `apply_health_item_fix` with `check: 'duplicate_detection'`; `setResolvedCount(n => n + 1)` on success +- Tab bodies: `PairRow` (L68–L197), `BoilerplateSection` (L227–L267), `ContradictionRow`/`ContradictionSection` (L271–L376), `ContentQualitySection` (L380–L427), `TagHealthSection` (L431–L487) +- No batch selection, no filtering, no sorting controls, no re-scan, no export, no source/recency badges, no diff highlighting + +### Action endpoints — `crates/atomic-server/src/routes/health.rs` + +| Endpoint | What it does today | Relevance | +|---|---|---| +| `GET /api/health/knowledge` | Full report with all 5 review-data blobs | Used on modal open | +| `POST /api/health/fix` (`run_health_fix`) | Batch auto-fix across all checks | Used by the dashboard's big button, not by the modal | +| `POST /api/health/fix/{check}/{item_id}` (`apply_manual_fix`) | Per-item manual fix | Currently only handles `(duplicate_detection, merge_with_llm)` (L100–L125); **all other check+action pairs return 400** | +| `POST /api/health/undo/{fix_id}` | Undo a logged fix | Wired in the dashboard undo toast | +| `GET /api/health/history` / `GET /api/health/fixes/recent` | Historical reports and fix log | Not used by the modal | +| `POST /api/health/check/{check_name}` (`compute_single_check`) | Re-run one check in isolation | **Already exists** — can power per-tab re-scan | + +### Existing fix primitives we can reuse + +- `crates/atomic-core/src/health/llm_fixes.rs` + - `merge_duplicate_pair(core, atom_a, atom_b, dry_run)` — returns the merged content when `dry_run=true` (no writes), otherwise writes + logs (L79–L226). Already supports preview. + - `fix_untagged_complete_atoms(core, ids, dry_run)` — re-runs tagging pipeline +- `crates/atomic-core/src/storage/sqlite/tags.rs` + - `apply_tag_merges_impl(&[TagMerge { winner_name, loser_name, reason }])` — canonical tag merge path (L512–L532); also exposed on `AtomicCore::apply_tag_merges` (`lib.rs` L2134) + - `update_tag_impl(id, name, parent_id)` at L178–L219 — can reparent a tag (used for "Move under…") + - `delete_tag_impl(id, recursive)` at L394 — exists for orphan cleanup +- Atom updates: `update_atom` command (command-map.ts L74–L79) takes `{ content, source_url, published_at, tag_ids, ... }` — "Add source" inline can reuse this with the atom's existing content + +### Data shapes that back the queue + +All live inside the `HealthReport` blob computed on demand; per-atom pre-fetches happen in `PairRow.toggleExpand` (L96–L110) and `ContradictionRow.toggleExpand` (L276–L292) via `get_atom`. There is **no** persistent "dismissed" state — if you refresh, everything that was dismissed returns. + +### Dashboard integration — `src/components/dashboard/widgets/HealthWidget.tsx` + +- "Apply N automatic fixes" button at the bottom of the widget (excluded checks tracked in `excludedFromFix`, L370+). Already has tooltip/label infrastructure. +- `setShowReviewModal(checkName)` is called from two places — auto-pops to first `requires_review` check on the main "Review" button, and from the `HealthCheckRow` component per-row. Deep-link works already; the issue is post-resolution dashboard refresh. + +--- + +## Recommended Approach + +Split across 4 phases so UX improvements land incrementally and each wave is independently shippable: + +| Phase | Theme | Major deps | +|---|---|---| +| **A** | Dismissals + inline per-item actions | New DB table `health_dismissals`; extend `apply_manual_fix` | +| **B** | Per-tab re-scan, resolved counters, lazy content fetch | Reuse `compute_single_check`; new `checkUpdatedAt` state | +| **C** | 3-option resolver, source/recency badges, diff highlighting, merge-editor, contradiction summary | Extend `get_atom` cache; LLM-powered conflict summary; `diff-match-patch` dep | +| **D** | Batch selection, Strip boilerplate LLM pass, export, dashboard real-time sync | New `POST /api/health/strip-boilerplate`; frontend markdown export helper | + +### Why dismissals must be persistent + +Every feature (resolved counter, "Show deferred" toggle, "Mark intentional", batch dismiss, "Ignore pair") depends on somewhere to store *"this item should not appear until the underlying condition changes"*. Without it, every refresh re-surfaces everything, which defeats the queue metaphor. The cheapest fix is a new `health_dismissals` table keyed by `(check_name, item_key)` — see Phase A below for the schema. + +### Dependency footprint + +- Backend (Rust): 1 new migration, 1 new table, ~6 new endpoints/wrapper methods, 1 LLM prompt for "strip boilerplate", 1 LLM prompt for "contradiction summary" +- Frontend (TS): `diff-match-patch` (≈ 50KB, widely used, no peer deps). All other work uses existing primitives (Zustand store, Tailwind, lucide icons) + +--- + +## Implementation Plan + +### Phase A — Dismissals + Inline Per-Item Actions (~20h) + +#### A1. New `health_dismissals` table (migration V18) + +**File:** `crates/atomic-core/src/db.rs` + +```sql +CREATE TABLE IF NOT EXISTS health_dismissals ( + id TEXT PRIMARY KEY, + check_name TEXT NOT NULL, + item_key TEXT NOT NULL, -- e.g. atom_id, pair_id, tag_id, 'a_b' for pairs + reason TEXT NOT NULL, -- 'intentional_no_source', 'ignored_pair', 'deferred', 'resolved_other' + dismissed_at TEXT NOT NULL, + expires_at TEXT -- null = permanent until underlying data changes +); +CREATE UNIQUE INDEX idx_health_dismissals_lookup + ON health_dismissals(check_name, item_key); +``` + +Bump `LATEST_VERSION` to 18. Follow the V17 idempotent pattern if existing tests re-run the migration. + +#### A2. Storage methods and `AtomicCore` wrappers + +**File:** `crates/atomic-core/src/storage/sqlite/health.rs` + +```rust +pub(crate) fn list_dismissed_keys_impl(&self, check_name: &str) -> StorageResult>; +pub(crate) fn dismiss_health_item_impl(&self, check_name: &str, item_key: &str, reason: &str, expires_at: Option<&str>) -> StorageResult<()>; +pub(crate) fn undismiss_health_item_impl(&self, check_name: &str, item_key: &str) -> StorageResult<()>; +``` + +Wire through `StorageBackend` (async) and `AtomicCore`. + +#### A3. Filter dismissed items inside `compute_single_check` / `compute_health` + +Add one `SELECT` per reviewable check. Feed a `HashSet` of dismissed keys into the check function and exclude matches from `data.pairs` / `data.affected_atoms` / `data.issues.no_source.atoms` / `data.rootless_tag_list`. + +Keep item keys stable: +- `content_overlap` / `contradiction_detection`: `{atom_a_id}__{atom_b_id}` sorted by id lexicographically +- `content_quality` no_source: atom_id +- `boilerplate_pollution`: atom_id +- `tag_health` rootless: tag_id +- `tag_health` similar_name: `{winner_id}__{loser_id}` sorted + +#### A4. Extend `apply_manual_fix` with new (check, action) tuples + +**File:** `crates/atomic-server/src/routes/health.rs` (L93–L126) + +| check | action | Body | Behaviour | +|---|---|---|---| +| `content_overlap` | `keep_a` / `keep_b` | — | Delete loser atom; log undoable `before_state` | +| `content_overlap` | `dismiss` | — | Insert dismissal reason=`resolved_other` | +| `contradiction_detection` | `defer` | — | Dismissal with `expires_at = now + 7 days` | +| `contradiction_detection` | `dismiss` | — | Dismissal `resolved_other` | +| `contradiction_detection` | `summary` | — | LLM one-liner (Phase C4) | +| `content_quality` | `add_source` | `{url}` | `update_atom` preserving existing content | +| `content_quality` | `mark_intentional` | — | Dismissal reason=`intentional_no_source` | +| `tag_health` | `move_under` | `{parent_id}` | `update_tag_impl(id, name, Some(parent_id))` | +| `tag_health` | `merge` | `{into_tag_id}` | `apply_tag_merges_impl` | +| `tag_health` | `ignore_pair` | — | Dismissal `ignored_pair` | +| `boilerplate_pollution` | `reembed` | — | Enqueue `retry_embedding` | + +#### A5. Frontend: per-item actions + +- **No source tab**: `NoSourceRow` component — inline URL input + Save; "Mark intentional"; "Open ↗" +- **Tag structure tab**: rootless rows get "Move under…" dropdown populated from `useTags()` store; similar-name pairs get "Merge" confirm dialog + "Ignore pair" +- **Boilerplate tab**: "View edges" lazy-expand (same pattern as `PairRow.toggleExpand`); "Re-embed" button; "Strip boilerplate" disabled with "Coming soon" tooltip until Phase D + +--- + +### Phase B — Per-Tab Re-scan, Resolved Counters, Lazy Loading (~10h) + +#### B1. Per-tab "Re-scan" button + +Top of each tab body: `↻ Re-scan`. Calls `health_check_single({check_name})` (command already exists at command-map.ts L722). On success, splice result into local `report.checks[name]` state. + +Track `lastScannedAt: Record`; render "Last checked: 2m ago" via `Intl.RelativeTimeFormat`. + +#### B2. Resolved counters + +Upgrade `resolvedCount` to `Record`. Persist to localStorage scoped by active database id. Clear daily. Show "Resolved today: N" at the top of each tab; add a progress bar (`X / initial_queue_size`). + +#### B3. Lazy tab content + virtualization + +Only mount the active tab's body. For >50 items in a tab, wrap the list in `@tanstack/react-virtual` — already in deps via the canvas widget. + +--- + +### Phase C — Resolution Upgrades (~25h) + +#### C1. 3-option resolver for pairs + +Replace Merge/Keep both with `Keep A | Keep B | Merge (edit)`: + +- `Keep A` / `Keep B` — archive the loser via new `apply_manual_fix` action +- `Merge (edit)` — opens `MergeEditorModal`: + 1. Call `apply_health_item_fix` with `action: merge_with_llm, dry_run: true` (already supported by `merge_duplicate_pair` at llm_fixes.rs L79+) + 2. Show synthesis in CodeMirror editor pre-populated with dry-run content + 3. "Save merge" → new action `merge_with_edited_content` body `{ content, winner_atom_id, loser_atom_id }` + 4. Single `FixAction` for undo + +#### C2. Source trust + recency indicator + +Backend: content-overlap SQL already joins atoms; add `created_at` to the selected columns in `storage/sqlite/health.rs`. Contradiction query needs the same enrichment. + +Frontend helper: +```ts +function trustScore(source: string | null, createdAt: string): { badge: string; score: number } +``` +- +10 if hostname is in the `trusted_sources` setting (comma-separated) +- +5 if `created_at` within last 30 days +- Render per-atom badge; higher-scoring atom gets "Recommended" chip; ties → no chip + +#### C3. Diff highlighting + +Add `diff-match-patch` (~50KB). In `PairRow` / `ContradictionRow` expanded view, replace raw `
` with line-diff. Atom A pane highlights removals red; Atom B pane highlights additions green. Content always fully visible.
+
+#### C4. Contradiction summary (LLM)
+
+New action in `apply_manual_fix`: `(contradiction_detection, summary)` body empty. Calls LLM: "In one sentence describe what factual claims conflict between these atoms, or 'no real conflict' if the differences are perspective, not fact." Cache per `pair_id` in frontend state.
+
+#### C5. "Flag for later" + "Show deferred"
+
+`defer` action inserts dismissal with `expires_at = now + 7d`. Tab header shows `Show deferred (N)` toggle when any deferred items exist. When enabled, pass `?include_deferred=true` to `compute_single_check` — rename the param if it conflicts, otherwise add it to the query extractor.
+
+---
+
+### Phase D — Batch, Strip Boilerplate, Export, Dashboard Sync (~25h)
+
+#### D1. Selection mode
+
+Checkbox per row. State `selectedItems: Record>` keyed by tab. Floating action bar when any selected:
+
+```
+[3 selected]  [Dismiss all]  [Apply suggested merge]  [Clear]
+```
+
+Sequential batch dispatch with progress callback. Undo stack captures all action ids; Undo applies in reverse.
+
+#### D2. "Strip boilerplate" LLM pass
+
+**New endpoint:** `POST /api/health/strip-boilerplate/{atom_id}` body `{ dry_run: bool }`.
+
+New function `strip_boilerplate` in `llm_fixes.rs`:
+1. Load atom + all atoms sharing ≥5 near-identical chunks (via `semantic_edges` ≥0.99)
+2. LLM prompt: "The following atoms share template text. Return the unique content of atom_X only, preserving its specific details but removing shared sections present in all samples."
+3. `dry_run=true` returns proposed content; `false` writes via `update_atom_content_only`
+
+Frontend: dry-run → before/after diff modal → confirm → real call.
+
+#### D3. Export queue to Markdown
+
+Frontend-only. New `buildReviewQueueMarkdown(report, dismissals)` that iterates all 5 tab datasets and emits the format the prompt specifies. Reuse web/Tauri file-save split from existing `HealthExportModal.tsx`. Button lives in the modal header.
+
+#### D4. Dashboard real-time sync
+
+Debounce-wrap `fetchHealth()` in `HealthPanel` so batch actions only trigger one refresh. Optional: backend returns `{ dirty: true }` on dismissal changes so the dashboard can show "Scores may be stale — refresh" instead of forcing recompute.
+
+#### D5. "Apply N automatic fixes" tooltip
+
+Add tooltip `"Auto-fixes only affect: broken links, re-tagging empty atoms, trimming long content. Manual review items are handled in the Review Queue."` to the button in `HealthWidget.tsx`.
+
+---
+
+## Files / Components To Change
+
+### Backend (Rust)
+
+| File | Change |
+|---|---|
+| `crates/atomic-core/src/db.rs` | V18 migration; bump `LATEST_VERSION`; idempotent ALTER pattern |
+| `crates/atomic-core/src/storage/sqlite/health.rs` | 3 dismissal methods; enrich overlap/contradiction queries with `created_at` |
+| `crates/atomic-core/src/storage/mod.rs` | `StorageBackend` async wrappers for the 3 dismissal methods |
+| `crates/atomic-core/src/health/checks.rs` | Thread `dismissed_keys` into every reviewable check; exclude matching items |
+| `crates/atomic-core/src/health/mod.rs` | `compute_health` / `compute_single_check` pass dismissals; add `include_deferred` param |
+| `crates/atomic-core/src/health/llm_fixes.rs` | New `strip_boilerplate` function; new `merge_with_edited_content`; `summarize_contradiction` |
+| `crates/atomic-server/src/routes/health.rs` | Extend `apply_manual_fix` match with all new action tuples; add `POST /api/health/strip-boilerplate/{atom_id}` + OpenAPI annotation; thread `include_deferred` query param through `compute_single_check` |
+| `crates/atomic-server/src/routes/mod.rs` | Register new strip-boilerplate route |
+| `crates/atomic-server/src/lib.rs` | Add new handler + schema types to `#[openapi(paths(...))]` |
+
+### Frontend (TypeScript)
+
+| File | Change |
+|---|---|
+| `src/components/dashboard/widgets/HealthReviewModal.tsx` | Split into multiple files; add checkbox state, sort/filter bar, export button, per-tab re-scan |
+| `src/components/dashboard/widgets/review/NoSourceRow.tsx` | **New** — inline URL editor + Mark intentional + Open |
+| `src/components/dashboard/widgets/review/TagRootlessRow.tsx` | **New** — Move under dropdown + Dismiss |
+| `src/components/dashboard/widgets/review/TagSimilarPairRow.tsx` | **New** — Merge confirm + Ignore pair |
+| `src/components/dashboard/widgets/review/BoilerplateAtomRow.tsx` | **New** — View edges expand + Re-embed |
+| `src/components/dashboard/widgets/review/MergeEditorModal.tsx` | **New** — CodeMirror merge editor with dry-run pre-fill |
+| `src/components/dashboard/widgets/review/PairDiffView.tsx` | **New** — diff-match-patch line-mode rendering for side-by-side |
+| `src/components/dashboard/widgets/review/ReviewQueueExport.ts` | **New** — buildReviewQueueMarkdown helper |
+| `src/components/dashboard/widgets/review/trustScore.ts` | **New** — source/recency scoring helper |
+| `src/components/dashboard/widgets/HealthWidget.tsx` | Debounced refresh on `onResolved`; "Apply N fixes" tooltip |
+| `src/lib/transport/command-map.ts` | `strip_health_boilerplate` entry; pass `include_deferred` to `health_check_single` |
+| `package.json` | Add `diff-match-patch` + `@types/diff-match-patch` |
+
+---
+
+## Data Flow / Interfaces
+
+### Dismissal lifecycle
+
+```
+user clicks "Mark intentional"
+  → POST /api/health/fix/content_quality/{atom_id}  body {action: "mark_intentional"}
+  → dismiss_health_item(check="content_quality", key=atom_id, reason="intentional_no_source")
+  → frontend optimistically removes row; onResolved() fires
+  → next compute_single_check() excludes this atom until it gains a source URL
+```
+
+### Merge-editor flow
+
+```
+user clicks "Merge (edit)" on a pair
+  → POST /api/health/fix/content_overlap/{pair_id}  body {action: "merge_with_llm", dry_run: true}
+  → merge_duplicate_pair(dry_run=true) returns synthesized content (no writes)
+  → MergeEditorModal opens; pre-fills CodeMirror with synthesis
+  → user edits, clicks "Save merge"
+  → POST /api/health/fix/content_overlap/{pair_id}  body {action: "merge_with_edited_content", content, winner_atom_id, loser_atom_id}
+  → update_atom(winner.id, edited_content); delete_atom(loser.id); log FixAction
+```
+
+### Batch dispatch
+
+```
+user selects 3 pairs, clicks "Dismiss all"
+  → for each: POST /api/health/fix/content_overlap/{pair_id} {action: "dismiss"}
+  → frontend shows "Processing 2/3…"
+  → on completion: toast "✅ 3 pairs dismissed" with Undo
+  → Undo → reverse sequence of undismiss calls
+```
+
+---
+
+## Configuration / Secrets / Deployment Notes
+
+### New settings (optional — default empty/off)
+
+- `trusted_sources` (string, comma-separated hostnames) — used by trustScore helper
+- `review_queue.auto_defer_days` (int, default 7) — expires_at for "Flag for later"
+- `review_queue.batch_concurrency` (int, default 1) — parallelism for batch dispatch; keep at 1 by default to preserve order for undo
+
+No secrets needed. No new env vars. The LLM endpoints reuse the already-configured provider (OpenRouter or Ollama).
+
+### Schema migration deployment
+
+The V18 migration is additive-only (new table + unique index). Safe to deploy without downtime. Backfill is not required — an empty `health_dismissals` table means nothing is dismissed, which is the correct initial state.
+
+### OpenAPI surface
+
+Register 1 new path (`/api/health/strip-boilerplate/{atom_id}`) plus extended request body schema for `ManualFixRequest` (add optional fields for `url`, `parent_id`, `into_tag_id`, `content`, `winner_atom_id`, `loser_atom_id`, `dry_run`). All schemas under `#[cfg_attr(feature = "openapi", derive(ToSchema))]` matching the existing convention.
+
+---
+
+## Testing / Validation Plan
+
+### Automated
+
+**Backend unit tests** (`crates/atomic-core/src/health/tests.rs`):
+- `test_dismissed_content_overlap_excluded` — create fixture with 3 pairs, dismiss 1, confirm `compute_single_check` returns 2
+- `test_dismissed_tag_health_rootless_excluded` — same pattern for rootless tags
+- `test_contradiction_defer_expires` — insert dismissal with `expires_at` in the past, confirm item reappears
+- `test_add_source_updates_atom` — call `apply_manual_fix` with `add_source`, verify atom `source_url` is set without touching content
+- `test_move_under_reparents_tag` — `apply_manual_fix` with `move_under`, verify `update_tag_impl` was called with the new parent
+- `test_tag_merge_via_health_fix` — `apply_manual_fix` with `merge`, verify `apply_tag_merges_impl` ran and atoms were re-tagged
+- `test_keep_a_archives_b` — confirm loser atom is soft-deleted, winner untouched
+- `test_merge_dry_run_returns_content_no_writes` — already covered by existing `merge_duplicate_pair` test; add assertion that atom row count unchanged
+- `test_strip_boilerplate_dry_run` — stub LLM, confirm original atom content unchanged after dry_run
+
+**Frontend unit tests** (`src/components/dashboard/widgets/__tests__/`):
+- `NoSourceRow.test.tsx` — click Add source, enter URL, verify `update_atom` called with correct body
+- `TagRootlessRow.test.tsx` — select parent from dropdown, verify `apply_health_item_fix` with `move_under`
+- `MergeEditorModal.test.tsx` — mock dry_run response, render editor, edit content, save, verify final mutation
+- `PairDiffView.test.tsx` — snapshot test for red/green highlighting of a known diff
+- `trustScore.test.ts` — table-driven cases (trusted hostname wins, recent age beats old, ties produce no chip)
+- `ReviewQueueExport.test.ts` — fixture report → matches expected markdown byte-for-byte
+
+**Commands:**
+```bash
+cargo test -p atomic-core -- health
+cargo test -p atomic-core -- boilerplate
+cargo test -p atomic-server -- health
+cargo check -p atomic-core -p atomic-server
+npx tsc --noEmit
+npx vitest run src/components/dashboard/widgets/__tests__/
+npm run lint
+```
+
+### Manual / E2E
+
+- Build the desktop app: `npm run tauri dev`
+- Seed 3 overlapping atoms, 2 no-source atoms, 2 rootless tags in a test DB
+- Exercise each per-item action; verify dismissed items stay dismissed across modal close/reopen
+- Trigger batch dismiss on 3 items; verify undo rolls all 3 back
+- Trigger Merge-edit flow; confirm the editor pre-fills and saving updates both atoms
+- Run `npm run build:mobile` then load in Capacitor iOS/Android to smoke-test the new touch-friendly controls (checkboxes, inline URL input). Capacitor builds may require the simulator running — use `npm run dev:mobile:ios` for a live loop.
+- Export queue to markdown; diff against a known-good fixture
+
+### Blockers for runnable E2E
+- The project does not appear to ship a Playwright / Cypress harness in the repo — E2E is manual via `npm run tauri dev`. If the team wants automated E2E, that's a separate track and not covered here.
+- LLM-dependent features (Merge, Strip boilerplate, Contradiction summary) require a reachable OpenRouter key or a running Ollama instance. Tests should mock the provider via the existing `MockLlmProvider` pattern (search support/mod.rs) to avoid hitting the network.
+
+---
+
+## Risks, Assumptions, and Open Questions
+
+### Risks
+
+| Risk | Severity | Mitigation |
+|---|---|---|
+| Dismissal table grows unbounded (every dismissed pair, tag, atom) | Medium | Add periodic cleanup: delete dismissals where underlying atom/tag no longer exists; cap total at 10k rows per check with FIFO eviction |
+| Merge-edit flow can race: user A dismisses while user B is mid-merge | Low | Idempotent actions: merge_with_edited_content checks both atoms still exist before writing; returns 409 on conflict |
+| LLM cost for Contradiction summary × 20 pairs on modal open | Medium | Lazy-fetch: only call summary when user expands a pair; cache per-session |
+| Diff-match-patch for very long atoms (>10k chars) is slow | Low | Truncate content to first 2000 chars for diff view; show "Content truncated for diff; click Open to view full atom" |
+| Batch dispatch partial failure (3rd of 5 fails) | Medium | Stop on first failure; show error toast with what was applied; user can retry the rest |
+| "Strip boilerplate" LLM hallucination removes unique content | High | Always dry-run first; show before/after diff; never auto-apply in batch |
+| Move-under dropdown performance with 1000s of tags | Low | Virtualize the dropdown (same library as tag tree) |
+
+### Assumptions
+
+- `HealthReport.checks[*].data` structure is stable across all five reviewable checks (verified in current code)
+- `merge_duplicate_pair` dry-run returns content in a predictable shape — verify in `llm_fixes.rs`; may need adapter
+- `update_atom` preserves `created_at` / `updated_at` semantics (verify; should it bump `updated_at` on source-only edit?)
+- Tag merge UI does not need to preview affected atoms before applying — a count is sufficient. If the team wants full preview, add a separate "Preview merge" dry-run mode to `apply_tag_merges_impl`
+- `retry_embedding` command is the correct primitive for the "Re-embed" button; already exists in command-map (verify exact name)
+
+### Open questions
+
+1. **Archived vs deleted for Keep A / Keep B** — do we soft-archive (set a `status` flag) or hard-delete? Current `delete_atom` hard-deletes. Soft-archive would need schema work. Recommendation: use existing hard-delete; rely on `health_fix_log.before_state` snapshot for undo. Check whether `log_fix` already stores atom content snapshot on delete.
+2. **Item-key collision across DBs** — if the same atom_id exists in two databases, dismissal needs DB scoping. `health_dismissals` is per-DB (lives in data DB, not registry DB), so this is implicit — verify when wiring the migration.
+3. **Where does the "resolved today" counter live?** — localStorage is simple but doesn't sync across devices. Server-side is more work. Recommendation: localStorage for Phase B; revisit if users want cross-device.
+4. **What happens to a dismissed content_overlap pair when one of the atoms is deleted?** — dismissal becomes stale. Cleanup job: delete `health_dismissals` rows whose `item_key` references a non-existent atom. Run on startup + weekly.
+5. **Strip boilerplate threshold** — ≥5 shared edges is the current boilerplate-detection threshold. Reuse that, or make it configurable per-call?
+6. **Tag merge confirm count** — `count_atoms_with_tags` already exists. Just wire it up.
+
+---
+
+## LOE / Effort Estimate
+
+| Phase | Hours | Deliverable |
+|---|---|---|
+| A | 20 | Dismissals table + per-item actions on all tabs (except Strip/Merge-edit) |
+| B | 10 | Re-scan + resolved counters + lazy content |
+| C | 25 | 3-option resolver + merge editor + source badges + diff highlighting + contradiction summary + "Flag for later" |
+| D | 25 | Batch selection + Strip boilerplate LLM pass + Export + Dashboard sync + tooltip |
+| **Total** | **80** | ~2.5 weeks at 30 hrs/week |
+
+Additive: ~10% (8h) for unit + component tests across all phases, split evenly. ~15% (12h) for integration/manual QA across mobile + desktop.
+
+**Net estimate:** ~100 hours end-to-end, matching the complexity of the original v1 health dashboard plan.
+
+---
+
+## Decision Log
+
+| Date | Decision | Rationale |
+|---|---|---|
+| 2026-05-01 | New `health_dismissals` table rather than per-check flag columns | Single polymorphic table serves all 5 checks; easier to add future review categories |
+| 2026-05-01 | Dismissal keys are string composites (`a__b` sorted) | Avoids schema changes when we add new key shapes; frontend can construct keys without backend knowledge |
+| 2026-05-01 | Reuse `merge_duplicate_pair` dry-run for merge-editor pre-fill | Primitive already exists; avoids new LLM code paths |
+| 2026-05-01 | Diff-match-patch over a richer diff library (e.g., react-diff-viewer) | 50KB vs 200KB+; we only need line-mode highlighting, not a full diff UI |
+| 2026-05-01 | Resolved counter in localStorage, not server-side | Low stakes; avoids new storage roundtrips. Revisit if cross-device sync requested |
+| 2026-05-01 | Hard-delete for Keep A/Keep B, rely on `before_state` for undo | Avoids soft-delete schema work; existing undo infra already handles this pattern for `fix_source_uniqueness` |
+| 2026-05-01 | Phase ordering A→B→C→D | Each phase is independently shippable; A unlocks everything, D has the most LLM cost and lowest UX criticality |
diff --git a/package-lock.json b/package-lock.json
index 3629b85b..078a32e2 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -49,8 +49,10 @@
         "@tauri-apps/plugin-opener": "~2.5.3",
         "@tauri-apps/plugin-shell": "~2.3.5",
         "@types/diff": "^7.0.2",
+        "@types/diff-match-patch": "^1.0.36",
         "d3-force": "^3.0.0",
         "diff": "^8.0.4",
+        "diff-match-patch": "^1.0.5",
         "graphology": "^0.26.0",
         "graphology-types": "^0.24.8",
         "idb-keyval": "^6.2.2",
@@ -76,6 +78,8 @@
         "@capacitor/cli": "^8.3.0",
         "@tailwindcss/vite": "^4.0.0",
         "@tauri-apps/cli": "^2.0.0",
+        "@testing-library/react": "^16.3.2",
+        "@testing-library/user-event": "^14.6.1",
         "@types/d3-force": "^3.0.10",
         "@types/qrcode": "^1.5.6",
         "@types/react": "^18.3.0",
@@ -4874,6 +4878,77 @@
         "@tauri-apps/api": "^2.10.1"
       }
     },
+    "node_modules/@testing-library/dom": {
+      "version": "10.4.1",
+      "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz",
+      "integrity": "sha512-o4PXJQidqJl82ckFaXUeoAW+XysPLauYI43Abki5hABd853iMhitooc6znOnczgbTYmEP6U6/y1ZyKAIsvMKGg==",
+      "dev": true,
+      "license": "MIT",
+      "peer": true,
+      "dependencies": {
+        "@babel/code-frame": "^7.10.4",
+        "@babel/runtime": "^7.12.5",
+        "@types/aria-query": "^5.0.1",
+        "aria-query": "5.3.0",
+        "dom-accessibility-api": "^0.5.9",
+        "lz-string": "^1.5.0",
+        "picocolors": "1.1.1",
+        "pretty-format": "^27.0.2"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@testing-library/react": {
+      "version": "16.3.2",
+      "resolved": "https://registry.npmjs.org/@testing-library/react/-/react-16.3.2.tgz",
+      "integrity": "sha512-XU5/SytQM+ykqMnAnvB2umaJNIOsLF3PVv//1Ew4CTcpz0/BRyy/af40qqrt7SjKpDdT1saBMc42CUok5gaw+g==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/runtime": "^7.12.5"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "@testing-library/dom": "^10.0.0",
+        "@types/react": "^18.0.0 || ^19.0.0",
+        "@types/react-dom": "^18.0.0 || ^19.0.0",
+        "react": "^18.0.0 || ^19.0.0",
+        "react-dom": "^18.0.0 || ^19.0.0"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        },
+        "@types/react-dom": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/@testing-library/user-event": {
+      "version": "14.6.1",
+      "resolved": "https://registry.npmjs.org/@testing-library/user-event/-/user-event-14.6.1.tgz",
+      "integrity": "sha512-vq7fv0rnt+QTXgPxr5Hjc210p6YKq2kmdziLgnsZGgLJ9e6VAShx1pACLuRjd/AS/sr7phAR58OIIpf0LlmQNw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=12",
+        "npm": ">=6"
+      },
+      "peerDependencies": {
+        "@testing-library/dom": ">=7.21.4"
+      }
+    },
+    "node_modules/@types/aria-query": {
+      "version": "5.0.4",
+      "resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz",
+      "integrity": "sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==",
+      "dev": true,
+      "license": "MIT",
+      "peer": true
+    },
     "node_modules/@types/babel__core": {
       "version": "7.20.5",
       "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz",
@@ -4959,6 +5034,12 @@
       "integrity": "sha512-JSWRMozjFKsGlEjiiKajUjIJVKuKdE3oVy2DNtK+fUo8q82nhFZ2CPQwicAIkXrofahDXrWJ7mjelvZphMS98Q==",
       "license": "MIT"
     },
+    "node_modules/@types/diff-match-patch": {
+      "version": "1.0.36",
+      "resolved": "https://registry.npmjs.org/@types/diff-match-patch/-/diff-match-patch-1.0.36.tgz",
+      "integrity": "sha512-xFdR6tkm0MWvBfO8xXCSsinYxHcqkQUlcHeSpMC2ukzOb6lwQAfDmW+Qt0AvlGd8HpsS28qKsB+oPeJn9I39jg==",
+      "license": "MIT"
+    },
     "node_modules/@types/estree": {
       "version": "1.0.8",
       "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
@@ -5413,6 +5494,17 @@
         "url": "https://github.com/chalk/ansi-styles?sponsor=1"
       }
     },
+    "node_modules/aria-query": {
+      "version": "5.3.0",
+      "resolved": "https://registry.npmjs.org/aria-query/-/aria-query-5.3.0.tgz",
+      "integrity": "sha512-b0P0sZPKtyu8HkeRAfCq0IfURZK+SuwMjY1UXGBU27wpAiTwQAIlq56IbIO+ytk/JjS1fMR14ee5WBBfKi5J6A==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "peer": true,
+      "dependencies": {
+        "dequal": "^2.0.3"
+      }
+    },
     "node_modules/array-buffer-byte-length": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.2.tgz",
@@ -6646,12 +6738,26 @@
         "node": ">=0.3.1"
       }
     },
+    "node_modules/diff-match-patch": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/diff-match-patch/-/diff-match-patch-1.0.5.tgz",
+      "integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==",
+      "license": "Apache-2.0"
+    },
     "node_modules/dijkstrajs": {
       "version": "1.0.3",
       "resolved": "https://registry.npmjs.org/dijkstrajs/-/dijkstrajs-1.0.3.tgz",
       "integrity": "sha512-qiSlmBq9+BCdCA/L46dw8Uy93mloxsPSbwnm5yrKn2vMPiy8KyAskTF6zuV/j5BMsmOGZDPs7KjU+mjb670kfA==",
       "license": "MIT"
     },
+    "node_modules/dom-accessibility-api": {
+      "version": "0.5.16",
+      "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz",
+      "integrity": "sha512-X7BJ2yElsnOJ30pZF4uIIDfBEVgF4XEBxL9Bxhy6dnrm5hkzqmsWHGTiHqRiITNhMyFLyAiWndIJP7Z1NTteDg==",
+      "dev": true,
+      "license": "MIT",
+      "peer": true
+    },
     "node_modules/dunder-proto": {
       "version": "1.0.1",
       "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
@@ -9185,6 +9291,17 @@
         "react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0"
       }
     },
+    "node_modules/lz-string": {
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/lz-string/-/lz-string-1.5.0.tgz",
+      "integrity": "sha512-h5bgJWpxJNswbU7qCrV0tIKQCaS3blPDrqKWx+QxzuzL1zGUzij9XCWLrSLsJPu5t+eWA/ycetzYAO5IOMcWAQ==",
+      "dev": true,
+      "license": "MIT",
+      "peer": true,
+      "bin": {
+        "lz-string": "bin/bin.js"
+      }
+    },
     "node_modules/magic-string": {
       "version": "0.30.21",
       "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.21.tgz",
@@ -10926,6 +11043,47 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/pretty-format": {
+      "version": "27.5.1",
+      "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-27.5.1.tgz",
+      "integrity": "sha512-Qb1gy5OrP5+zDf2Bvnzdl3jsTf1qXVMazbvCoKhtKqVs4/YK4ozX4gKQJJVyNe+cajNPn0KoC0MC3FUmaHWEmQ==",
+      "dev": true,
+      "license": "MIT",
+      "peer": true,
+      "dependencies": {
+        "ansi-regex": "^5.0.1",
+        "ansi-styles": "^5.0.0",
+        "react-is": "^17.0.1"
+      },
+      "engines": {
+        "node": "^10.13.0 || ^12.13.0 || ^14.15.0 || >=15.0.0"
+      }
+    },
+    "node_modules/pretty-format/node_modules/ansi-regex": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
+      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+      "dev": true,
+      "license": "MIT",
+      "peer": true,
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/pretty-format/node_modules/ansi-styles": {
+      "version": "5.2.0",
+      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz",
+      "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==",
+      "dev": true,
+      "license": "MIT",
+      "peer": true,
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-styles?sponsor=1"
+      }
+    },
     "node_modules/prompts": {
       "version": "2.4.2",
       "resolved": "https://registry.npmjs.org/prompts/-/prompts-2.4.2.tgz",
@@ -11146,6 +11304,14 @@
         "react": "^18.3.1"
       }
     },
+    "node_modules/react-is": {
+      "version": "17.0.2",
+      "resolved": "https://registry.npmjs.org/react-is/-/react-is-17.0.2.tgz",
+      "integrity": "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==",
+      "dev": true,
+      "license": "MIT",
+      "peer": true
+    },
     "node_modules/react-markdown": {
       "version": "10.1.0",
       "resolved": "https://registry.npmjs.org/react-markdown/-/react-markdown-10.1.0.tgz",
diff --git a/package.json b/package.json
index b88fd06a..406dc7d5 100644
--- a/package.json
+++ b/package.json
@@ -95,8 +95,10 @@
     "@tauri-apps/plugin-opener": "~2.5.3",
     "@tauri-apps/plugin-shell": "~2.3.5",
     "@types/diff": "^7.0.2",
+    "@types/diff-match-patch": "^1.0.36",
     "d3-force": "^3.0.0",
     "diff": "^8.0.4",
+    "diff-match-patch": "^1.0.5",
     "graphology": "^0.26.0",
     "graphology-types": "^0.24.8",
     "idb-keyval": "^6.2.2",
@@ -122,6 +124,8 @@
     "@capacitor/cli": "^8.3.0",
     "@tailwindcss/vite": "^4.0.0",
     "@tauri-apps/cli": "^2.0.0",
+    "@testing-library/react": "^16.3.2",
+    "@testing-library/user-event": "^14.6.1",
     "@types/d3-force": "^3.0.10",
     "@types/qrcode": "^1.5.6",
     "@types/react": "^18.3.0",
diff --git a/src/components/atoms/AtomReader.tsx b/src/components/atoms/AtomReader.tsx
index 246c9651..d3dc690b 100644
--- a/src/components/atoms/AtomReader.tsx
+++ b/src/components/atoms/AtomReader.tsx
@@ -1,5 +1,5 @@
 import { lazy, Suspense, useState, useEffect, useCallback, useMemo, useRef } from 'react';
-import { ChevronDown, Trash2 } from 'lucide-react';
+import { ChevronDown, Trash2, Lock, LockOpen } from 'lucide-react';
 import { openExternalUrl } from '../../lib/platform';
 import { Modal } from '../ui/Modal';
 import { Input } from '../ui/Input';
@@ -380,6 +380,34 @@ function AtomReaderContent({
                     className="text-xs"
                   />
                 
+                
                 
+
+        {/* Label + score bar */}
+        
+ {label} + + {check.score} +
+ + {/* Trend indicator */} + {trend !== undefined && ( + + {trend} + + )} + {severityBadge && ( + + {severityBadge} + + )} + {/* Action buttons */} +
+ + + {check.requires_review && ( + + )} +
+ + + {/* Description (always shown) */} + {description && ( +

{description}

+ )} + + {/* Expanded detail */} + {isExpanded && ( +
+ {lastCheckedLabel && ( +

Last checked: {lastCheckedLabel}

+ )} + + {examples && examples.length > 0 && ( +
+ {examples.slice(0, 2).map((ex, i) => ( +
+ + {ex} +
+ ))} +
+ )} + + {check.auto_fixable && ( + + )} + + {check.requires_review && ( + + )} +
+ )} + + ); +} + +function formatRelative(ts: number): string { + const diff = Date.now() - ts; + if (diff < 0) return 'just now'; + const mins = Math.floor(diff / 60_000); + if (mins < 1) return 'just now'; + if (mins < 60) return `${mins} min ago`; + const hrs = Math.floor(mins / 60); + if (hrs < 24) return `${hrs} hr ago`; + const days = Math.floor(hrs / 24); + return `${days}d ago`; +} diff --git a/src/components/dashboard/widgets/HealthConfirmModal.tsx b/src/components/dashboard/widgets/HealthConfirmModal.tsx new file mode 100644 index 00000000..1161c35f --- /dev/null +++ b/src/components/dashboard/widgets/HealthConfirmModal.tsx @@ -0,0 +1,172 @@ +import { createPortal } from 'react-dom'; +import { X, Play } from 'lucide-react'; +import { useEffect, useState } from 'react'; + +export interface PendingFix { + label: string; + check: string; +} + +export interface ManualOnlyCategory { + label: string; + check: string; + reason?: string; +} + +interface Props { + pending: PendingFix[]; + manualOnly?: ManualOnlyCategory[]; + currentScore: number; + atomsAffected?: number; + /** Called with the final subset of check names to run. */ + onConfirm: (selectedChecks: string[]) => void; + onCancel: () => void; +} + +export function HealthConfirmModal({ + pending, + manualOnly = [], + currentScore, + atomsAffected, + onConfirm, + onCancel, +}: Props) { + const [selected, setSelected] = useState>( + () => new Set(pending.map(p => p.check)), + ); + + useEffect(() => { + const handler = (e: KeyboardEvent) => { + if (e.key === 'Escape') onCancel(); + if (e.key === 'Enter' && selected.size > 0) { + onConfirm(Array.from(selected)); + } + }; + document.addEventListener('keydown', handler); + return () => document.removeEventListener('keydown', handler); + }, [onCancel, onConfirm, selected]); + + const toggle = (check: string) => { + setSelected(prev => { + const next = new Set(prev); + if (next.has(check)) next.delete(check); + else next.add(check); + return next; + }); + }; + + const selectedCount = selected.size; + const estSec = Math.max(2, selectedCount * 3); + const estLabel = estSec < 60 ? `~${estSec}s` : `~${Math.ceil(estSec / 60)}m`; + + return createPortal( +
{ if (e.target === e.currentTarget) onCancel(); }} + role="dialog" + aria-modal="true" + aria-label="Apply automatic fixes" + > +
+ {/* Header */} +
+
+

Apply automatic fixes?

+

+ Current score: {currentScore}/100 + {atomsAffected !== undefined && ` · ~${atomsAffected} atoms affected`} + {` · est. ${estLabel}`} +

+
+ +
+ + {/* Fix list */} +
+

Select the fixes to run:

+
    + {pending.map((fix) => { + const on = selected.has(fix.check); + return ( +
  • + +
  • + ); + })} +
+ + {manualOnly.length > 0 && ( +
+

+ Manual review only (no auto-fix) +

+
    + {manualOnly.map(cat => ( +
  • + + + {cat.label} + {cat.reason && ( + — {cat.reason} + )} + +
  • + ))} +
+
+ )} +
+ + {/* Footer */} +
+

+ {selectedCount} of {pending.length} selected +

+
+ + +
+
+
+
, + document.body, + ); +} diff --git a/src/components/dashboard/widgets/HealthExportModal.tsx b/src/components/dashboard/widgets/HealthExportModal.tsx new file mode 100644 index 00000000..5c4e3cb1 --- /dev/null +++ b/src/components/dashboard/widgets/HealthExportModal.tsx @@ -0,0 +1,152 @@ +import { createPortal } from 'react-dom'; +import { X, Download } from 'lucide-react'; +import { useEffect } from 'react'; + +// Minimal types needed for export +interface ExportHealthCheckResult { + status: string; + score: number; + data: Record; +} + +interface ExportHealthReport { + overall_score: number; + overall_status: string; + computed_at: string; + atom_count: number; + checks: Record; + auto_fixable: number; + requires_review: number; +} + +const CHECK_LABELS_EXPORT: Record = { + embedding_coverage: 'Embeddings', + tagging_coverage: 'Tagging', + source_uniqueness: 'Source duplicates', + orphan_tags: 'Orphan tags', + semantic_graph_freshness: 'Semantic graph freshness', + wiki_coverage: 'Wiki coverage', + content_quality: 'Content quality', + tag_health: 'Tag health', + content_overlap: 'Content overlap', + contradiction_detection: 'Contradiction detection', + broken_internal_links: 'Broken internal links', + boilerplate_pollution: 'Boilerplate pollution', +}; + +const CHECK_ORDER_EXPORT = [ + 'embedding_coverage', 'tagging_coverage', 'source_uniqueness', 'orphan_tags', + 'semantic_graph_freshness', 'wiki_coverage', 'content_quality', 'tag_health', + 'content_overlap', 'contradiction_detection', 'broken_internal_links', +]; + +function buildMarkdown(report: ExportHealthReport): string { + const date = new Date(report.computed_at).toLocaleString(); + let md = `# Knowledge Base Health Report\n\n`; + md += `**Overall Score:** ${report.overall_score}/100 \n`; + md += `**Status:** ${report.overall_status.replace('_', ' ')} \n`; + md += `**Generated:** ${date} \n`; + md += `**Total atoms:** ${report.atom_count} \n\n`; + md += `---\n\n`; + + for (const key of CHECK_ORDER_EXPORT) { + const check = report.checks[key]; + if (!check) continue; + const label = CHECK_LABELS_EXPORT[key] ?? key; + const statusIcon = check.score >= 90 ? '✅' : check.score >= 70 ? '⚠️' : check.score >= 50 ? '🟠' : '❌'; + md += `## ${statusIcon} ${label}\n\n`; + md += `**Score:** ${check.score}/100 \n`; + md += `**Status:** ${check.status} \n\n`; + // Include key data fields + const dataEntries = Object.entries(check.data) + .filter(([, v]) => typeof v === 'number' || typeof v === 'string') + .slice(0, 5); + if (dataEntries.length > 0) { + for (const [k, v] of dataEntries) { + md += `- **${k.replace(/_/g, ' ')}:** ${v}\n`; + } + md += '\n'; + } + } + + return md; +} + +async function downloadMarkdown(report: ExportHealthReport): Promise { + const md = buildMarkdown(report); + const filename = `health-report-${new Date(report.computed_at).toISOString().split('T')[0]}.md`; + + // Tauri desktop: use plugin-dialog + plugin-fs if available + const tauriWindow = window as typeof window & { __TAURI__?: { dialog?: unknown; fs?: unknown } }; + if (tauriWindow.__TAURI__) { + try { + const { save } = await import('@tauri-apps/plugin-dialog'); + const { writeTextFile } = await import('@tauri-apps/plugin-fs'); + const path = await save({ + defaultPath: filename, + filters: [{ name: 'Markdown', extensions: ['md'] }], + }); + if (path) { + await writeTextFile(path, md); + } + return; + } catch { + // Fall through to web download if Tauri plugins aren't available + } + } + + // Web: data: URI download + const blob = new Blob([md], { type: 'text/markdown;charset=utf-8' }); + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = filename; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); +} + +interface Props { + report: ExportHealthReport; + onClose: () => void; +} + +export function HealthExportModal({ report, onClose }: Props) { + const md = buildMarkdown(report); + + useEffect(() => { + const handler = (e: KeyboardEvent) => { if (e.key === 'Escape') onClose(); }; + document.addEventListener('keydown', handler); + return () => document.removeEventListener('keydown', handler); + }, [onClose]); + + return createPortal( +
{ if (e.target === e.currentTarget) onClose(); }} + > +
+
+

Export Health Report

+
+ + +
+
+
+
{md}
+
+
+
, + document.body, + ); +} diff --git a/src/components/dashboard/widgets/HealthHelpOverlay.tsx b/src/components/dashboard/widgets/HealthHelpOverlay.tsx new file mode 100644 index 00000000..7e7e0f1a --- /dev/null +++ b/src/components/dashboard/widgets/HealthHelpOverlay.tsx @@ -0,0 +1,51 @@ +import { createPortal } from 'react-dom'; +import { X } from 'lucide-react'; +import { useEffect } from 'react'; + +const SHORTCUTS = [ + { key: 'r', desc: 'Refresh all checks' }, + { key: 'f', desc: 'Open fix confirmation' }, + { key: 'e', desc: 'Export to markdown' }, + { key: '1 – 9', desc: 'Expand / collapse Nth check in list' }, + { key: '?', desc: 'Toggle this help overlay' }, + { key: 'Esc', desc: 'Close modal / overlay' }, +]; + +interface Props { + onClose: () => void; +} + +export function HealthHelpOverlay({ onClose }: Props) { + useEffect(() => { + const handler = (e: KeyboardEvent) => { + if (e.key === 'Escape' || e.key === '?') onClose(); + }; + document.addEventListener('keydown', handler); + return () => document.removeEventListener('keydown', handler); + }, [onClose]); + + return createPortal( +
{ if (e.target === e.currentTarget) onClose(); }} + > +
+
+

Keyboard shortcuts

+ +
+
+ {SHORTCUTS.map(({ key, desc }) => ( +
+ {key} + {desc} +
+ ))} +
+
+
, + document.body, + ); +} diff --git a/src/components/dashboard/widgets/HealthReviewModal.tsx b/src/components/dashboard/widgets/HealthReviewModal.tsx new file mode 100644 index 00000000..4aad776d --- /dev/null +++ b/src/components/dashboard/widgets/HealthReviewModal.tsx @@ -0,0 +1,1829 @@ +import { useState, useEffect, useCallback, useMemo, useRef } from 'react'; +import { createPortal } from 'react-dom'; +import { + X, GitMerge, Loader2, CheckCircle, + ChevronDown, ChevronUp, RefreshCw, ChevronLeft, ChevronRight, Check, Clipboard, +} from 'lucide-react'; +import { useVirtualizer } from '@tanstack/react-virtual'; +import { getTransport } from '../../../lib/transport'; +import { runReviewAction } from './review/reviewActions'; +import { toast } from '../../../stores/toasts'; +import { useTagsStore } from '../../../stores/tags'; +import { useDatabasesStore } from '../../../stores/databases'; +import { NoSourceRow } from './review/NoSourceRow'; +import { TagRootlessRow } from './review/TagRootlessRow'; +import { BoilerplateAtomRow } from './review/BoilerplateAtomRow'; +import { BrokenLinksSection } from './review/BrokenLinksSection'; +import { sourceTrust, relativeAge } from './review/badges'; +import { lineDiff, type DiffPart } from './review/diffUtil'; + +// ==================== Types ==================== + +export interface OverlapPair { + pair_id: string; + atom_a: { id: string; title: string; source?: string; created_at?: string }; + atom_b: { id: string; title: string; source?: string; created_at?: string }; + similarity: number; + shared_tag_count: number; + available_actions: string[]; +} + +interface AtomDetail { + id: string; + content: string; + source_url?: string; +} + +type PairAction = 'merge_with_llm' | 'keep_a' | 'keep_b' | 'merge_with_edited_content'; +type PairStatus = 'idle' | 'loading' | 'done' | 'error'; + + +// Boilerplate atom entry +interface BoilerplateEntry { + id: string; + title: string; + clone_count: number; +} + +// Contradiction pair +interface ContradictionPair { + pair_id: string; + atom_a: { id: string; title: string; source?: string; created_at?: string }; + atom_b: { id: string; title: string; source?: string; created_at?: string }; + similarity: number; + shared_tag_count: number; +} + +// Rootless tag +interface RootlessTag { + id: string; + name: string; + atom_count: number; +} + +// ==================== localStorage helpers ==================== + +function todayKey(): string { + const d = new Date(); + return `${d.getFullYear()}-${(d.getMonth() + 1).toString().padStart(2, '0')}-${d.getDate().toString().padStart(2, '0')}`; +} + +interface ResolvedRecord { + date: string; + counts: Record; +} + +function loadResolved(dbId: string): ResolvedRecord { + try { + const raw = localStorage.getItem(`health-resolved:${dbId}`); + if (!raw) return { date: todayKey(), counts: {} }; + const parsed = JSON.parse(raw) as ResolvedRecord; + if (parsed.date !== todayKey()) return { date: todayKey(), counts: {} }; + return parsed; + } catch { + return { date: todayKey(), counts: {} }; + } +} + +function saveResolved(dbId: string, rec: ResolvedRecord): void { + try { + localStorage.setItem(`health-resolved:${dbId}`, JSON.stringify(rec)); + } catch { /* ignore quota errors */ } +} + + +function similarityLabel(s: number): { text: string; color: string } { + if (s >= 0.80) return { text: `${(s * 100).toFixed(0)}% overlap`, color: 'text-orange-400' }; + if (s >= 0.65) return { text: `${(s * 100).toFixed(0)}% overlap`, color: 'text-yellow-400' }; + return { text: `${(s * 100).toFixed(0)}% overlap`, color: 'text-gray-400' }; +} + +// ==================== Tab header ==================== + +function TabHeader({ + label, + scannedAt, + rescanning, + onRescan, + resolvedToday, + initialQueueSize, +}: { + label: string; + scannedAt: string | undefined; + rescanning: boolean; + onRescan: () => void; + resolvedToday: number; + initialQueueSize: number; +}) { + const [, forceTick] = useState(0); + useEffect(() => { + if (!scannedAt) return; + const id = window.setInterval(() => forceTick(n => n + 1), 30_000); + return () => window.clearInterval(id); + }, [scannedAt]); + + const rel = useMemo(() => { + if (!scannedAt) return 'not scanned yet'; + const delta = Date.now() - new Date(scannedAt).getTime(); + const mins = Math.round(delta / 60_000); + if (mins < 1) return 'just now'; + if (mins < 60) return `${mins}m ago`; + const hrs = Math.round(mins / 60); + if (hrs < 24) return `${hrs}h ago`; + return `${Math.round(hrs / 24)}d ago`; + }, [scannedAt]); + + const progressPct = initialQueueSize > 0 + ? Math.min(100, Math.round((resolvedToday / initialQueueSize) * 100)) + : 0; + + return ( +
+
+
+ {label} + {resolvedToday > 0 && ( + • {resolvedToday} resolved today + )} +
+ {initialQueueSize > 0 && resolvedToday > 0 && ( +
+
+
+ )} +
+ + {scannedAt && !rescanning && ( + {rel} + )} +
+ ); +} + +// ==================== Overlap pair row ==================== + +function DiffView({ a, b }: { a: string; b: string }) { + const parts = useMemo(() => lineDiff(a, b), [a, b]); + return ( +
+      {parts.map((p: DiffPart, i: number) => (
+        {p.text}
+      ))}
+    
+ ); +} + +function PairRow({ + pair, + onResolve, +}: { + pair: OverlapPair; + onResolve: (pair: OverlapPair) => void; +}) { + const [status, setStatus] = useState('idle'); + const [appliedAction, setAppliedAction] = useState(null); + const [expanded, setExpanded] = useState(false); + const [contents, setContents] = useState<[string, string] | null>(null); + const [loadingContent, setLoadingContent] = useState(false); + const [mergeOpen, setMergeOpen] = useState(false); + const [mergeDraft, setMergeDraft] = useState(''); + const [diffMode, setDiffMode] = useState(false); + const sim = similarityLabel(pair.similarity); + + function buildDraft(a: string, b: string, titleA: string, titleB: string): string { + return `# ${titleA}\n\n${a}\n\n---\n\n# ${titleB}\n\n${b}`.trim(); + } + + const fetchContents = async () => { + setLoadingContent(true); + try { + const [a, b] = await Promise.all([ + getTransport().invoke('get_atom', { id: pair.atom_a.id }), + getTransport().invoke('get_atom', { id: pair.atom_b.id }), + ]); + setContents([a.content, b.content]); + return [a.content, b.content] as [string, string]; + } finally { + setLoadingContent(false); + } + }; + + const toggleExpand = async () => { + if (!expanded && !contents) { + await fetchContents(); + } + setExpanded(v => !v); + }; + + const openMerge = async () => { + let c = contents; + if (!c) { + c = await fetchContents(); + } + if (c) { + setMergeDraft(prev => prev || buildDraft(c![0], c![1], pair.atom_a.title, pair.atom_b.title)); + } + setMergeOpen(true); + }; + + const applyDirect = async (action: 'keep_a' | 'keep_b') => { + setStatus('loading'); + setAppliedAction(action); + const ok = await runReviewAction({ + label: action === 'keep_a' ? 'Keep A' : 'Keep B', + command: 'apply_health_item_fix', + args: { + check: 'content_overlap', + item_id: `${pair.atom_a.id <= pair.atom_b.id ? pair.atom_a.id : pair.atom_b.id}__${pair.atom_a.id <= pair.atom_b.id ? pair.atom_b.id : pair.atom_a.id}`, + action, + }, + }); + if (ok === undefined) { setStatus('idle'); setAppliedAction(null); return; } + setStatus('done'); + onResolve(pair); + }; + + const applyEditedMerge = async () => { + const aDate = pair.atom_a.created_at ? Date.parse(pair.atom_a.created_at) : 0; + const bDate = pair.atom_b.created_at ? Date.parse(pair.atom_b.created_at) : 0; + const [winner, loser] = aDate >= bDate + ? [pair.atom_a.id, pair.atom_b.id] + : [pair.atom_b.id, pair.atom_a.id]; + setStatus('loading'); + setAppliedAction('merge_with_edited_content'); + const ok = await runReviewAction({ + label: 'Merge atoms', + command: 'apply_health_item_fix', + args: { + check: 'content_overlap', + item_id: `${pair.atom_a.id <= pair.atom_b.id ? pair.atom_a.id : pair.atom_b.id}__${pair.atom_a.id <= pair.atom_b.id ? pair.atom_b.id : pair.atom_a.id}`, + action: 'merge_with_edited_content', + winner_atom_id: winner, + loser_atom_id: loser, + content: mergeDraft, + }, + }); + if (ok === undefined) { setStatus('idle'); setAppliedAction(null); return; } + setStatus('done'); + onResolve(pair); + }; + + const verifyWithLlm = async () => { + setStatus('loading'); + setAppliedAction(null); + const result = await runReviewAction({ + label: 'Verify with LLM', + command: 'apply_health_item_fix', + args: { + check: 'content_overlap', + item_id: `${pair.atom_a.id <= pair.atom_b.id ? pair.atom_a.id : pair.atom_b.id}__${pair.atom_a.id <= pair.atom_b.id ? pair.atom_b.id : pair.atom_a.id}`, + action: 'verify_with_llm', + }, + }) as { is_duplicate: boolean; reason: string } | undefined; + if (result === undefined) { setStatus('idle'); return; } + if (!result.is_duplicate) { + toast.info('Not a duplicate — dismissed', { detail: result.reason }); + setStatus('done'); + onResolve(pair); + } else { + toast.info('Confirmed duplicate: ' + result.reason); + setStatus('idle'); + } + }; + + const mergeWithLlm = async () => { + setStatus('loading'); + setAppliedAction('merge_with_llm'); + const ok = await runReviewAction({ + label: 'Merge with LLM', + command: 'apply_health_item_fix', + args: { + check: 'content_overlap', + item_id: `${pair.atom_a.id <= pair.atom_b.id ? pair.atom_a.id : pair.atom_b.id}__${pair.atom_a.id <= pair.atom_b.id ? pair.atom_b.id : pair.atom_a.id}`, + action: 'merge_with_llm', + }, + }); + if (ok === undefined) { setStatus('idle'); setAppliedAction(null); return; } + setStatus('done'); + onResolve(pair); + }; + + if (status === 'done') { + const labels: Record = { + merge_with_llm: 'Merged — LLM synthesised both atoms into one', + merge_with_edited_content: 'Merged — edited content applied', + keep_a: 'Kept A; removed B', + keep_b: 'Kept B; removed A', + }; + return ( +
+ + {labels[appliedAction!] ?? 'Resolved'} +
+ ); + } + + return ( +
+
+ {/* Header row */} +
+ {sim.text} +
+ {pair.shared_tag_count > 0 && ( + {pair.shared_tag_count} shared tag{pair.shared_tag_count !== 1 ? 's' : ''} + )} + {expanded && contents && ( + + )} + +
+
+ + {/* Atom summaries */} +
+ {[pair.atom_a, pair.atom_b].map((atom, i) => ( +
+

{atom.title}

+
+ {(() => { + const trust = sourceTrust(atom.source); + const toneClass = trust.tone === 'official' + ? 'text-blue-400 bg-blue-500/10' + : trust.tone === 'manual' + ? 'text-gray-500 bg-[#2a2a2a]' + : 'text-gray-400 bg-[#2a2a2a]'; + return {trust.label}; + })()} + {relativeAge(atom.created_at) && ( + · {relativeAge(atom.created_at)} + )} +
+
+ ))} +
+ + {/* Side-by-side or diff content */} + {expanded && contents && ( + diffMode + ? + :
+ {[pair.atom_a, pair.atom_b].map((atom, i) => ( +
+

{atom.title}

+
+                      {contents[i as 0 | 1]}
+                    
+
+ ))} +
+ )} + + {/* Actions */} +
+ } + label="Keep A" + title="Delete the right atom; keep the left one" + loading={status === 'loading' && appliedAction === 'keep_a'} + disabled={status === 'loading'} + onClick={() => applyDirect('keep_a')} + /> + } + label="Keep B" + title="Delete the left atom; keep the right one" + loading={status === 'loading' && appliedAction === 'keep_b'} + disabled={status === 'loading'} + onClick={() => applyDirect('keep_b')} + /> + } + label="Merge with LLM" + title="LLM merges both atoms into one reconciled document" + loading={status === 'loading' && appliedAction === 'merge_with_llm'} + disabled={status === 'loading'} + onClick={mergeWithLlm} + /> + } + label="Merge…" + title="Open an editor to combine both atoms, then delete the loser" + loading={loadingContent && !expanded} + disabled={status === 'loading'} + onClick={openMerge} + /> + } + label="Verify with LLM" + title="Ask the LLM whether this is a real duplicate" + loading={status === 'loading' && appliedAction === null && status === 'loading'} + disabled={status === 'loading'} + onClick={verifyWithLlm} + variant="outline" + /> +
+ + {/* Merge editor */} + {mergeOpen && ( +
+