Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
fe37d31
feat(health): Phase 2 — trends, filtering, and sorting for health das…
bk-ty May 1, 2026
0829717
health: enrich review queue data and add unit tests
bk-ty May 1, 2026
79f12e3
feat(boilerplate): add boilerplate-aware embedding filter
bk-ty May 1, 2026
d5736a9
feat: Review Queue v2 Phase A — inline actions on NoSource, Tag, Boil…
bk-ty May 1, 2026
fe7f8fb
feat(health): Phase B — per-tab re-scan, resolved counters, virtualiz…
bk-ty May 1, 2026
d89590e
feat(health): batch fix endpoint, strip-boilerplate LLM fix, and rout…
bk-ty May 1, 2026
c572688
feat(review-queue): Phase D frontend — batch selection, strip boilerp…
bk-ty May 1, 2026
d828e24
test(atomic-core): add strip_boilerplate integration tests + MockAiSe…
bk-ty May 1, 2026
f155f73
feat(health): expose similar-name pair list with inline merge/ignore …
bk-ty May 1, 2026
84e4cee
feat(health): add BrokenLinksSection tab and extend TagHealthSection …
bk-ty May 1, 2026
17e7484
feat(health): polish BrokenLinksSection UI + add inline relink picker
bk-ty May 1, 2026
7cc4c2b
feat: add broken-link suggest endpoint and relink action
bk-ty May 1, 2026
d5195bc
fix(health): populate InternalLink.original for markdown broken links
bk-ty May 1, 2026
3a9627b
feat(health): route all review-row errors through toast with Retry
bk-ty May 1, 2026
e735b05
feat(health): add LLM verify/merge capabilities for overlap and contr…
bk-ty May 1, 2026
ef58779
feat(health): LLM-powered broken-link auto-resolver
bk-ty May 1, 2026
d582906
feat(health): add tag-structure proposal types, migration V19, and st…
bk-ty May 1, 2026
654b5a5
feat(health): dedicated /health page + LLM verify/resolve/auto-fix UI
bk-ty May 2, 2026
6cbf84a
fix(health): strip markdown fences from LLM JSON responses
bk-ty May 2, 2026
ca01ed8
feat(health): UI polish pass — tooltips, delta, per-row pulse, confir…
bk-ty May 2, 2026
b78b740
feat(health): opt-in scoring, per-DB config, locked atoms, wiki exclu…
bk-ty May 2, 2026
7c2af82
feat(health): dashboard shortcut + Settings tab + user-defined custom…
bk-ty May 2, 2026
06f6a33
feat(health): 8 new custom-rule variants (Tier 1 + Tier 2)
bk-ty May 2, 2026
899c643
refactor(health): drop Configure tab from Health page (Settings owns …
bk-ty May 2, 2026
9be9bfc
test(health): coverage for custom checks end-to-end
bk-ty May 2, 2026
80a902d
feat(health): preview endpoint + UI for custom checks
bk-ty May 2, 2026
8696bd5
fix(health): de-duplicate header on /health page
bk-ty May 2, 2026
19de7bc
fix(health): renumber migrations after rebase onto upstream V16
bk-ty May 2, 2026
f31e705
refactor(health): address AGENTS.md code-organization review
bk-ty May 2, 2026
e2bb1e2
chore(health): clippy lint cleanup
bk-ty May 2, 2026
7415435
refactor(health): split oversized modules
bk-ty May 3, 2026
79167b6
feat(health): wire per-DB prompt overrides for LLM fixes
bk-ty May 3, 2026
f272520
feat(settings): surface health LLM prompt overrides in Prompts tab
bk-ty May 3, 2026
c199c84
feat(health): expose detection thresholds as per-DB settings
bk-ty May 3, 2026
ba3d32f
ui(health): move Save/Reset below thresholds, sticky footer
bk-ty May 3, 2026
da1d4ea
feat(health): validate thresholds on set_health_config
bk-ty May 3, 2026
701edf4
test(health): thresholds actually change what checks flag
bk-ty May 3, 2026
f52e833
ui(health): inline Save in weights row; label as "Save checks & thres…
bk-ty May 3, 2026
dcd61d9
ui(health): autosave check weights and thresholds (debounced 600ms)
bk-ty May 3, 2026
c4b46ad
debug(health): log autosave tick payload to help diagnose non-persist…
bk-ty May 3, 2026
ecb1aec
fix(health): flush pending autosave on unmount; strip debug log
bk-ty May 3, 2026
604d8ce
ui(health): show autosave pill inside thresholds panel too
bk-ty May 3, 2026
1d3ae76
ui(health): unify autosave feedback into one sticky header
bk-ty May 3, 2026
8377642
fix(health): robust autosave and correct mount-time dedup baseline
bk-ty May 3, 2026
beca29d
fix(db): self-heal missing tags.autotag_description column
bk-ty May 3, 2026
3e3ef32
feat(health-config): drop sticky autosave bar
bk-ty May 3, 2026
056bcd5
feat(health/contradiction): filter template-clone false positives
bk-ty May 3, 2026
3c32089
feat(health/contradiction): add title-overlap and boilerplate-zone fi…
bk-ty May 3, 2026
ac2dd4a
fix(health/broken-links): auto-fix LLM and link picker
bk-ty May 3, 2026
de02917
fix(health/dismissals): recompute score after filtering pairs/atoms
bk-ty May 3, 2026
74d4657
fix(health): consistent counts + broken-link subdir scope
bk-ty May 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 156 additions & 0 deletions crates/atomic-core/src/boilerplate.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
//! Boilerplate-aware embedding filter.
//!
//! Detects chunks shared across multiple atoms and excludes them from
//! semantic search vectors (vec_chunks). The stored atom content
//! (atom_chunks.content) is never modified — only the embeddings change.

use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet};

/// Produce the canonical form of a chunk used for boilerplate fingerprinting.
///
/// Every line has its leading `#` characters removed (so the markdown heading
/// level does not affect the result), every run of whitespace — line breaks
/// included — is reduced to a single space, and the text is lowercased.
/// Leading and trailing whitespace never survive.
pub(crate) fn normalize_for_dedup(text: &str) -> String {
    // Tokenize line by line: dropping the heading markers first and then
    // splitting on whitespace yields exactly the words of the chunk.
    let words: Vec<&str> = text
        .lines()
        .flat_map(|line| line.trim_start_matches('#').split_whitespace())
        .collect();
    // Lowercase the joined string as a whole rather than word by word.
    words.join(" ").to_lowercase()
}

/// Fingerprint a chunk as the lowercase hex SHA-256 digest of its normalized text.
///
/// The input is first passed through [`normalize_for_dedup`], so chunks that
/// differ only in heading level, whitespace, or letter case produce the same
/// hash.
pub(crate) fn content_hash(text: &str) -> String {
    // One-shot digest of the canonical text; `{:x}` renders it as hex.
    let digest = Sha256::digest(normalize_for_dedup(text).as_bytes());
    format!("{:x}", digest)
}

/// Select the positions in `chunks` whose content counts as boilerplate.
///
/// `counts` maps a chunk's [`content_hash`] to the number of distinct atoms
/// containing that chunk. A chunk is boilerplate when its count is at least
/// `min_atom_threshold`; hashes missing from `counts` are treated as a count
/// of zero. A threshold of zero or less disables the filter and yields an
/// empty set.
///
/// **Fallback:** when every chunk would be flagged, an empty set is returned
/// instead, so an atom made up entirely of boilerplate still gets embedded.
pub(crate) fn boilerplate_indices(
    chunks: &[String],
    counts: &HashMap<String, i64>,
    min_atom_threshold: i64,
) -> HashSet<usize> {
    // A non-positive threshold means the filter is switched off.
    if min_atom_threshold <= 0 {
        return HashSet::new();
    }

    let mut flagged = HashSet::new();
    for (position, chunk) in chunks.iter().enumerate() {
        let atoms_sharing = counts.get(&content_hash(chunk)).copied().unwrap_or(0);
        if atoms_sharing >= min_atom_threshold {
            flagged.insert(position);
        }
    }

    // Fallback: never strip all chunks.
    if !chunks.is_empty() && flagged.len() == chunks.len() {
        return HashSet::new();
    }
    flagged
}

// Unit tests for the boilerplate helpers: text normalization, content
// hashing, and boilerplate index selection (including its fallback and the
// "threshold disabled" case).
#[cfg(test)]
mod tests {
use super::*;

// Leading `#` heading markers are removed regardless of heading level.
#[test]
fn test_normalize_strips_heading_markers() {
assert_eq!(normalize_for_dedup("# My Header"), "my header");
assert_eq!(normalize_for_dedup("## Section"), "section");
}

// Surrounding whitespace is dropped and inner whitespace is single-spaced.
#[test]
fn test_normalize_collapses_whitespace() {
assert_eq!(normalize_for_dedup(" hello world "), "hello world");
}

// Output is always lowercase.
#[test]
fn test_normalize_lowercases() {
assert_eq!(normalize_for_dedup("Hello World"), "hello world");
}

// Hashing the same input twice gives the same 64-character hex string.
#[test]
fn test_content_hash_deterministic() {
let h1 = content_hash("# My Header");
let h2 = content_hash("# My Header");
assert_eq!(h1, h2);
assert_eq!(h1.len(), 64); // SHA-256 hex
}

// Hashing happens after normalization, so heading level does not matter.
#[test]
fn test_content_hash_normalizes_heading_variants() {
// Different markdown levels with same text → same hash after normalization
let h1 = content_hash("# Terms of Service");
let h2 = content_hash("## Terms of Service");
assert_eq!(h1, h2);
}

// With no recorded counts every chunk counts as unseen, so nothing is flagged.
#[test]
fn test_boilerplate_indices_all_unique() {
let chunks = vec![
"unique content a".to_string(),
"unique content b".to_string(),
];
let counts: HashMap<String, i64> = HashMap::new();
let indices = boilerplate_indices(&chunks, &counts, 5);
assert!(indices.is_empty());
}

// Only the chunks whose count reaches the threshold are reported, by index.
#[test]
fn test_boilerplate_indices_shared_chunks() {
let chunks = vec![
"shared header".to_string(),
"unique body content".to_string(),
"shared footer".to_string(),
];
let mut counts = HashMap::new();
counts.insert(content_hash("shared header"), 10i64);
counts.insert(content_hash("shared footer"), 8i64);
let indices = boilerplate_indices(&chunks, &counts, 5);
assert_eq!(indices, HashSet::from([0, 2]));
}

// When every chunk qualifies as boilerplate the result is empty instead.
#[test]
fn test_boilerplate_indices_fallback_all_boilerplate() {
let chunks = vec![
"shared chunk a".to_string(),
"shared chunk b".to_string(),
];
let mut counts = HashMap::new();
counts.insert(content_hash("shared chunk a"), 20i64);
counts.insert(content_hash("shared chunk b"), 15i64);
// All chunks are boilerplate → fallback: return empty set
let indices = boilerplate_indices(&chunks, &counts, 5);
assert!(indices.is_empty(), "should fall back to empty when all chunks are boilerplate");
}

// A count below the threshold does not flag the chunk.
#[test]
fn test_boilerplate_below_threshold_not_filtered() {
let chunks = vec!["shared header".to_string()];
let mut counts = HashMap::new();
counts.insert(content_hash("shared header"), 3i64); // below threshold of 5
let indices = boilerplate_indices(&chunks, &counts, 5);
assert!(indices.is_empty());
}

// A threshold of zero disables filtering even for very common chunks.
#[test]
fn test_boilerplate_threshold_zero_disabled() {
let chunks = vec!["any content".to_string()];
let mut counts = HashMap::new();
counts.insert(content_hash("any content"), 100i64);
// threshold = 0 means disabled → nothing filtered
let indices = boilerplate_indices(&chunks, &counts, 0);
assert!(indices.is_empty());
}
}
1 change: 1 addition & 0 deletions crates/atomic-core/src/briefing/agentic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,7 @@ mod tests {
tagging_status: "complete".to_string(),
embedding_error: None,
tagging_error: None,
is_locked: false,
},
tags: vec![],
}
Expand Down
17 changes: 16 additions & 1 deletion crates/atomic-core/src/briefing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,25 @@ pub async fn run_briefing(
}

// Run the agent loop.
let (content, citations) = agentic::generate(core, &since, &new_atoms, total_new)
let (mut content, citations) = agentic::generate(core, &since, &new_atoms, total_new)
.await
.map_err(AtomicCoreError::Wiki)?;

// Append health summary when score is concerning
if let Ok(report) = crate::health::compute_health(core).await {
if report.overall_score < 85 {
let health_section = format!(
"\n\n## Knowledge Health\n\n\
Your knowledge base health score is **{}/100** ({}).\n\n\
{} issues can be auto-fixed via the dashboard.",
report.overall_score,
report.overall_status,
report.auto_fixable
);
content.push_str(&health_section);
}
}

let id = uuid::Uuid::new_v4().to_string();
let now = Utc::now().to_rfc3339();
let briefing = Briefing {
Expand Down
149 changes: 148 additions & 1 deletion crates/atomic-core/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ impl Database {
/// 1. Add a new `if version < N` block at the end (before the virtual-table section)
/// 2. End the block with `PRAGMA user_version = N;`
/// 3. Bump LATEST_VERSION
const LATEST_VERSION: i32 = 16;
const LATEST_VERSION: i32 = 21;

pub fn run_migrations(conn: &Connection) -> Result<(), AtomicCoreError> {
Self::run_migrations_internal(conn, false)
Expand Down Expand Up @@ -816,6 +816,119 @@ impl Database {
conn.execute_batch("PRAGMA user_version = 16;")?;
}

// --- V16 → V17: Knowledge health tables ---
if version < 17 {
conn.execute_batch(
r#"
CREATE TABLE IF NOT EXISTS health_reports (
id TEXT PRIMARY KEY,
computed_at TEXT NOT NULL,
overall_score INTEGER NOT NULL,
check_scores TEXT NOT NULL,
atom_count INTEGER NOT NULL,
auto_fixes_applied INTEGER NOT NULL DEFAULT 0,
report_json TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_health_reports_computed
ON health_reports(computed_at DESC);

CREATE TABLE IF NOT EXISTS health_fix_log (
id TEXT PRIMARY KEY,
check_name TEXT NOT NULL,
action TEXT NOT NULL,
tier TEXT NOT NULL,
atom_ids TEXT,
tag_ids TEXT,
before_state TEXT NOT NULL DEFAULT '{}',
after_state TEXT NOT NULL DEFAULT '{}',
llm_prompt TEXT,
llm_response TEXT,
executed_at TEXT NOT NULL,
undone_at TEXT
);
CREATE INDEX IF NOT EXISTS idx_health_fix_log_executed
ON health_fix_log(executed_at DESC);
CREATE INDEX IF NOT EXISTS idx_health_fix_log_check
ON health_fix_log(check_name);

PRAGMA user_version = 17;
"#,
)?;
}

// --- V17 → V18: content_hash column on atom_chunks for boilerplate detection ---
if version < 18 {
// ALTER TABLE ADD COLUMN has no IF NOT EXISTS in SQLite; ignore the error
// if the column was already added (e.g. during a test migration re-run).
let _ = conn.execute(
"ALTER TABLE atom_chunks ADD COLUMN content_hash TEXT",
[],
);
conn.execute_batch(
r#"
CREATE INDEX IF NOT EXISTS idx_atom_chunks_content_hash
ON atom_chunks(content_hash);
PRAGMA user_version = 18;
"#,
)?;
}

// --- V18 → V19: persistent dismissals for the review queue ---
if version < 19 {
conn.execute_batch(
r#"
CREATE TABLE IF NOT EXISTS health_dismissals (
id TEXT PRIMARY KEY,
check_name TEXT NOT NULL,
item_key TEXT NOT NULL,
reason TEXT NOT NULL,
dismissed_at TEXT NOT NULL,
expires_at TEXT
);
CREATE UNIQUE INDEX IF NOT EXISTS idx_health_dismissals_lookup
ON health_dismissals(check_name, item_key);
PRAGMA user_version = 19;
"#,
)?;
}

// --- V19 → V20: tag_proposals table ---
if version < 20 {
conn.execute_batch(
r#"
CREATE TABLE IF NOT EXISTS tag_proposals (
id TEXT PRIMARY KEY,
summary TEXT NOT NULL,
actions_json TEXT NOT NULL,
created_at TEXT NOT NULL,
applied_at TEXT
);
CREATE INDEX IF NOT EXISTS idx_tag_proposals_created
ON tag_proposals(created_at DESC);
PRAGMA user_version = 20;
"#,
)?;
}

// --- V20 → V21: atoms.is_locked flag ---
//
// Locked atoms are protected from automated mutation by health fixes
// (strip-boilerplate, auto-merge-duplicate, auto-resolve-contradiction,
// relink-broken-link). They remain readable and editable through the
// normal UI. Use for source-of-truth material (books, studies, primary
// research) where automated "correction" would do more harm than good.
if version < 21 {
// ALTER TABLE ADD COLUMN has no IF NOT EXISTS in SQLite. Ignore the
// "duplicate column" error so migration stays idempotent when a
// test resets user_version to a pre-V21 value on a DB whose table
// was already migrated by the initial open.
let _ = conn.execute(
"ALTER TABLE atoms ADD COLUMN is_locked INTEGER NOT NULL DEFAULT 0",
[],
);
conn.execute_batch("PRAGMA user_version = 21;")?;
}

// --- Triggers (recreated every startup to stay current) ---
conn.execute_batch(
"DROP TRIGGER IF EXISTS atom_tags_insert_count;
Expand Down Expand Up @@ -1012,6 +1125,40 @@ impl Database {
// legacy seed rows so the resolver's "any per-DB row is an override"
// rule stays correct.

// ---------------------------------------------------------------
// Self-healing: idempotent column checks.
//
// Runs on every migration pass regardless of version. Exists because
// a rebase/renumber in the past let some DBs tick their user_version
// past the migration that added `tags.autotag_description` without
// ever executing the ALTER. Any query joining that column then errors
// at runtime ("no such column: t.autotag_description"). Cheap enough
// to always verify; keeps migration drift from bricking a DB.
//
// When adding a new column, prefer listing it here in addition to the
// versioned migration step — belt and braces.
const EXPECTED_COLUMNS: &[(&str, &str, &str)] = &[
// (table, column, DDL to add)
("tags", "autotag_description", "ALTER TABLE tags ADD COLUMN autotag_description TEXT NOT NULL DEFAULT ''"),
];
for (table, column, ddl) in EXPECTED_COLUMNS {
let has_col: bool = conn
.query_row(
"SELECT 1 FROM pragma_table_info(?1) WHERE name = ?2",
rusqlite::params![table, column],
|_| Ok(true),
)
.unwrap_or(false);
if !has_col {
tracing::warn!(
table,
column,
"healing missing column (migration drift); running late ALTER"
);
conn.execute_batch(ddl)?;
}
}

Ok(())
}
}
Expand Down
Loading
Loading