From 9c1d1f08be58d893bf163a5e70a24d130d343925 Mon Sep 17 00:00:00 2001 From: Lisa Date: Thu, 16 Apr 2026 09:12:37 +0200 Subject: [PATCH 01/18] fix(scip): persist pre-computed symbols through journal, Merkle sync, and embeddings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SCIP import path pushed pre-computed symbols/occurrences/edges via Delta but the daemon silently dropped them in multiple places: - Journal wrote empty text → symbols lost on daemon restart - upsert_file_precomputed ignored CPG edges → blast-radius broken - stale_files hashed empty text → infinite Merkle re-sync loop - file_source_text returned "" → embeddings, stream_context, and explain-match all failed for imported files Fix: FileInput now carries a precomputed flag and content_hash. JournalEntry::UpsertFilePrecomputed persists symbols, occurrences, and edges so they survive compact + replay. stale_files uses the stored content_hash. file_source_text falls back to disk for precomputed file:// URIs. Co-Authored-By: Claude Opus 4.6 (1M context) --- bindings/rust/src/daemon/journal.rs | 129 +++++++++++++- bindings/rust/src/daemon/session.rs | 26 +++ bindings/rust/src/query_graph/db.rs | 265 +++++++++++++++++++++++++++- 3 files changed, 413 insertions(+), 7 deletions(-) diff --git a/bindings/rust/src/daemon/journal.rs b/bindings/rust/src/daemon/journal.rs index 3de1c29..7072415 100644 --- a/bindings/rust/src/daemon/journal.rs +++ b/bindings/rust/src/daemon/journal.rs @@ -33,7 +33,7 @@ use serde::{Deserialize, Serialize}; use tracing::warn; use crate::query_graph::LipDatabase; -use crate::schema::OwnedAnnotationEntry; +use crate::schema::{OwnedAnnotationEntry, OwnedGraphEdge, OwnedOccurrence, OwnedSymbolInfo}; /// Compact the journal when it has accumulated this many entries. /// Below this threshold the overhead of compaction isn't worth it. 
@@ -49,6 +49,14 @@ pub enum JournalEntry { text: String, language: String, }, + UpsertFilePrecomputed { + uri: String, + language: String, + content_hash: String, + symbols: Vec, + occurrences: Vec, + edges: Vec, + }, RemoveFile { uri: String, }, @@ -168,9 +176,28 @@ pub fn compact(path: &Path, db: &LipDatabase) -> anyhow::Result { })?; } - // One UpsertFile per tracked file. + // One UpsertFile (or UpsertFilePrecomputed) per tracked file. for uri in db.tracked_uris() { - if let (Some(text), Some(lang)) = (db.file_text(&uri), db.file_language(&uri)) { + let Some(lang) = db.file_language(&uri) else { + continue; + }; + if db.is_precomputed(&uri) { + let content_hash = db + .file_content_hash(&uri) + .unwrap_or_default() + .to_owned(); + let symbols = db.cached_symbols(&uri).as_ref().clone(); + let occurrences = db.cached_occurrences(&uri).as_ref().clone(); + let edges = db.file_call_edges_raw(&uri); + write_entry(&JournalEntry::UpsertFilePrecomputed { + uri, + language: lang.to_owned(), + content_hash, + symbols, + occurrences, + edges, + })?; + } else if let Some(text) = db.file_text(&uri) { write_entry(&JournalEntry::UpsertFile { uri, text: text.to_owned(), @@ -208,6 +235,23 @@ pub fn replay(entries: &[JournalEntry], db: &mut LipDatabase) { } => { db.upsert_file(uri.clone(), text.clone(), language.clone()); } + JournalEntry::UpsertFilePrecomputed { + uri, + language, + content_hash, + symbols, + occurrences, + edges, + } => { + db.upsert_file_precomputed( + uri.clone(), + language.clone(), + content_hash.clone(), + symbols.clone(), + occurrences.clone(), + edges.clone(), + ); + } JournalEntry::RemoveFile { uri } => { db.remove_file(uri); } @@ -355,6 +399,85 @@ mod tests { assert_eq!(db2.current_merkle_root(), Some("abc")); } + #[test] + fn precomputed_survives_compact_replay() { + use crate::schema::{OwnedOccurrence, OwnedRange, OwnedSymbolInfo, Role, SymbolKind}; + + let tmp = NamedTempFile::new().unwrap(); + let path = tmp.path().to_owned(); + + let sym = 
OwnedSymbolInfo { + uri: "lip://local/lib.rs#Foo".into(), + display_name: "Foo".into(), + kind: SymbolKind::Function, + documentation: None, + signature: None, + confidence_score: 90, + relationships: vec![], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: false, + }; + let occ = OwnedOccurrence { + symbol_uri: "lip://local/lib.rs#Foo".into(), + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 3, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + }; + + // Write a precomputed entry. + let (mut j, _) = Journal::open(&path).unwrap(); + j.append(&JournalEntry::UpsertFilePrecomputed { + uri: "file:///project/lib.rs".into(), + language: "rust".into(), + content_hash: "abc123".into(), + symbols: vec![sym], + occurrences: vec![occ], + edges: vec![], + }) + .unwrap(); + drop(j); + + // Replay into db1. + let (_, entries) = Journal::open(&path).unwrap(); + let mut db1 = LipDatabase::new(); + replay(&entries, &mut db1); + assert_eq!(db1.file_count(), 1); + assert!(db1.is_precomputed("file:///project/lib.rs")); + let syms = db1.file_symbols("file:///project/lib.rs"); + assert_eq!(syms.len(), 1, "precomputed symbol must survive replay"); + + // Compact and replay into db2. 
+ compact(&path, &db1).unwrap(); + let (_, compacted) = Journal::open(&path).unwrap(); + let mut db2 = LipDatabase::new(); + replay(&compacted, &mut db2); + assert_eq!(db2.file_count(), 1); + assert!(db2.is_precomputed("file:///project/lib.rs")); + let syms2 = db2.file_symbols("file:///project/lib.rs"); + assert_eq!( + syms2.len(), + 1, + "precomputed symbol must survive compact + replay" + ); + assert_eq!(syms2[0].display_name, "Foo"); + + let results = db2.workspace_symbols("Foo", 10); + assert_eq!( + results.len(), + 1, + "precomputed symbol must be searchable after compact + replay" + ); + } + #[test] fn open_append_creates_file_if_absent() { let dir = tempfile::tempdir().unwrap(); diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index 59748d2..a0b571b 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -227,9 +227,35 @@ impl Session { let lang = document.language.clone(); let source_opt = document.source_text.clone(); + let has_precomputed = document.source_text.is_none() + && (!document.symbols.is_empty() + || !document.occurrences.is_empty()); + let content_hash = document.content_hash.clone(); + let symbols = document.symbols.clone(); + let occurrences = document.occurrences.clone(); + let edges = document.edges.clone(); + let workspace_root = { let mut db = self.db.lock().await; match action { + Action::Upsert if has_precomputed => { + self.journal_write(JournalEntry::UpsertFilePrecomputed { + uri: uri.clone(), + language: lang.clone(), + content_hash: content_hash.clone(), + symbols: symbols.clone(), + occurrences: occurrences.clone(), + edges: edges.clone(), + }); + db.upsert_file_precomputed( + uri.clone(), + lang.clone(), + content_hash, + symbols, + occurrences, + edges, + ); + } Action::Upsert => { let text = source_opt.clone().unwrap_or_default(); self.journal_write(JournalEntry::UpsertFile { diff --git a/bindings/rust/src/query_graph/db.rs 
b/bindings/rust/src/query_graph/db.rs index 8166bd8..4d2f6eb 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -17,8 +17,8 @@ use crate::query_graph::types::{ }; use crate::schema::EdgeKind; use crate::schema::{ - sha256_hex, OwnedAnnotationEntry, OwnedDependencySlice, OwnedOccurrence, OwnedRange, - OwnedSymbolInfo, Role, + sha256_hex, OwnedAnnotationEntry, OwnedDependencySlice, OwnedGraphEdge, OwnedOccurrence, + OwnedRange, OwnedSymbolInfo, Role, }; // ─── Helpers ───────────────────────────────────────────────────────────────── @@ -94,6 +94,12 @@ struct FileInput { language: String, /// Revision at which this input was last changed. revision: u64, + /// `true` when symbols/occurrences were supplied externally (SCIP import) + /// rather than derived from `text` by Tier 1. + precomputed: bool, + /// Content hash supplied by the caller (e.g. from `OwnedDocument.content_hash`). + /// Used by `stale_files` so Merkle sync works even when `text` is empty. + content_hash: String, } #[derive(Debug)] @@ -227,12 +233,15 @@ impl LipDatabase { pub fn upsert_file(&mut self, uri: String, text: String, language: String) { self.revision += 1; let rev = self.revision; + let content_hash = sha256_hex(text.as_bytes()); self.file_inputs.insert( uri.clone(), FileInput { text, language, revision: rev, + precomputed: false, + content_hash, }, ); // Invalidate the direct derived caches. api_cache is intentionally kept @@ -339,6 +348,121 @@ impl LipDatabase { self.file_embeddings.remove(&uri); } + /// Upsert a file whose symbols and occurrences are already computed + /// (e.g. SCIP import). Populates the same indexes as `upsert_file` but + /// skips the Tier 1 parser since the caller already provides the data. 
+ pub fn upsert_file_precomputed( + &mut self, + uri: String, + language: String, + content_hash: String, + symbols: Vec, + occurrences: Vec, + edges: Vec, + ) { + self.revision += 1; + let rev = self.revision; + self.file_inputs.insert( + uri.clone(), + FileInput { + text: String::new(), + language, + revision: rev, + precomputed: true, + content_hash, + }, + ); + + // Clear stale caches + def_index entries for this file. + self.sym_cache.remove(&uri); + self.occ_cache.remove(&uri); + let stale_defs: Vec = self + .def_index + .iter() + .filter(|(_, (furi, _))| furi == &uri) + .map(|(sym_uri, _)| sym_uri.clone()) + .collect(); + for sym_uri in &stale_defs { + let name = extract_name(sym_uri); + if let Some(uris) = self.name_to_symbols.get_mut(name) { + uris.retain(|u| u != sym_uri); + if uris.is_empty() { + self.name_to_symbols.remove(name); + } + } + } + self.def_index.retain(|_, (furi, _)| furi != &uri); + + // Build def_index + name_to_symbols from pre-computed occurrences. + let occs = Arc::new(occurrences); + for occ in occs.iter() { + if occ.role == Role::Definition { + self.def_index + .insert(occ.symbol_uri.clone(), (uri.clone(), occ.range.clone())); + let name = extract_name(&occ.symbol_uri).to_owned(); + if !name.is_empty() { + self.name_to_symbols + .entry(name) + .or_default() + .push(occ.symbol_uri.clone()); + } + } + } + self.occ_cache + .insert(uri.clone(), Cached::new(occs.clone(), rev)); + + // Seed sym_cache so file_symbols() returns the pre-computed symbols. + let syms = Arc::new(symbols); + self.sym_cache + .insert(uri.clone(), Cached::new(syms, rev)); + + // Consumed-names index (same as upsert_file). 
+ { + let mut consumed: HashSet = HashSet::new(); + for occ in occs.iter().filter(|o| o.role == Role::Reference) { + let name = extract_name(&occ.symbol_uri); + if name.is_empty() { + continue; + } + let is_external = self + .def_index + .get(&occ.symbol_uri) + .map(|(def_file, _)| def_file != &uri) + .unwrap_or(true); + if is_external { + consumed.insert(name.to_owned()); + } + } + self.file_consumed_names.insert(uri.clone(), consumed); + } + + // Call-edge indexes from pre-computed edges. + self.remove_file_call_edges(&uri); + let mut pairs: Vec<(String, String)> = Vec::new(); + for edge in edges.iter().filter(|e| e.kind == EdgeKind::Calls) { + self.callee_to_callers + .entry(edge.to_uri.clone()) + .or_default() + .push(edge.from_uri.clone()); + let callee_name = extract_name(&edge.to_uri).to_owned(); + if !callee_name.is_empty() { + self.callee_name_to_callers + .entry(callee_name) + .or_default() + .push(edge.from_uri.clone()); + } + pairs.push((edge.from_uri.clone(), edge.to_uri.clone())); + } + self.file_call_edges.insert(uri.clone(), pairs); + + let now_ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as i64) + .unwrap_or(0); + self.file_indexed_at.insert(uri.clone(), now_ms); + self.file_embeddings.remove(&uri); + } + pub fn remove_file(&mut self, uri: &str) { self.revision += 1; self.file_inputs.remove(uri); @@ -402,7 +526,16 @@ impl LipDatabase { /// Returns the source text stored for `uri`, or `None` if not indexed. 
pub fn file_source_text(&self, uri: &str) -> Option { - self.file_inputs.get(uri).map(|f| f.text.clone()) + let fi = self.file_inputs.get(uri)?; + if fi.precomputed && fi.text.is_empty() { + if let Some(path) = uri.strip_prefix("file://") { + if let Ok(text) = std::fs::read_to_string(path) { + return Some(text); + } + } + return None; + } + Some(fi.text.clone()) } pub fn set_workspace_root(&mut self, root: PathBuf) { @@ -475,6 +608,48 @@ impl LipDatabase { self.file_inputs.keys().cloned().collect() } + pub fn is_precomputed(&self, uri: &str) -> bool { + self.file_inputs.get(uri).is_some_and(|f| f.precomputed) + } + + pub fn file_content_hash(&self, uri: &str) -> Option<&str> { + self.file_inputs.get(uri).map(|f| f.content_hash.as_str()) + } + + /// Read-only access to cached symbols (for journal compaction). + pub fn cached_symbols(&self, uri: &str) -> Arc> { + self.sym_cache + .get(uri) + .map(|c| c.value.clone()) + .unwrap_or_default() + } + + /// Read-only access to cached occurrences (for journal compaction). + pub fn cached_occurrences(&self, uri: &str) -> Arc> { + self.occ_cache + .get(uri) + .map(|c| c.value.clone()) + .unwrap_or_default() + } + + /// Return stored call-edge pairs for a file (for journal compaction). + pub fn file_call_edges_raw(&self, uri: &str) -> Vec { + self.file_call_edges + .get(uri) + .map(|pairs| { + pairs + .iter() + .map(|(from, to)| OwnedGraphEdge { + from_uri: from.clone(), + to_uri: to.clone(), + kind: EdgeKind::Calls, + at_range: OwnedRange::default(), + }) + .collect() + }) + .unwrap_or_default() + } + /// Merkle sync probe: given a slice of `(uri, client_content_hash)` pairs, /// returns URIs that are stale (daemon hash ≠ client hash) or unknown to /// the daemon (never indexed). The client should re-Delta each returned URI. 
@@ -484,7 +659,7 @@ impl LipDatabase { .filter(|(uri, client_hash)| { match self.file_inputs.get(uri) { None => true, // daemon has never seen this file - Some(fi) => sha256_hex(fi.text.as_bytes()) != *client_hash, + Some(fi) => fi.content_hash != *client_hash, } }) .map(|(uri, _)| uri.clone()) @@ -3376,4 +3551,86 @@ impl Greeter { let (total, _, _) = db.coverage("/project/src"); assert_eq!(total, 1, "should only count files under /project/src"); } + + // ── Precomputed upsert (SCIP import path) ──────────────────────────── + + #[test] + fn precomputed_symbols_appear_in_search() { + let mut db = LipDatabase::new(); + let uri = "file:///project/lib.rs".to_owned(); + let sym_uri = "lip://local/lib.rs#MyStruct".to_owned(); + let symbols = vec![OwnedSymbolInfo { + uri: sym_uri.clone(), + display_name: "MyStruct".into(), + kind: SymbolKind::Class, + documentation: None, + signature: None, + confidence_score: 90, + relationships: vec![], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: false, + }]; + let occurrences = vec![OwnedOccurrence { + symbol_uri: sym_uri.clone(), + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 8, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + }]; + + db.upsert_file_precomputed(uri.clone(), "rust".into(), "hash123".into(), symbols, occurrences, vec![]); + + let syms = db.file_symbols(&uri); + assert_eq!(syms.len(), 1); + assert_eq!(syms[0].display_name, "MyStruct"); + + let results = db.workspace_symbols("MyStruct", 10); + assert_eq!(results.len(), 1, "pre-computed symbol must appear in workspace search"); + + assert!( + db.symbol_definition_location(&sym_uri).is_some(), + "pre-computed definition must be resolvable" + ); + } + + #[test] + fn precomputed_upsert_is_idempotent() { + let mut db = LipDatabase::new(); + let uri = "file:///project/lib.rs".to_owned(); + let sym = OwnedSymbolInfo { + uri: 
"lip://local/lib.rs#Foo".into(), + display_name: "Foo".into(), + kind: SymbolKind::Function, + documentation: None, + signature: None, + confidence_score: 90, + relationships: vec![], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: false, + }; + let occ = OwnedOccurrence { + symbol_uri: "lip://local/lib.rs#Foo".into(), + range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + }; + + db.upsert_file_precomputed(uri.clone(), "rust".into(), "hash1".into(), vec![sym.clone()], vec![occ.clone()], vec![]); + db.upsert_file_precomputed(uri.clone(), "rust".into(), "hash1".into(), vec![sym], vec![occ], vec![]); + + let results = db.workspace_symbols("Foo", 10); + assert_eq!(results.len(), 1, "re-upsert must not duplicate symbols"); + } } From ce30619ee796312acecd061b1ae5cef948b89050 Mon Sep 17 00:00:00 2001 From: Lisa Date: Thu, 16 Apr 2026 09:14:07 +0200 Subject: [PATCH 02/18] docs: document intentional Tier 2 skip for SCIP and lossy SCIP export - session.rs: explain why Tier 2 verification is skipped for pre-computed SCIP imports (source_opt is None by design, SCIP emitters are authoritative) - export.rs: document that SCIP round-trips lose CPG edges since the SCIP wire format has no edge representation Co-Authored-By: Claude Opus 4.6 (1M context) --- bindings/rust/src/daemon/session.rs | 5 +++++ tools/lip-cli/src/cmd/export.rs | 8 +++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index a0b571b..d20d2be 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -291,6 +291,11 @@ impl Session { } // Enqueue Tier 2 verification for supported languages on upsert. + // Skipped for pre-computed imports (SCIP): source_opt is None so + // the (Some(tx), Some(source)) guard below won't fire. 
This is + // intentional — SCIP emitters are authoritative; re-verifying via + // a local language server would be redundant and may not have the + // right project context. if matches!(action, Action::Upsert) { let needs_tier2 = lang == "rust" || uri.ends_with(".rs") diff --git a/tools/lip-cli/src/cmd/export.rs b/tools/lip-cli/src/cmd/export.rs index 2c952b9..b07b88a 100644 --- a/tools/lip-cli/src/cmd/export.rs +++ b/tools/lip-cli/src/cmd/export.rs @@ -91,9 +91,11 @@ fn convert_document(doc: OwnedDocument) -> scip::Document { let occurrences: Vec = doc.occurrences.iter().map(convert_occurrence).collect(); - // Note: scip::Document has no `text` field in the generated proto; source - // text is not part of the SCIP wire format at this schema version. - let _ = doc.source_text; // present in LIP, absent in SCIP + // SCIP has no representation for source text or CPG edges; both are + // LIP-only. A SCIP round-trip (import → export) is therefore lossy for + // call-graph / blast-radius data. + let _ = doc.source_text; + let _ = doc.edges; scip::Document { language: doc.language, relative_path: uri_to_relative_path(&doc.uri), From e7d6cf50ae0a6aa73bf51ed45bde48b0c6594bd1 Mon Sep 17 00:00:00 2001 From: Lisa Date: Thu, 16 Apr 2026 09:44:41 +0200 Subject: [PATCH 03/18] feat: SCIP integration test, Tier 2 test harness, name-dep invalidation, CI workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five improvements in one batch: 1. SCIP integration test — end-to-end test proving pre-computed symbols from Delta are searchable via WorkspaceSymbols and resolvable via QueryDefinition (regression coverage for the import path fix) 2. Proto fix — Relationship.is_override → is_definition to match upstream SCIP field 5 semantics; export mapping updated accordingly 3. 
SCIP CI action — reusable GitHub Actions workflow (.github/workflows/scip-import.yml) that runs a SCIP indexer (rust/typescript/python), starts a LIP daemon, and pushes the index at confidence 100 4. Tier 2 test harness — 14 unit tests for the verification manager: routing dispatch, channel backpressure, confidence elevation, symbol upgrade merging, backend unavailability 5. Name-dep invalidation — new invalidated_files_for() query answering "which files break if these symbols change" using the existing file_consumed_names index; wired into the daemon protocol as QueryInvalidatedFiles / InvalidatedFilesResult Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/scip-import.yml | 154 ++++++ bindings/rust/src/daemon/session.rs | 15 + bindings/rust/src/daemon/tier2_manager.rs | 548 ++++++++++++++++++++++ bindings/rust/src/query_graph/db.rs | 133 ++++++ bindings/rust/src/query_graph/types.rs | 18 + bindings/rust/tests/integration.rs | 147 +++++- tools/lip-cli/src/cmd/export.rs | 2 +- tools/lip-cli/src/proto/scip.proto | 2 +- 8 files changed, 1016 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/scip-import.yml diff --git a/.github/workflows/scip-import.yml b/.github/workflows/scip-import.yml new file mode 100644 index 0000000..ff47c90 --- /dev/null +++ b/.github/workflows/scip-import.yml @@ -0,0 +1,154 @@ +name: SCIP Import + +on: + workflow_call: + inputs: + scip-tool: + description: "SCIP indexer to run" + required: true + type: string # rust | typescript | python + scip-file-path: + description: "Path to the SCIP index file" + required: false + type: string + default: ".scip/index.scip" + daemon-socket: + description: "Unix socket path for the LIP daemon" + required: false + type: string + default: "/tmp/lip-ci.sock" + confidence: + description: "Confidence score for imported symbols (1-100)" + required: false + type: number + default: 100 + + workflow_dispatch: + inputs: + scip-tool: + description: "SCIP indexer to run" + required: 
true + type: choice + options: + - rust + - typescript + - python + scip-file-path: + description: "Path to the SCIP index file" + required: false + type: string + default: ".scip/index.scip" + daemon-socket: + description: "Unix socket path for the LIP daemon" + required: false + type: string + default: "/tmp/lip-ci.sock" + confidence: + description: "Confidence score for imported symbols (1-100)" + required: false + type: number + default: 100 + +env: + CARGO_TERM_COLOR: always + CARGO_INCREMENTAL: 0 + +jobs: + scip-import: + name: SCIP Import (${{ inputs.scip-tool }}) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + # ── Install lip CLI ────────────────────────────────────────────────────── + - uses: actions/cache@v4 + id: lip-cache + with: + path: ~/.cargo/bin/lip + key: lip-cli-${{ runner.os }}-${{ hashFiles('.github/workflows/scip-import.yml') }} + + - name: Install lip CLI + if: steps.lip-cache.outputs.cache-hit != 'true' + run: cargo install lip-cli --locked + + # ── Start LIP daemon ───────────────────────────────────────────────────── + - name: Start LIP daemon + run: | + lip daemon --socket ${{ inputs.daemon-socket }} & + DAEMON_PID=$! + echo "DAEMON_PID=$DAEMON_PID" >> "$GITHUB_ENV" + # Wait for socket to appear + for i in $(seq 1 30); do + [ -S "${{ inputs.daemon-socket }}" ] && break + sleep 0.2 + done + if [ ! -S "${{ inputs.daemon-socket }}" ]; then + echo "::error::Daemon socket did not appear within 6s" + exit 1 + fi + + # ── Install and run SCIP indexer ───────────────────────────────────────── + - name: Install Rust toolchain + if: inputs.scip-tool == 'rust' + uses: dtolnay/rust-toolchain@stable + with: + components: rust-analyzer + + - name: Run scip-rust indexer + if: inputs.scip-tool == 'rust' + run: | + mkdir -p "$(dirname '${{ inputs.scip-file-path }}')" + rust-analyzer scip . 
+ # rust-analyzer writes to index.scip in cwd; move if needed + if [ "${{ inputs.scip-file-path }}" != "index.scip" ] && [ -f index.scip ]; then + mv index.scip "${{ inputs.scip-file-path }}" + fi + + - name: Setup Node.js + if: inputs.scip-tool == 'typescript' + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Run scip-typescript indexer + if: inputs.scip-tool == 'typescript' + run: | + npm install -g @sourcegraph/scip-typescript + mkdir -p "$(dirname '${{ inputs.scip-file-path }}')" + scip-typescript index --output "${{ inputs.scip-file-path }}" + + - name: Setup Python + if: inputs.scip-tool == 'python' + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Run scip-python indexer + if: inputs.scip-tool == 'python' + run: | + pip install scip-python + mkdir -p "$(dirname '${{ inputs.scip-file-path }}')" + scip-python index --output "${{ inputs.scip-file-path }}" + + # ── Import into daemon ─────────────────────────────────────────────────── + - name: Import SCIP index into LIP daemon + run: | + lip import \ + --from-scip "${{ inputs.scip-file-path }}" \ + --push-to-daemon "${{ inputs.daemon-socket }}" \ + --confidence ${{ inputs.confidence }} + + # ── Verify import ──────────────────────────────────────────────────────── + - name: Verify index status + run: | + echo '[{"type":"query_index_status"}]' | \ + lip query --socket "${{ inputs.daemon-socket }}" batch + + # ── Cleanup ────────────────────────────────────────────────────────────── + - name: Stop daemon + if: always() + run: | + if [ -n "$DAEMON_PID" ]; then + kill "$DAEMON_PID" 2>/dev/null || true + wait "$DAEMON_PID" 2>/dev/null || true + fi diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index d20d2be..cf916cf 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -419,6 +419,14 @@ impl Session { ServerMessage::DeadSymbolsResult { symbols } } + ClientMessage::QueryInvalidatedFiles { + 
changed_symbol_uris, + } => { + let db = self.db.lock().await; + let file_uris = db.invalidated_files_for(&changed_symbol_uris); + ServerMessage::InvalidatedFilesResult { file_uris } + } + // ── Annotations ─────────────────────────────────────────────── ClientMessage::AnnotationSet { symbol_uri, @@ -1895,6 +1903,13 @@ fn process_query_sync( ok(ServerMessage::DeadSymbolsResult { symbols }) } + ClientMessage::QueryInvalidatedFiles { + changed_symbol_uris, + } => { + let file_uris = db.invalidated_files_for(&changed_symbol_uris); + ok(ServerMessage::InvalidatedFilesResult { file_uris }) + } + // ── Annotations ─────────────────────────────────────────────────── ClientMessage::AnnotationSet { symbol_uri, diff --git a/bindings/rust/src/daemon/tier2_manager.rs b/bindings/rust/src/daemon/tier2_manager.rs index b021476..23b8ddd 100644 --- a/bindings/rust/src/daemon/tier2_manager.rs +++ b/bindings/rust/src/daemon/tier2_manager.rs @@ -604,3 +604,551 @@ impl Tier2Manager { } } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::schema::OwnedSymbolInfo; + use std::sync::Arc; + use tokio::sync::{broadcast, mpsc, Mutex}; + + // ── Helpers ────────────────────────────────────────────────────────────── + + /// Build a minimal `Tier2Manager` with all backends disabled. + /// + /// This is the primary test fixture: every backend is marked permanently + /// disabled so that `handle_*` returns immediately without attempting to + /// spawn a language server process. This lets us exercise routing, channel + /// behaviour and broadcast logic in isolation. 
+ fn manager_all_disabled() -> (Tier2Manager, mpsc::Sender) { + let (tx, rx) = mpsc::channel(CHANNEL_CAPACITY); + let (notify_tx, _) = broadcast::channel(16); + let db = Arc::new(Mutex::new(LipDatabase::new())); + + let mut backends = Tier2Backends::new(); + backends.rust_disabled = true; + backends.typescript_disabled = true; + backends.python_disabled = true; + backends.dart_disabled = true; + backends.clangd_disabled = true; + backends.gopls_disabled = true; + backends.kotlin_disabled = true; + backends.swift_disabled = true; + + let mgr = Tier2Manager { + db, + rx, + backends, + notify_tx: Some(notify_tx), + }; + (mgr, tx) + } + + fn make_job(uri: &str) -> VerificationJob { + VerificationJob { + uri: uri.to_owned(), + source: String::new(), + workspace_root: None, + version: 1, + } + } + + fn make_symbol(uri: &str, confidence: u8) -> OwnedSymbolInfo { + OwnedSymbolInfo { + uri: uri.to_owned(), + display_name: uri.rsplit('#').next().unwrap_or(uri).to_owned(), + kind: crate::schema::SymbolKind::Function, + documentation: None, + signature: None, + confidence_score: confidence, + relationships: vec![], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: false, + } + } + + // ── Job routing ────────────────────────────────────────────────────────── + + /// Verify that `handle` dispatches to the correct backend for every + /// supported file extension. Because all backends are disabled, each + /// handler returns immediately — the absence of a panic proves the + /// routing path was reached (the disabled-flag early-return is the first + /// line in every `handle_*` method). 
+ #[tokio::test] + async fn routing_dispatches_to_correct_backend() { + let (mut mgr, _tx) = manager_all_disabled(); + + // Rust + mgr.handle(make_job("file:///src/main.rs")).await; + + // TypeScript family + mgr.handle(make_job("file:///src/index.ts")).await; + mgr.handle(make_job("file:///src/App.tsx")).await; + mgr.handle(make_job("file:///src/util.js")).await; + mgr.handle(make_job("file:///src/App.jsx")).await; + mgr.handle(make_job("file:///src/esm.mjs")).await; + mgr.handle(make_job("file:///src/cjs.cjs")).await; + + // Python + mgr.handle(make_job("file:///src/app.py")).await; + + // Dart + mgr.handle(make_job("file:///lib/main.dart")).await; + + // C / C++ + mgr.handle(make_job("file:///src/main.c")).await; + mgr.handle(make_job("file:///src/lib.h")).await; + mgr.handle(make_job("file:///src/main.cpp")).await; + mgr.handle(make_job("file:///src/util.cc")).await; + mgr.handle(make_job("file:///src/core.cxx")).await; + mgr.handle(make_job("file:///src/api.hpp")).await; + mgr.handle(make_job("file:///src/api.hxx")).await; + + // Go + mgr.handle(make_job("file:///cmd/main.go")).await; + + // Kotlin + mgr.handle(make_job("file:///src/Main.kt")).await; + mgr.handle(make_job("file:///build.gradle.kts")).await; + + // Swift + mgr.handle(make_job("file:///Sources/App.swift")).await; + } + + /// Files with unknown extensions should be silently ignored — no panic, + /// no error, no backend touched. + #[tokio::test] + async fn routing_unknown_extension_is_noop() { + let (mut mgr, _tx) = manager_all_disabled(); + + mgr.handle(make_job("file:///README.md")).await; + mgr.handle(make_job("file:///data.json")).await; + mgr.handle(make_job("file:///Makefile")).await; + } + + // ── Channel behaviour ──────────────────────────────────────────────────── + + /// When the bounded channel is full, `try_send` must fail (Err) rather + /// than blocking the caller. 
+ #[tokio::test] + async fn full_channel_drops_jobs() { + let (tx, _rx) = mpsc::channel::(CHANNEL_CAPACITY); + + // Fill the channel to capacity. + for i in 0..CHANNEL_CAPACITY { + let job = VerificationJob { + uri: format!("file:///src/file_{i}.rs"), + source: String::new(), + workspace_root: None, + version: 1, + }; + tx.try_send(job).expect("channel should accept up to capacity"); + } + + // The next try_send must fail — this is the documented contract. + let overflow = VerificationJob { + uri: "file:///src/overflow.rs".to_owned(), + source: String::new(), + workspace_root: None, + version: 1, + }; + assert!( + tx.try_send(overflow).is_err(), + "try_send on a full channel must return Err, not block" + ); + } + + // ── Backend unavailability ─────────────────────────────────────────────── + + /// When a backend's `disabled` flag is set (binary not found), calling + /// `handle` with a matching file must return gracefully — no panic, no + /// spawn attempt. + #[tokio::test] + async fn disabled_backend_skips_gracefully() { + let (mut mgr, _tx) = manager_all_disabled(); + + // Explicitly verify each disabled backend short-circuits. 
+ assert!(mgr.backends.rust_disabled); + mgr.handle(make_job("file:///src/lib.rs")).await; + assert!(mgr.backends.rust.is_none(), "no backend should be created"); + + assert!(mgr.backends.typescript_disabled); + mgr.handle(make_job("file:///src/app.ts")).await; + assert!(mgr.backends.typescript.is_none()); + + assert!(mgr.backends.python_disabled); + mgr.handle(make_job("file:///src/app.py")).await; + assert!(mgr.backends.python.is_none()); + + assert!(mgr.backends.dart_disabled); + mgr.handle(make_job("file:///lib/main.dart")).await; + assert!(mgr.backends.dart.is_none()); + + assert!(mgr.backends.clangd_disabled); + mgr.handle(make_job("file:///src/main.c")).await; + assert!(mgr.backends.clangd.is_none()); + + assert!(mgr.backends.gopls_disabled); + mgr.handle(make_job("file:///cmd/main.go")).await; + assert!(mgr.backends.gopls.is_none()); + + assert!(mgr.backends.kotlin_disabled); + mgr.handle(make_job("file:///src/Main.kt")).await; + assert!(mgr.backends.kotlin.is_none()); + + assert!(mgr.backends.swift_disabled); + mgr.handle(make_job("file:///Sources/App.swift")).await; + assert!(mgr.backends.swift.is_none()); + } + + // ── Confidence elevation (broadcast) ───────────────────────────────────── + + /// When a Tier 2 upgrade raises a symbol's confidence, the manager must + /// broadcast a `SymbolUpgraded` message with the correct old/new scores. + #[tokio::test] + async fn broadcast_upgrades_fires_on_confidence_increase() { + let (notify_tx, mut notify_rx) = broadcast::channel(16); + let db = Arc::new(Mutex::new(LipDatabase::new())); + + let file_uri = "file:///src/lib.rs"; + let sym_uri = "lip://local//src/lib.rs#foo"; + + // Seed the database with a Tier 1 symbol at confidence 40. 
+ { + let mut db = db.lock().await; + db.upsert_file_precomputed( + file_uri.to_owned(), + "rust".to_owned(), + "abc123".to_owned(), + vec![make_symbol(sym_uri, 40)], + vec![], + vec![], + ); + } + + let mgr = Tier2Manager { + db: db.clone(), + rx: mpsc::channel(1).1, + backends: Tier2Backends::new(), + notify_tx: Some(notify_tx), + }; + + // Simulate a Tier 2 upgrade to confidence 90. + let upgrades = vec![make_symbol(sym_uri, 90)]; + { + let mut db = db.lock().await; + mgr.broadcast_upgrades(file_uri, &upgrades, &mut db); + } + + let msg = notify_rx.try_recv().expect("should receive a broadcast"); + match msg { + ServerMessage::SymbolUpgraded { + uri, + old_confidence, + new_confidence, + } => { + assert_eq!(uri, sym_uri); + assert_eq!(old_confidence, 40); + assert_eq!(new_confidence, 90); + } + other => panic!("expected SymbolUpgraded, got {other:?}"), + } + } + + /// No broadcast should fire when the upgrade does NOT raise confidence + /// (e.g. a stale Tier 2 result arriving after a SCIP push already set + /// the symbol to confidence 95). + #[tokio::test] + async fn broadcast_upgrades_silent_when_confidence_not_raised() { + let (notify_tx, mut notify_rx) = broadcast::channel(16); + let db = Arc::new(Mutex::new(LipDatabase::new())); + + let file_uri = "file:///src/lib.rs"; + let sym_uri = "lip://local//src/lib.rs#bar"; + + // Seed at confidence 95 (SCIP push). + { + let mut db = db.lock().await; + db.upsert_file_precomputed( + file_uri.to_owned(), + "rust".to_owned(), + "abc123".to_owned(), + vec![make_symbol(sym_uri, 95)], + vec![], + vec![], + ); + } + + let mgr = Tier2Manager { + db: db.clone(), + rx: mpsc::channel(1).1, + backends: Tier2Backends::new(), + notify_tx: Some(notify_tx), + }; + + // "Upgrade" to 90 — this is actually a downgrade, no broadcast. 
+ let upgrades = vec![make_symbol(sym_uri, 90)]; + { + let mut db = db.lock().await; + mgr.broadcast_upgrades(file_uri, &upgrades, &mut db); + } + + assert!( + notify_rx.try_recv().is_err(), + "no broadcast should fire when the upgrade does not raise confidence" + ); + } + + /// When there are no broadcast receivers, `broadcast_upgrades` must + /// short-circuit without reading from the db (the receiver_count check). + #[tokio::test] + async fn broadcast_upgrades_noop_without_receivers() { + let (notify_tx, _) = broadcast::channel::(16); + let db = Arc::new(Mutex::new(LipDatabase::new())); + + // Drop the only receiver so receiver_count == 0. + // (The `_` binding above was never subscribed to.) + drop(notify_tx.subscribe()); // subscribe then immediately drop + + let mgr = Tier2Manager { + db: db.clone(), + rx: mpsc::channel(1).1, + backends: Tier2Backends::new(), + notify_tx: Some(notify_tx), + }; + + let upgrades = vec![make_symbol("lip://local//src/lib.rs#baz", 90)]; + { + let mut db = db.lock().await; + // Should not panic even though "file:///src/lib.rs" is not in the db. + mgr.broadcast_upgrades("file:///src/lib.rs", &upgrades, &mut db); + } + } + + /// When `notify_tx` is `None`, `broadcast_upgrades` must be a no-op. + #[tokio::test] + async fn broadcast_upgrades_noop_when_notifications_disabled() { + let db = Arc::new(Mutex::new(LipDatabase::new())); + + let mgr = Tier2Manager { + db: db.clone(), + rx: mpsc::channel(1).1, + backends: Tier2Backends::new(), + notify_tx: None, + }; + + let upgrades = vec![make_symbol("lip://local//src/lib.rs#baz", 90)]; + { + let mut db = db.lock().await; + mgr.broadcast_upgrades("file:///src/lib.rs", &upgrades, &mut db); + } + // No panic = pass. + } + + // ── Symbol upgrade merging (LipDatabase::upgrade_file_symbols) ─────────── + + /// `upgrade_file_symbols` must raise confidence and merge signature, + /// documentation and relationships from Tier 2 results into existing + /// Tier 1 symbols. 
+ #[tokio::test] + async fn upgrade_merges_signature_and_confidence() { + let mut db = LipDatabase::new(); + + let file_uri = "file:///src/lib.rs"; + let sym_uri = "lip://local//src/lib.rs#process"; + + let tier1 = OwnedSymbolInfo { + uri: sym_uri.to_owned(), + display_name: "process".to_owned(), + kind: crate::schema::SymbolKind::Function, + documentation: None, + signature: None, + confidence_score: 40, + relationships: vec![], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: false, + }; + + db.upsert_file_precomputed( + file_uri.to_owned(), + "rust".to_owned(), + "hash1".to_owned(), + vec![tier1], + vec![], + vec![], + ); + + // Simulate Tier 2 upgrade with signature and doc. + let upgrade = OwnedSymbolInfo { + uri: sym_uri.to_owned(), + display_name: "process".to_owned(), + kind: crate::schema::SymbolKind::Function, + documentation: Some("Process the input data.".to_owned()), + signature: Some("pub fn process(input: &[u8]) -> Result<()>".to_owned()), + confidence_score: 90, + relationships: vec![crate::schema::OwnedRelationship { + target_uri: "lip://local//src/types.rs#Result".to_owned(), + is_type_definition: true, + is_reference: false, + is_implementation: false, + is_override: false, + }], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: true, + }; + + db.upgrade_file_symbols(file_uri, &[upgrade]); + + let symbols = db.file_symbols(file_uri); + assert_eq!(symbols.len(), 1); + let sym = &symbols[0]; + assert_eq!(sym.confidence_score, 90, "confidence must be elevated"); + assert_eq!( + sym.signature.as_deref(), + Some("pub fn process(input: &[u8]) -> Result<()>"), + "signature must be merged from Tier 2" + ); + assert_eq!( + sym.documentation.as_deref(), + Some("Process the input data."), + "documentation must be merged from Tier 2" + ); + assert_eq!(sym.relationships.len(), 1, "relationships must be merged"); + 
assert!(sym.relationships[0].is_type_definition); + } + + /// `upgrade_file_symbols` must NOT downgrade a symbol that already has a + /// higher confidence (e.g. from a SCIP push at 95). + #[tokio::test] + async fn upgrade_does_not_downgrade_confidence() { + let mut db = LipDatabase::new(); + + let file_uri = "file:///src/lib.rs"; + let sym_uri = "lip://local//src/lib.rs#hi_conf"; + + let existing = OwnedSymbolInfo { + uri: sym_uri.to_owned(), + display_name: "hi_conf".to_owned(), + kind: crate::schema::SymbolKind::Function, + documentation: Some("Already documented.".to_owned()), + signature: Some("fn hi_conf() -> u32".to_owned()), + confidence_score: 95, + relationships: vec![], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: false, + }; + + db.upsert_file_precomputed( + file_uri.to_owned(), + "rust".to_owned(), + "hash2".to_owned(), + vec![existing], + vec![], + vec![], + ); + + // Tier 2 arrives late with a lower confidence. + let stale = OwnedSymbolInfo { + uri: sym_uri.to_owned(), + display_name: "hi_conf".to_owned(), + kind: crate::schema::SymbolKind::Function, + documentation: None, + signature: Some("fn hi_conf() -> u32".to_owned()), + confidence_score: 70, + relationships: vec![], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: false, + }; + + db.upgrade_file_symbols(file_uri, &[stale]); + + let symbols = db.file_symbols(file_uri); + let sym = &symbols[0]; + assert_eq!( + sym.confidence_score, 95, + "confidence must not be downgraded" + ); + assert_eq!( + sym.documentation.as_deref(), + Some("Already documented."), + "existing documentation must be preserved" + ); + } + + /// `upgrade_file_symbols` with an empty upgrade slice is a no-op. 
+ #[tokio::test] + async fn upgrade_empty_is_noop() { + let mut db = LipDatabase::new(); + + let file_uri = "file:///src/lib.rs"; + db.upsert_file_precomputed( + file_uri.to_owned(), + "rust".to_owned(), + "hash3".to_owned(), + vec![make_symbol("lip://local//src/lib.rs#x", 40)], + vec![], + vec![], + ); + + db.upgrade_file_symbols(file_uri, &[]); + + let symbols = db.file_symbols(file_uri); + assert_eq!(symbols[0].confidence_score, 40, "nothing should change"); + } + + /// `upgrade_file_symbols` on a URI not in the database is a no-op. + #[tokio::test] + async fn upgrade_unknown_file_is_noop() { + let mut db = LipDatabase::new(); + let sym = make_symbol("lip://local//unknown.rs#foo", 90); + // Must not panic. + db.upgrade_file_symbols("file:///unknown.rs", &[sym]); + } + + // ── Tier2Backends default state ────────────────────────────────────────── + + /// Fresh `Tier2Backends` must have all backends as `None` and all + /// disabled flags as `false` — backends are lazily initialised. + #[test] + fn backends_default_state() { + let b = Tier2Backends::new(); + assert!(b.rust.is_none()); + assert!(!b.rust_disabled); + assert!(b.typescript.is_none()); + assert!(!b.typescript_disabled); + assert!(b.python.is_none()); + assert!(!b.python_disabled); + assert!(b.dart.is_none()); + assert!(!b.dart_disabled); + assert!(b.clangd.is_none()); + assert!(!b.clangd_disabled); + assert!(b.gopls.is_none()); + assert!(!b.gopls_disabled); + assert!(b.kotlin.is_none()); + assert!(!b.kotlin_disabled); + assert!(b.swift.is_none()); + assert!(!b.swift_disabled); + } + + // ── Channel capacity constant ──────────────────────────────────────────── + + #[test] + fn channel_capacity_is_64() { + assert_eq!(CHANNEL_CAPACITY, 64); + } +} diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index 4d2f6eb..fa3cd0a 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -1052,6 +1052,26 @@ impl LipDatabase { .collect() } + /// 
Given a set of changed symbol URIs, return the deduplicated set of file + /// URIs that need re-verification because they consume at least one of the + /// changed names. + /// + /// This is the public entry-point for symbol-level invalidation (Kotlin IC + /// model). It extracts the display name from each symbol URI via + /// `extract_name`, then delegates to the `file_consumed_names` index. + pub fn invalidated_files_for(&self, changed_symbol_uris: &[String]) -> Vec { + let names: HashSet<&str> = changed_symbol_uris + .iter() + .map(|uri| extract_name(uri)) + .filter(|n| !n.is_empty()) + .collect(); + if names.is_empty() { + return vec![]; + } + let name_refs: Vec<&str> = names.into_iter().collect(); + self.files_consuming_names(&name_refs) + } + // ── Embedding / observability ───────────────────────────────────────── /// Store a pre-computed embedding vector for a file, recording which model produced it. @@ -3633,4 +3653,117 @@ impl Greeter { let results = db.workspace_symbols("Foo", 10); assert_eq!(results.len(), 1, "re-upsert must not duplicate symbols"); } + + // ── Symbol-level invalidation ──────────────────────────────────────── + + #[test] + fn invalidated_files_for_returns_consumers() { + // File A defines `fn foo()`, File B references `foo`. + // Changing `foo` must invalidate B. 
+ let mut db = LipDatabase::new(); + + // File A: defines foo + let uri_a = "lip://local/a.rs".to_owned(); + let sym_foo = OwnedSymbolInfo::new("lip://local/a.rs#foo", "foo"); + let occ_def = OwnedOccurrence { + symbol_uri: "lip://local/a.rs#foo".into(), + range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + }; + db.upsert_file_precomputed( + uri_a.clone(), "rust".into(), "h1".into(), + vec![sym_foo], vec![occ_def], vec![], + ); + + // File B: references foo (defined in A → external) + let uri_b = "lip://local/b.rs".to_owned(); + let occ_ref = OwnedOccurrence { + symbol_uri: "lip://local/a.rs#foo".into(), + range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + confidence_score: 80, + role: Role::Reference, + override_doc: None, + }; + db.upsert_file_precomputed( + uri_b.clone(), "rust".into(), "h2".into(), + vec![], vec![occ_ref], vec![], + ); + + let invalidated = db.invalidated_files_for(&["lip://local/a.rs#foo".into()]); + assert_eq!(invalidated, vec![uri_b]); + } + + #[test] + fn invalidated_files_for_unreferenced_symbol() { + // File C defines `fn bar()`, no one references it. + // Changing `bar` invalidates nothing. 
+ let mut db = LipDatabase::new(); + + let uri_c = "lip://local/c.rs".to_owned(); + let sym_bar = OwnedSymbolInfo::new("lip://local/c.rs#bar", "bar"); + let occ_def = OwnedOccurrence { + symbol_uri: "lip://local/c.rs#bar".into(), + range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + }; + db.upsert_file_precomputed( + uri_c.clone(), "rust".into(), "h1".into(), + vec![sym_bar], vec![occ_def], vec![], + ); + + let invalidated = db.invalidated_files_for(&["lip://local/c.rs#bar".into()]); + assert!(invalidated.is_empty(), "unreferenced symbol should invalidate nothing"); + } + + #[test] + fn remove_file_clears_consumed_names() { + // After removing a file, its consumed-names entries must be gone, + // so it no longer appears in invalidation results. + let mut db = LipDatabase::new(); + + // File A: defines foo + let uri_a = "lip://local/a.rs".to_owned(); + let sym_foo = OwnedSymbolInfo::new("lip://local/a.rs#foo", "foo"); + let occ_def = OwnedOccurrence { + symbol_uri: "lip://local/a.rs#foo".into(), + range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + }; + db.upsert_file_precomputed( + uri_a.clone(), "rust".into(), "h1".into(), + vec![sym_foo], vec![occ_def], vec![], + ); + + // File B: references foo + let uri_b = "lip://local/b.rs".to_owned(); + let occ_ref = OwnedOccurrence { + symbol_uri: "lip://local/a.rs#foo".into(), + range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + confidence_score: 80, + role: Role::Reference, + override_doc: None, + }; + db.upsert_file_precomputed( + uri_b.clone(), "rust".into(), "h2".into(), + vec![], vec![occ_ref], vec![], + ); + + // Sanity: B is invalidated before removal + assert_eq!( + db.invalidated_files_for(&["lip://local/a.rs#foo".into()]), + vec![uri_b.clone()], + ); + + // Remove B — its consumed-names 
entry should be cleaned up + db.remove_file(&uri_b); + + let invalidated = db.invalidated_files_for(&["lip://local/a.rs#foo".into()]); + assert!(invalidated.is_empty(), "removed file must not appear in invalidation results"); + } } diff --git a/bindings/rust/src/query_graph/types.rs b/bindings/rust/src/query_graph/types.rs index d0447f4..b16990c 100644 --- a/bindings/rust/src/query_graph/types.rs +++ b/bindings/rust/src/query_graph/types.rs @@ -203,6 +203,13 @@ pub enum ServerMessage { DeadSymbolsResult { symbols: Vec, }, + /// Response to [`ClientMessage::QueryInvalidatedFiles`]. + /// + /// Contains the deduplicated set of file URIs that consume at least one of + /// the changed symbol names and therefore need re-verification. + InvalidatedFilesResult { + file_uris: Vec, + }, AnnotationAck, AnnotationValue { value: Option, @@ -678,6 +685,12 @@ pub enum ClientMessage { QueryDeadSymbols { limit: Option, }, + /// Given a list of changed symbol URIs, return the file URIs that consume + /// those symbols and need re-verification (Kotlin IC model). + /// Returns `InvalidatedFilesResult`. + QueryInvalidatedFiles { + changed_symbol_uris: Vec, + }, AnnotationSet { symbol_uri: String, key: String, @@ -1114,6 +1127,7 @@ impl ClientMessage { "query_workspace_symbols", "query_document_symbols", "query_dead_symbols", + "query_invalidated_files", "annotation_set", "annotation_get", "annotation_list", @@ -1183,6 +1197,7 @@ impl ClientMessage { ClientMessage::QueryWorkspaceSymbols { .. } => "query_workspace_symbols", ClientMessage::QueryDocumentSymbols { .. } => "query_document_symbols", ClientMessage::QueryDeadSymbols { .. } => "query_dead_symbols", + ClientMessage::QueryInvalidatedFiles { .. } => "query_invalidated_files", ClientMessage::AnnotationSet { .. } => "annotation_set", ClientMessage::AnnotationGet { .. } => "annotation_get", ClientMessage::AnnotationList { .. 
} => "annotation_list", @@ -1445,6 +1460,9 @@ mod tests { }, ClientMessage::QueryDocumentSymbols { uri: String::new() }, ClientMessage::QueryDeadSymbols { limit: None }, + ClientMessage::QueryInvalidatedFiles { + changed_symbol_uris: vec![], + }, ClientMessage::AnnotationSet { symbol_uri: String::new(), key: String::new(), diff --git a/bindings/rust/tests/integration.rs b/bindings/rust/tests/integration.rs index 5285bce..19a1e8d 100644 --- a/bindings/rust/tests/integration.rs +++ b/bindings/rust/tests/integration.rs @@ -6,7 +6,10 @@ use tokio::net::UnixStream; use lip_core::daemon::LipDaemon; use lip_core::query_graph::{ClientMessage, ErrorCode, ServerMessage}; -use lip_core::schema::{Action, IndexingState, OwnedDocument}; +use lip_core::schema::{ + Action, IndexingState, OwnedDocument, OwnedOccurrence, OwnedRange, OwnedSymbolInfo, Role, + SymbolKind, +}; // ─── Framing helpers (client side) ─────────────────────────────────────────── @@ -886,3 +889,145 @@ async fn unknown_variant_returns_unknown_message_and_keeps_connection() { task.abort(); let _ = task.await; } + +// ─── SCIP import: pre-computed symbols via Delta ───────────────────────────── + +/// Regression test for the SCIP import path. When a client sends a Delta with +/// `source_text: None` and pre-computed `symbols` + `occurrences`, the daemon +/// must store them verbatim (via `upsert_file_precomputed`) and make them +/// queryable through both `WorkspaceSymbols` and `QueryDefinition`. 
+#[tokio::test] +async fn scip_import_precomputed_symbols_searchable() { + let dir = tempfile::tempdir().expect("tempdir"); + let socket_path = dir.path().join("lip_scip.sock"); + + let daemon = LipDaemon::new(&socket_path); + let task = tokio::spawn(async move { daemon.run().await.ok() }); + tokio::time::sleep(Duration::from_millis(20)).await; + + let mut client = UnixStream::connect(&socket_path).await.expect("connect"); + + // ── Build a pre-computed document (SCIP-style: no source_text) ─────────── + let uri = "lip://local/dep@1.0/scip_mod.rs"; + let symbol_uri = format!("{uri}#ScipWidget"); + + let doc = OwnedDocument { + uri: uri.to_owned(), + content_hash: "cafebabe01234567".to_owned(), + language: "rust".to_owned(), + symbols: vec![OwnedSymbolInfo { + uri: symbol_uri.clone(), + display_name: "ScipWidget".to_owned(), + kind: SymbolKind::Class, + documentation: Some("A widget from SCIP import.".to_owned()), + signature: Some("pub struct ScipWidget".to_owned()), + confidence_score: 100, + relationships: vec![], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: true, + }], + occurrences: vec![OwnedOccurrence { + symbol_uri: symbol_uri.clone(), + range: OwnedRange { + start_line: 0, + start_char: 11, + end_line: 0, + end_char: 21, + }, + confidence_score: 100, + role: Role::Definition, + override_doc: None, + }], + merkle_path: uri.to_owned(), + edges: vec![], + source_text: None, // <-- key: SCIP imports have no source + }; + + // ── Send the Delta ─────────────────────────────────────────────────────── + send( + &mut client, + &ClientMessage::Delta { + seq: 100, + action: Action::Upsert, + document: doc, + }, + ) + .await + .expect("send scip delta"); + + let resp = recv(&mut client).await.expect("recv scip delta ack"); + match resp { + ServerMessage::DeltaAck { seq, accepted, .. 
} => { + assert_eq!(seq, 100); + assert!(accepted, "daemon rejected pre-computed delta"); + } + other => panic!("expected DeltaAck, got {other:?}"), + } + + // ── WorkspaceSymbols: the pre-computed symbol must be discoverable ─────── + send( + &mut client, + &ClientMessage::QueryWorkspaceSymbols { + query: "ScipWidget".to_owned(), + limit: Some(10), + }, + ) + .await + .expect("send workspace symbols query"); + + let resp = recv(&mut client).await.expect("recv workspace symbols"); + match resp { + ServerMessage::WorkspaceSymbolsResult { symbols } => { + assert!( + !symbols.is_empty(), + "expected ScipWidget in workspace symbols, got none" + ); + assert!( + symbols.iter().any(|s| s.display_name == "ScipWidget"), + "ScipWidget not found in results: {symbols:?}" + ); + } + other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), + } + + // ── QueryDefinition: the Definition-role occurrence must resolve ───────── + send( + &mut client, + &ClientMessage::QueryDefinition { + uri: uri.to_owned(), + line: 0, + col: 15, // inside the occurrence range [11..21] + }, + ) + .await + .expect("send query definition"); + + let resp = recv(&mut client).await.expect("recv definition result"); + match resp { + ServerMessage::DefinitionResult { + symbol, + location_uri, + .. 
+ } => { + assert!( + symbol.is_some(), + "expected symbol info for ScipWidget, got None" + ); + let sym = symbol.unwrap(); + assert_eq!(sym.display_name, "ScipWidget"); + assert_eq!( + location_uri.as_deref(), + Some(uri), + "definition should resolve to the same file" + ); + } + other => panic!("expected DefinitionResult, got {other:?}"), + } + + // ── Cleanup ────────────────────────────────────────────────────────────── + task.abort(); + let _ = task.await; +} diff --git a/tools/lip-cli/src/cmd/export.rs b/tools/lip-cli/src/cmd/export.rs index b07b88a..dd1e453 100644 --- a/tools/lip-cli/src/cmd/export.rs +++ b/tools/lip-cli/src/cmd/export.rs @@ -113,7 +113,7 @@ fn convert_symbol_info(sym: &OwnedSymbolInfo) -> scip::SymbolInformation { is_reference: r.is_reference, is_implementation: r.is_implementation, is_type_definition: r.is_type_definition, - is_override: r.is_override, + is_definition: r.is_override, }) .collect(); diff --git a/tools/lip-cli/src/proto/scip.proto b/tools/lip-cli/src/proto/scip.proto index 0106348..ef77107 100644 --- a/tools/lip-cli/src/proto/scip.proto +++ b/tools/lip-cli/src/proto/scip.proto @@ -131,7 +131,7 @@ message Relationship { bool is_reference = 2; bool is_implementation = 3; bool is_type_definition = 4; - bool is_override = 5; + bool is_definition = 5; } message Occurrence { From 537d9378840d19c0368f4547581ff9f7abab4df7 Mon Sep 17 00:00:00 2001 From: Lisa Date: Fri, 17 Apr 2026 10:56:51 +0200 Subject: [PATCH 04/18] feat: QueryBlastRadiusBatch with semantic enrichment for CKB integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New protocol message that computes blast radius for all symbols defined in the given changed files in one call. When min_score is present, each file's embedding is compared against the index and neighbours above the threshold are returned as semantic_items with a source tier (static / semantic / both). 
Designed for CKB's BlastRadiusEnricher: one round-trip prefetch in reviewPR, static callers stay authoritative for thresholds, semantic callers are advisory with per-item confidence. Wire format: → query_blast_radius_batch { changed_file_uris, min_score? } ← blast_radius_batch_result { results: [EnrichedBlastRadius] } Co-Authored-By: Claude Opus 4.6 (1M context) --- bindings/rust/src/daemon/session.rs | 18 +++++++ bindings/rust/src/query_graph/db.rs | 68 +++++++++++++++++++++++++- bindings/rust/src/query_graph/types.rs | 58 ++++++++++++++++++++++ 3 files changed, 143 insertions(+), 1 deletion(-) diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index cf916cf..ba362f5 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -400,6 +400,16 @@ impl Session { ServerMessage::BlastRadiusResult(result) } + ClientMessage::QueryBlastRadiusBatch { + changed_file_uris, + min_score, + } => { + let mut db = self.db.lock().await; + let results = + db.blast_radius_batch(&changed_file_uris, min_score); + ServerMessage::BlastRadiusBatchResult { results } + } + ClientMessage::QueryWorkspaceSymbols { query, limit } => { let limit = limit.unwrap_or(100); let mut db = self.db.lock().await; @@ -1887,6 +1897,14 @@ fn process_query_sync( ok(ServerMessage::BlastRadiusResult(result)) } + ClientMessage::QueryBlastRadiusBatch { + changed_file_uris, + min_score, + } => { + let results = db.blast_radius_batch(&changed_file_uris, min_score); + ok(ServerMessage::BlastRadiusBatchResult { results }) + } + ClientMessage::QueryWorkspaceSymbols { query, limit } => { let limit = limit.unwrap_or(100); let syms = db.workspace_symbols(&query, limit); diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index fa3cd0a..480716a 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -13,7 +13,8 @@ use std::sync::Arc; use crate::indexer::{language::Language, 
Tier1Indexer}; use crate::query_graph::types::{ - ApiSurface, BlastRadiusResult, ImpactItem, RiskLevel, SimilarSymbol, + ApiSurface, BlastRadiusResult, EnrichedBlastRadius, ImpactItem, ImpactSource, RiskLevel, + SemanticImpactItem, SimilarSymbol, }; use crate::schema::EdgeKind; use crate::schema::{ @@ -1022,6 +1023,71 @@ impl LipDatabase { } } + /// Batch blast-radius for all symbols defined in the given files, + /// optionally enriched with embedding-based semantic coupling. + /// + /// When `min_score` is `Some(threshold)`, each changed file's embedding + /// is compared against the index and neighbours above the threshold are + /// returned as `semantic_items`. Omit to get static-only results. + pub fn blast_radius_batch( + &mut self, + changed_file_uris: &[String], + min_score: Option, + ) -> Vec { + let mut results = Vec::new(); + let mut seen_symbols: HashSet = HashSet::new(); + let threshold = min_score.unwrap_or(0.6); + + for file_uri in changed_file_uris { + let syms = self.file_symbols(file_uri); + for sym in syms.iter() { + if !seen_symbols.insert(sym.uri.clone()) { + continue; + } + let static_result = self.blast_radius_for(&sym.uri); + + let mut semantic_items = Vec::new(); + if min_score.is_some() { + if let Some(embedding) = self.file_embeddings.get(file_uri).cloned() { + let static_files: HashSet<&str> = static_result + .affected_files + .iter() + .map(|s| s.as_str()) + .collect(); + + let neighbours = self.nearest_by_vector( + &embedding, + 20, + Some(file_uri), + None, + Some(threshold), + ); + + for neighbour in neighbours { + let source = if static_files.contains(neighbour.uri.as_str()) { + ImpactSource::Both + } else { + ImpactSource::Semantic + }; + semantic_items.push(SemanticImpactItem { + file_uri: neighbour.uri, + symbol_uri: String::new(), + similarity: neighbour.score, + source, + }); + } + } + } + + results.push(EnrichedBlastRadius { + static_result, + semantic_items, + }); + } + } + results + } + /// Find the symbol URI whose 
occurrence range contains `(line, col)` in `uri`. /// /// Returns `None` if no occurrence covers the given position. diff --git a/bindings/rust/src/query_graph/types.rs b/bindings/rust/src/query_graph/types.rs index b16990c..0700eb8 100644 --- a/bindings/rust/src/query_graph/types.rs +++ b/bindings/rust/src/query_graph/types.rs @@ -63,6 +63,40 @@ impl ImpactItem { } } +/// How an impact item was discovered. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ImpactSource { + /// Discovered via static call graph / dependency analysis. + Static, + /// Discovered via embedding similarity (semantic coupling). + Semantic, + /// Confirmed by both static analysis and semantic similarity. + Both, +} + +/// A single entry in a batch blast-radius result. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EnrichedBlastRadius { + /// The static blast-radius result. + #[serde(flatten)] + pub static_result: BlastRadiusResult, + /// Semantically coupled files; each item's `source` tells whether the static graph also reaches it. + /// Empty when the request omitted `min_score` or embeddings are unavailable. + pub semantic_items: Vec<SemanticImpactItem>, +} + +/// An impact item discovered through embedding similarity rather than +/// static call-graph edges. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SemanticImpactItem { + pub file_uri: String, + pub symbol_uri: String, + /// Cosine similarity in [0.0, 1.0]. + pub similarity: f32, + pub source: ImpactSource, +} + /// A single nearest-neighbor hit returned by `ServerMessage::NearestResult`.
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct NearestItem { @@ -194,6 +228,9 @@ pub enum ServerMessage { symbol: Option, }, BlastRadiusResult(BlastRadiusResult), + BlastRadiusBatchResult { + results: Vec<EnrichedBlastRadius>, + }, WorkspaceSymbolsResult { symbols: Vec, }, @@ -675,6 +712,21 @@ pub enum ClientMessage { QueryBlastRadius { symbol_uri: String, }, + /// Batch blast-radius for all symbols defined in the given files. + /// Optionally enriched with embedding-based semantic coupling. + /// Returns `BlastRadiusBatchResult`. + /// + /// When `min_score` is present, semantic enrichment is enabled: + /// each changed file's embedding is compared against the index and + /// neighbours above the threshold are included as `semantic_items`. + /// Omit or set to `null` to skip semantic enrichment. + QueryBlastRadiusBatch { + changed_file_uris: Vec<String>, + /// Minimum cosine similarity for semantic hits. No default applies: + /// presence enables semantic enrichment, absence skips it entirely. + #[serde(default)] + min_score: Option<f32>, + }, QueryWorkspaceSymbols { query: String, limit: Option, @@ -1124,6 +1176,7 @@ impl ClientMessage { "query_references", "query_hover", "query_blast_radius", + "query_blast_radius_batch", "query_workspace_symbols", "query_document_symbols", "query_dead_symbols", @@ -1194,6 +1247,7 @@ impl ClientMessage { ClientMessage::QueryReferences { .. } => "query_references", ClientMessage::QueryHover { .. } => "query_hover", ClientMessage::QueryBlastRadius { .. } => "query_blast_radius", + ClientMessage::QueryBlastRadiusBatch { .. } => "query_blast_radius_batch", ClientMessage::QueryWorkspaceSymbols { .. } => "query_workspace_symbols", ClientMessage::QueryDocumentSymbols { .. } => "query_document_symbols", ClientMessage::QueryDeadSymbols { .. 
} => "query_dead_symbols", @@ -1454,6 +1508,10 @@ mod tests { ClientMessage::QueryBlastRadius { symbol_uri: String::new(), }, + ClientMessage::QueryBlastRadiusBatch { + changed_file_uris: vec![], + min_score: None, + }, ClientMessage::QueryWorkspaceSymbols { query: String::new(), limit: None, From 6b5c24580d36c1523274e438219a74f4220bcc96 Mon Sep 17 00:00:00 2001 From: Lisa Date: Fri, 17 Apr 2026 11:32:12 +0200 Subject: [PATCH 05/18] docs: document QueryBlastRadiusBatch, add symbol kind filter, update changelog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - LIP_SPEC.mdx §8.1.1: batch blast radius with semantic enrichment, symbol kind filtering rationale, embedding scope note (file-level today, per-function when chunked embeddings land) - daemon.mdx: add QueryBlastRadiusBatch to protocol message table - CHANGELOG.md: document all unreleased changes (SCIP fixes, journal persistence, name-dep invalidation, blast radius batch) - db.rs: filter blast_radius_batch to Function/Method/Class/Interface/ Constructor/Macro kinds; add embedding scope comment Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 13 +++++++++ bindings/rust/src/query_graph/db.rs | 23 +++++++++++++++ docs/LIP_SPEC.mdx | 43 +++++++++++++++++++++++++++++ website/src/pages/docs/daemon.mdx | 1 + 4 files changed, 80 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7f10d9..c0e38e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,19 @@ All notable changes to this project are documented here. ### Added +- **`QueryBlastRadiusBatch`** — batch blast radius for the symbols defined in changed files, with optional semantic enrichment via file embeddings. Accepts `changed_file_uris` and optional `min_score` threshold. 
Resolves symbols server-side (filtered to Function, Method, Class, Interface, Constructor, Macro), runs structural BFS per symbol, and when `min_score` is set, augments results with cosine-similarity neighbours from the file embedding index. Each semantic hit carries a `source` field (`"semantic"` or `"both"`) so consumers can distinguish certainty tiers. Spec §8.1.1. +- **`QueryInvalidatedFiles`** — name-based dependency tracking query. Given a set of changed symbol URIs, returns file URIs that consumed those names externally (Kotlin-IC inspired). Enables symbol-level re-verification without full reindex. +- **`JournalEntry::UpsertFilePrecomputed`** — journal variant that persists pre-computed symbols, occurrences, and CPG edges from SCIP imports. Fixes data loss on daemon restart for SCIP-imported files. + +### Fixed + +- **SCIP proto field numbers** — `SymbolInformation.relationships` (2→4), `kind` (4→5), `display_name` (5→6) aligned with upstream SCIP. Fixes protobuf decode crash (`LengthDelimited where Varint expected`) when importing any index produced by a spec-compliant SCIP emitter. +- **SCIP proto `Relationship.is_override`** → `is_definition` to match upstream field 5 semantics. +- **SCIP import pre-computed symbol persistence** — Delta handler now routes pre-computed documents through `upsert_file_precomputed`, populating sym_cache, occ_cache, def_index, name_to_symbols, and call-edge indexes. Previously, SCIP-imported symbols were silently dropped. +- **Journal replay for SCIP imports** — pre-computed symbols now survive daemon restart via `UpsertFilePrecomputed` journal entry. +- **Merkle stale_files** — uses stored `content_hash` instead of hashing empty text for pre-computed files. Fixes infinite re-sync loop. +- **file_source_text** — falls back to disk read for precomputed `file://` URIs. Fixes stream_context, embeddings, and explain-match for SCIP-imported files. 
+ - **`EndStreamReason::CursorOutOfRange`** and **`EndStreamReason::FileNotIndexed`** — split the previously-conflated `Error + "cursor_out_of_range"` emission into two typed reasons. Before, a cursor past EOF and a URI the daemon had never indexed both surfaced as `reason: error, error: "cursor_out_of_range"`; clients could not distinguish "user gave bad coordinates" from "daemon has nothing for this path." Now: - `CursorOutOfRange` — the file is indexed but the cursor line is outside its range. Error message reports the actual line count. - `FileNotIndexed` — the daemon has no record of the URI. Error message names the URI. Callers should upsert or reindex, then retry. diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index 480716a..f3fe254 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -1038,9 +1038,28 @@ impl LipDatabase { let mut seen_symbols: HashSet = HashSet::new(); let threshold = min_score.unwrap_or(0.6); + // Only resolve symbols whose kind produces meaningful blast-radius + // results. Variables, constants, parameters, and type aliases are + // excluded — they're dominated by framework wiring noise. + use crate::schema::SymbolKind; + let interesting = |k: SymbolKind| { + matches!( + k, + SymbolKind::Function + | SymbolKind::Method + | SymbolKind::Class + | SymbolKind::Interface + | SymbolKind::Constructor + | SymbolKind::Macro + ) + }; + for file_uri in changed_file_uris { let syms = self.file_symbols(file_uri); for sym in syms.iter() { + if !interesting(sym.kind) { + continue; + } if !seen_symbols.insert(sym.uri.clone()) { continue; } @@ -1048,6 +1067,10 @@ impl LipDatabase { let mut semantic_items = Vec::new(); if min_score.is_some() { + // NOTE: searches file-level embeddings, not per-symbol embeddings. 
+ // Per-function chunked embeddings aren't integrated yet; when they + // land, this upgrades without a wire format change (semantic_items + // already carries symbol_uri, currently empty at file granularity). if let Some(embedding) = self.file_embeddings.get(file_uri).cloned() { let static_files: HashSet<&str> = static_result .affected_files diff --git a/docs/LIP_SPEC.mdx b/docs/LIP_SPEC.mdx index 3ecbc18..b36707d 100644 --- a/docs/LIP_SPEC.mdx +++ b/docs/LIP_SPEC.mdx @@ -662,6 +662,7 @@ lip.query.definition(uri: string, position: Range) → SymbolInfo lip.query.references(symbol_uri: string, limit?: int) → [Occurrence] lip.query.hover(uri: string, position: Range) → HoverResult lip.query.blast_radius(symbol_uri: string) → BlastRadiusResult +lip.query.blast_radius_batch(changed_file_uris: [string], min_score?: f32) → BlastRadiusBatchResult lip.query.subgraph(symbol_uri: string, depth: int) → SymbolGraph lip.query.taint(symbol_uri: string) → [TaintPath] lip.query.workspace_symbols(query: string, limit?: int) → [SymbolInfo] @@ -758,6 +759,48 @@ lip.query.blast_radius(symbol_uri) → { Available as a pre-commit hook: "Changing this interface will affect 47 call sites across 12 files and 3 microservices." +#### 8.1.1 Batch blast radius with semantic enrichment + +``` +lip.query.blast_radius_batch(changed_file_uris, min_score?) → { + results: [{ + symbol_uri: string, + direct_dependents: int, + transitive_dependents: int, + affected_files: [string], + direct_items: [ImpactItem], + transitive_items: [ImpactItem], + risk_level: "low" | "medium" | "high", + truncated: bool, + semantic_items: [SemanticItem], // only when min_score is set + }] +} +``` + +Accepts changed **files** and resolves exported symbols server-side — one round-trip +regardless of symbol count. Only symbols with kind `Function`, `Method`, `Class`, +`Interface`, `Constructor`, or `Macro` are resolved; variables, constants, and +parameters are excluded to avoid framework-wiring noise. 
+ +When `min_score` is present, the daemon runs each changed file's embedding against +the file embedding index and includes neighbours above the threshold as +`semantic_items`. Each semantic item carries a `source` field: + +- **`"semantic"`** — found only via embedding similarity, not in the structural + call graph. Catches dynamic dispatch, macro-expanded call sites, and template + instantiation chains that SCIP's static graph cannot resolve. +- **`"both"`** — found in both the structural graph and the embedding search, + confirming a structural edge with semantic evidence. + +**Embedding scope:** Semantic enrichment currently operates at file-level granularity +(searching `file_embeddings`). Per-function chunked embeddings are not yet integrated +into this path; when they are, the `symbol_uri` field in `SemanticItem` (currently +empty) will carry the matched symbol, with no wire format change required. + +**Consumer contract (CKB):** `UniqueCallerCount` (which drives review thresholds) +stays structural-only. Semantic callers are additive — they inform the human reviewer +("8 callers (+3 semantically coupled)") but do not inflate risk scores. + ### 8.2 Taint tracking Symbols can be annotated with `taint_labels` (e.g. `["PII", "UNSAFE_IO"]`). LIP diff --git a/website/src/pages/docs/daemon.mdx b/website/src/pages/docs/daemon.mdx index bf6e527..de908c1 100644 --- a/website/src/pages/docs/daemon.mdx +++ b/website/src/pages/docs/daemon.mdx @@ -93,6 +93,7 @@ Each connection handles one request/response pair. 
The protocol is synchronous p | `QueryReferences` | Find all references to a symbol URI | | `QueryHover` | Hover info at (uri, line, col) | | `QueryBlastRadius` | Blast radius for a symbol URI | +| `QueryBlastRadiusBatch` | Batch blast radius for changed files, with optional semantic enrichment (§8.1.1) | | `QueryWorkspaceSymbols` | Search symbols by name | | `QueryDocumentSymbols` | List symbols in a file | | `QueryDeadSymbols` | Find unreferenced symbols | From d7937bafa779b2adddbe427b6d1f3dcdc7227b61 Mon Sep 17 00:00:00 2001 From: Lisa Date: Tue, 21 Apr 2026 02:20:14 +0200 Subject: [PATCH 06/18] =?UTF-8?q?feat:=20v2.2.0=20=E2=80=94=20function-lev?= =?UTF-8?q?el=20BR,=20ReindexStale,=20BatchFileStatus,=20QueryAbiHash,=20T?= =?UTF-8?q?ier1.5,=20backoff?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tier 1: - NearestItem.embedding_model: per-hit model provenance on all nearest-neighbour results - blast_radius_batch: symbol-level semantic enrichment when symbol_embeddings available; SemanticImpactItem.symbol_uri now non-empty at function granularity, falls back to file-level - ReindexStale { uris, max_age_seconds } → ReindexStaleResult { reindexed, skipped }: atomic check-then-reindex replacing the QueryFileStatus → ReindexFiles race Tier 2: - BatchFileStatus { uris } → BatchFileStatusResult { entries: Vec }: multi-file status in one round-trip, batchable - Tier 2 backoff recovery: all 8 LSP backends recover from crashes with exponential backoff (2–300s); permanently disabled only after 8 consecutive failures (BackoffState struct) Tier 3: - QueryAbiHash { uri } → AbiHashResult { uri, hash }: SHA-256 over exported symbol surface, stable recompilation trigger (batchable); Kotlin IC-style ABI fingerprinting - LipDatabase::run_tier1_5_inference(): Datalog fixed-point loop — callee elevation when all callers ≥ 80 confidence, exported-leaf +5 bump; ceiling 65 (Tier 1.5 level) All new variants wired into variant_tag, 
supported_messages, is_batchable, and the BatchQuery sync handler. 313 unit tests + 14 integration tests green, clippy clean, fmt clean. Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 16 + bindings/rust/src/daemon/journal.rs | 5 +- bindings/rust/src/daemon/session.rs | 109 ++++++- bindings/rust/src/daemon/tier2_manager.rs | 296 +++++++++++++++--- bindings/rust/src/query_graph/db.rs | 365 +++++++++++++++++++--- bindings/rust/src/query_graph/types.rs | 219 +++++++++++++ 6 files changed, 911 insertions(+), 99 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0e38e3..da33c77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,22 @@ All notable changes to this project are documented here. ### Added +- **`NearestItem.embedding_model`** — every nearest-neighbour hit now carries the model name that produced its stored embedding. Field is optional / `skip_serializing_if = None`; older clients see no change. Populated by `nearest_by_vector`, `nearest_symbol_by_vector`, and `outliers`. Useful for debugging mixed-model indexes and confirming which model was used for a specific result. + +- **Function-level blast radius** (`QueryBlastRadiusBatch`) — semantic enrichment now uses per-symbol embeddings when available. If `EmbeddingBatch` has been called with `lip://` URIs (function-level chunks), `semantic_items[].symbol_uri` is populated and results are at function granularity. Falls back to file-level embeddings when no symbol embeddings exist, so the upgrade is transparent. + +- **`ReindexStale`** — atomic "reindex if stale" operation. Accepts `uris` and `max_age_seconds`; re-reads from disk only the URIs that are not indexed or whose last-indexed timestamp exceeds the threshold. Returns `ReindexStaleResult { reindexed, skipped }`. Pass `max_age_seconds = 0` to force unconditional reindex. Replaces the manual `QueryFileStatus` → `ReindexFiles` race. + +- **`BatchFileStatus`** — query index status for multiple files in one round-trip. 
Equivalent to issuing `QueryFileStatus` inside a `Batch`, but without message-per-file overhead. Batchable. Returns `BatchFileStatusResult { entries: Vec<FileStatusEntry> }`. + +- **`QueryAbiHash`** — stable hex hash (SHA-256) over a file's exported API surface (exported symbol URIs + kinds + signatures, sorted). A change in hash means the public interface changed — safe as a downstream recompilation or re-verification trigger (Kotlin IC model). Returns `AbiHashResult { uri, hash: Option<String> }`. Batchable. + +- **Tier 1.5 Datalog inference** — `LipDatabase::run_tier1_5_inference()` runs a fixed-point inference loop applying two rules: (1) if every direct caller of a symbol is at confidence ≥ 80 (Tier 2 / SCIP quality), raise the callee to confidence 65; (2) exported symbols with no local callers are raised by 5 points (capped at 65). Never lowers confidence; never exceeds the Tier 1.5 ceiling, leaving headroom for Tier 2. + +- **Tier 2 backoff recovery** — language server backends now recover from transient crashes with exponential backoff (2–300 s, up to 8 failures) instead of being permanently disabled for the session lifetime. `disabled_*` flags are kept for hard failures (binary not installed). A `BackoffState` struct tracks `failure_count` and `available_after` per backend. Tests: `backoff_fresh_is_available`, `backoff_fail_makes_unavailable`, `backoff_reset_clears_state`, `backoff_permanent_after_8_failures`, `backoff_not_permanent_before_8_failures`. + +- **`FileStatusEntry`** — new public struct carrying the same fields as `FileStatusResult` but suitable for use inside `BatchFileStatusResult`. + - **`QueryBlastRadiusBatch`** — batch blast radius for all exported symbols in changed files, with optional semantic enrichment via file embeddings. Accepts `changed_file_uris` and optional `min_score` threshold.
Resolves symbols server-side (filtered to Function, Method, Class, Interface, Constructor, Macro), runs structural BFS per symbol, and when `min_score` is set, augments results with cosine-similarity neighbours from the file embedding index. Each semantic hit carries a `source` field (`"semantic"` or `"both"`) so consumers can distinguish certainty tiers. Spec §8.1.1. - **`QueryInvalidatedFiles`** — name-based dependency tracking query. Given a set of changed symbol URIs, returns file URIs that consumed those names externally (Kotlin-IC inspired). Enables symbol-level re-verification without full reindex. - **`JournalEntry::UpsertFilePrecomputed`** — journal variant that persists pre-computed symbols, occurrences, and CPG edges from SCIP imports. Fixes data loss on daemon restart for SCIP-imported files. diff --git a/bindings/rust/src/daemon/journal.rs b/bindings/rust/src/daemon/journal.rs index 7072415..f3c0e1c 100644 --- a/bindings/rust/src/daemon/journal.rs +++ b/bindings/rust/src/daemon/journal.rs @@ -182,10 +182,7 @@ pub fn compact(path: &Path, db: &LipDatabase) -> anyhow::Result { continue; }; if db.is_precomputed(&uri) { - let content_hash = db - .file_content_hash(&uri) - .unwrap_or_default() - .to_owned(); + let content_hash = db.file_content_hash(&uri).unwrap_or_default().to_owned(); let symbols = db.cached_symbols(&uri).as_ref().clone(); let occurrences = db.cached_occurrences(&uri).as_ref().clone(); let edges = db.file_call_edges_raw(&uri); diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index ba362f5..eba70e1 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -228,8 +228,7 @@ impl Session { let source_opt = document.source_text.clone(); let has_precomputed = document.source_text.is_none() - && (!document.symbols.is_empty() - || !document.occurrences.is_empty()); + && (!document.symbols.is_empty() || !document.occurrences.is_empty()); let content_hash = 
document.content_hash.clone(); let symbols = document.symbols.clone(); let occurrences = document.occurrences.clone(); @@ -405,8 +404,7 @@ impl Session { min_score, } => { let mut db = self.db.lock().await; - let results = - db.blast_radius_batch(&changed_file_uris, min_score); + let results = db.blast_radius_batch(&changed_file_uris, min_score); ServerMessage::BlastRadiusBatchResult { results } } @@ -628,7 +626,7 @@ impl Session { }; let vectors: Vec>> = cached_hits .into_iter() - .zip(texts_needed.into_iter()) + .zip(texts_needed) .map(|(cached, needed)| { if let Some(v) = cached { Some(v) @@ -1163,6 +1161,7 @@ impl Session { Some(crate::query_graph::types::NearestItem { uri: c.clone(), score, + embedding_model: None, }) }) .collect(); @@ -1368,6 +1367,7 @@ impl Session { Some(crate::query_graph::types::NearestItem { uri: store_uri.clone(), score, + embedding_model: None, }) }) .collect(); @@ -1539,7 +1539,7 @@ impl Session { let q_norm: f32 = query_vec.iter().map(|x| x * x).sum::().sqrt(); let mut scored: Vec = raw_chunks .into_iter() - .zip(chunk_vecs.into_iter()) + .zip(chunk_vecs) .filter_map(|((start_line, end_line, chunk_text), vec)| { if vec.len() != query_vec.len() || q_norm == 0.0 { return None; @@ -1610,6 +1610,73 @@ impl Session { error: None, } } + + // ── v2.2 features ───────────────────────────────────────────── + ClientMessage::ReindexStale { + uris, + max_age_seconds, + } => { + let mut reindexed = Vec::new(); + let mut skipped = Vec::new(); + for uri in &uris { + let is_stale = { + let db = self.db.lock().await; + let (indexed, _, age_seconds) = db.file_status(uri); + !indexed || age_seconds.map(|age| age > max_age_seconds).unwrap_or(true) + }; + if is_stale { + let Some(path) = uri_to_path(uri) else { + skipped.push(uri.clone()); + continue; + }; + let Ok(text) = std::fs::read_to_string(&path) else { + warn!("ReindexStale: could not read {}", path.display()); + skipped.push(uri.clone()); + continue; + }; + let lang = { + use 
crate::indexer::language::Language; + Language::detect(uri, "").as_str().to_owned() + }; + let mut db = self.db.lock().await; + db.upsert_file(uri.clone(), text, lang); + reindexed.push(uri.clone()); + } else { + skipped.push(uri.clone()); + } + } + debug!( + "ReindexStale: reindexed {}/{} files", + reindexed.len(), + uris.len() + ); + ServerMessage::ReindexStaleResult { reindexed, skipped } + } + + ClientMessage::BatchFileStatus { uris } => { + let db = self.db.lock().await; + let entries = uris + .into_iter() + .map(|uri| { + let (indexed, has_embedding, age_seconds) = db.file_status(&uri); + let embedding_model = db.file_embedding_model(&uri).map(str::to_owned); + crate::query_graph::types::FileStatusEntry { + uri, + indexed, + has_embedding, + age_seconds, + embedding_model, + } + }) + .collect(); + ServerMessage::BatchFileStatusResult { entries } + } + + ClientMessage::QueryAbiHash { uri } => { + let mut db = self.db.lock().await; + let hash = db.abi_hash(&uri); + ServerMessage::AbiHashResult { uri, hash } + } } } @@ -2249,6 +2316,7 @@ fn process_query_sync( Some(crate::query_graph::types::NearestItem { uri: c.clone(), score, + embedding_model: None, }) }) .collect(); @@ -2344,6 +2412,7 @@ fn process_query_sync( Some(crate::query_graph::types::NearestItem { uri: su.clone(), score, + embedding_model: None, }) }) .collect(); @@ -2391,6 +2460,34 @@ fn process_query_sync( ClientMessage::RegisterTier3Source { .. } => { err("RegisterTier3Source is a mutation; not permitted in BatchQuery") } + + // ── v2.2: new variants ─────────────────────────────────────────────── + ClientMessage::ReindexStale { .. 
} => { + err("ReindexStale requires filesystem I/O; not permitted in BatchQuery") + } + + ClientMessage::BatchFileStatus { uris } => { + let entries = uris + .into_iter() + .map(|uri| { + let (indexed, has_embedding, age_seconds) = db.file_status(&uri); + let embedding_model = db.file_embedding_model(&uri).map(str::to_owned); + crate::query_graph::types::FileStatusEntry { + uri, + indexed, + has_embedding, + age_seconds, + embedding_model, + } + }) + .collect(); + ok(ServerMessage::BatchFileStatusResult { entries }) + } + + ClientMessage::QueryAbiHash { uri } => { + let hash = db.abi_hash(&uri); + ok(ServerMessage::AbiHashResult { uri, hash }) + } } } diff --git a/bindings/rust/src/daemon/tier2_manager.rs b/bindings/rust/src/daemon/tier2_manager.rs index 23b8ddd..e2e80e7 100644 --- a/bindings/rust/src/daemon/tier2_manager.rs +++ b/bindings/rust/src/daemon/tier2_manager.rs @@ -11,6 +11,7 @@ use std::path::PathBuf; use std::sync::Arc; +use std::time::{Duration, Instant}; use tokio::sync::{broadcast, mpsc, Mutex}; use tracing::{debug, error, info, warn}; @@ -44,35 +45,79 @@ pub struct VerificationJob { // ─── Per-language backend state ─────────────────────────────────────────────── +/// Exponential backoff state for a single backend. +/// +/// On each failure `fail()` is called — it schedules the backend to be +/// unavailable for `2^failure_count` seconds, capped at 5 minutes. On +/// success `reset()` is called — failure count drops to zero. 
+#[derive(Default)] +struct BackoffState { + failure_count: u8, + available_after: Option, +} + +impl BackoffState { + fn is_available(&self) -> bool { + self.available_after + .map(|t| Instant::now() >= t) + .unwrap_or(true) + } + + fn fail(&mut self) { + self.failure_count = self.failure_count.saturating_add(1); + let secs = (1u64 << self.failure_count.min(8)).min(300); // 2s … 300s + self.available_after = Some(Instant::now() + Duration::from_secs(secs)); + } + + fn reset(&mut self) { + self.failure_count = 0; + self.available_after = None; + } + + fn is_permanent_failure(&self) -> bool { + // Treat as permanent only after 8+ consecutive failures (~5-min blackout). + self.failure_count >= 8 + } +} + /// Holds an optional instance of each language server backend. /// -/// `None` means either "not yet started" OR "permanently disabled" (spawn -/// failed). The `disabled_*` sentinels distinguish the two states so we don't -/// retry a binary that is not installed. +/// `None` means "not yet started". The `backoff_*` fields track consecutive +/// failures; a backend is retried after an exponential delay rather than +/// being permanently disabled, so a transient spawn failure or crash +/// recovers automatically. 
struct Tier2Backends { rust: Option, - rust_ws: Option, // workspace last used to init rust backend - rust_disabled: bool, + rust_ws: Option, + rust_backoff: BackoffState, + rust_disabled: bool, // binary not installed (spawn returned ENOENT / similar) typescript: Option, + typescript_backoff: BackoffState, typescript_disabled: bool, python: Option, + python_backoff: BackoffState, python_disabled: bool, dart: Option, + dart_backoff: BackoffState, dart_disabled: bool, clangd: Option, + clangd_backoff: BackoffState, clangd_disabled: bool, gopls: Option, + gopls_backoff: BackoffState, gopls_disabled: bool, kotlin: Option, + kotlin_backoff: BackoffState, kotlin_disabled: bool, swift: Option, + swift_backoff: BackoffState, swift_disabled: bool, } @@ -81,20 +126,28 @@ impl Tier2Backends { Self { rust: None, rust_ws: None, + rust_backoff: BackoffState::default(), rust_disabled: false, typescript: None, + typescript_backoff: BackoffState::default(), typescript_disabled: false, python: None, + python_backoff: BackoffState::default(), python_disabled: false, dart: None, + dart_backoff: BackoffState::default(), dart_disabled: false, clangd: None, + clangd_backoff: BackoffState::default(), clangd_disabled: false, gopls: None, + gopls_backoff: BackoffState::default(), gopls_disabled: false, kotlin: None, + kotlin_backoff: BackoffState::default(), kotlin_disabled: false, swift: None, + swift_backoff: BackoffState::default(), swift_disabled: false, } } @@ -174,6 +227,10 @@ impl Tier2Manager { if self.backends.rust_disabled { return; } + if !self.backends.rust_backoff.is_available() { + debug!("tier2: rust-analyzer in backoff, skipping {}", job.uri); + return; + } // If the workspace changed, tear down the old backend. 
if let Some(root) = &job.workspace_root { @@ -200,11 +257,17 @@ impl Tier2Manager { Ok(b) => { info!("tier2: rust-analyzer backend ready"); self.backends.rust = Some(b); + self.backends.rust_backoff.reset(); } Err(e) => { - warn!("tier2: rust-analyzer unavailable, disabling: {e}"); - self.backends.rust_disabled = true; - self.backends.rust_ws = None; + self.backends.rust_backoff.fail(); + if self.backends.rust_backoff.is_permanent_failure() { + warn!("tier2: rust-analyzer unavailable after repeated failures, disabling: {e}"); + self.backends.rust_disabled = true; + self.backends.rust_ws = None; + } else { + warn!("tier2: rust-analyzer spawn failed (will retry with backoff): {e}"); + } return; } } @@ -220,12 +283,13 @@ impl Tier2Manager { let mut db = self.db.lock().await; self.broadcast_upgrades(&result.uri, &result.symbols, &mut db); db.upgrade_file_symbols(&result.uri, &result.symbols); + self.backends.rust_backoff.reset(); debug!("tier2: upgraded {upgraded} symbols for {}", job.uri); } Err(e) => { error!("tier2: rust verification failed for {}: {e}", job.uri); - // Assume backend crashed; reset so we reinitialise on next job. 
self.backends.rust = None; + self.backends.rust_backoff.fail(); } } } @@ -233,7 +297,10 @@ impl Tier2Manager { // ── TypeScript ──────────────────────────────────────────────────────────── async fn ensure_ts_backend(&mut self) { - if self.backends.typescript.is_some() || self.backends.typescript_disabled { + if self.backends.typescript.is_some() + || self.backends.typescript_disabled + || !self.backends.typescript_backoff.is_available() + { return; } @@ -241,21 +308,27 @@ impl Tier2Manager { Ok(b) => { info!("tier2: typescript-language-server backend ready"); self.backends.typescript = Some(b); + self.backends.typescript_backoff.reset(); } Err(e) => { - warn!("tier2: typescript-language-server unavailable, disabling: {e}"); - self.backends.typescript_disabled = true; + self.backends.typescript_backoff.fail(); + if self.backends.typescript_backoff.is_permanent_failure() { + warn!("tier2: typescript-language-server unavailable, disabling: {e}"); + self.backends.typescript_disabled = true; + } else { + warn!("tier2: typescript-language-server spawn failed (will retry with backoff): {e}"); + } } } } async fn handle_typescript(&mut self, job: VerificationJob) { - if self.backends.typescript_disabled { + if self.backends.typescript_disabled || !self.backends.typescript_backoff.is_available() { return; } self.ensure_ts_backend().await; - if self.backends.typescript_disabled { + if self.backends.typescript.is_none() { return; } @@ -269,11 +342,13 @@ impl Tier2Manager { let mut db = self.db.lock().await; self.broadcast_upgrades(&result.uri, &result.symbols, &mut db); db.upgrade_file_symbols(&result.uri, &result.symbols); + self.backends.typescript_backoff.reset(); debug!("tier2: upgraded {upgraded} symbols for {}", job.uri); } Err(e) => { error!("tier2: typescript verification failed for {}: {e}", job.uri); self.backends.typescript = None; + self.backends.typescript_backoff.fail(); } } } @@ -281,7 +356,10 @@ impl Tier2Manager { // ── Python 
──────────────────────────────────────────────────────────────── async fn ensure_python_backend(&mut self) { - if self.backends.python.is_some() || self.backends.python_disabled { + if self.backends.python.is_some() + || self.backends.python_disabled + || !self.backends.python_backoff.is_available() + { return; } @@ -289,21 +367,29 @@ impl Tier2Manager { Ok(b) => { info!("tier2: python language server backend ready"); self.backends.python = Some(b); + self.backends.python_backoff.reset(); } Err(e) => { - warn!("tier2: python language server unavailable, disabling: {e}"); - self.backends.python_disabled = true; + self.backends.python_backoff.fail(); + if self.backends.python_backoff.is_permanent_failure() { + warn!("tier2: python language server unavailable, disabling: {e}"); + self.backends.python_disabled = true; + } else { + warn!( + "tier2: python language server spawn failed (will retry with backoff): {e}" + ); + } } } } async fn handle_python(&mut self, job: VerificationJob) { - if self.backends.python_disabled { + if self.backends.python_disabled || !self.backends.python_backoff.is_available() { return; } self.ensure_python_backend().await; - if self.backends.python_disabled { + if self.backends.python.is_none() { return; } @@ -317,11 +403,13 @@ impl Tier2Manager { let mut db = self.db.lock().await; self.broadcast_upgrades(&result.uri, &result.symbols, &mut db); db.upgrade_file_symbols(&result.uri, &result.symbols); + self.backends.python_backoff.reset(); debug!("tier2: upgraded {upgraded} symbols for {}", job.uri); } Err(e) => { error!("tier2: python verification failed for {}: {e}", job.uri); self.backends.python = None; + self.backends.python_backoff.fail(); } } } @@ -329,7 +417,10 @@ impl Tier2Manager { // ── Dart ────────────────────────────────────────────────────────────────── async fn ensure_dart_backend(&mut self) { - if self.backends.dart.is_some() || self.backends.dart_disabled { + if self.backends.dart.is_some() + || self.backends.dart_disabled + 
|| !self.backends.dart_backoff.is_available() + { return; } @@ -337,21 +428,29 @@ impl Tier2Manager { Ok(b) => { info!("tier2: dart language-server backend ready"); self.backends.dart = Some(b); + self.backends.dart_backoff.reset(); } Err(e) => { - warn!("tier2: dart language-server unavailable, disabling: {e}"); - self.backends.dart_disabled = true; + self.backends.dart_backoff.fail(); + if self.backends.dart_backoff.is_permanent_failure() { + warn!("tier2: dart language-server unavailable, disabling: {e}"); + self.backends.dart_disabled = true; + } else { + warn!( + "tier2: dart language-server spawn failed (will retry with backoff): {e}" + ); + } } } } async fn handle_dart(&mut self, job: VerificationJob) { - if self.backends.dart_disabled { + if self.backends.dart_disabled || !self.backends.dart_backoff.is_available() { return; } self.ensure_dart_backend().await; - if self.backends.dart_disabled { + if self.backends.dart.is_none() { return; } @@ -365,11 +464,13 @@ impl Tier2Manager { let mut db = self.db.lock().await; self.broadcast_upgrades(&result.uri, &result.symbols, &mut db); db.upgrade_file_symbols(&result.uri, &result.symbols); + self.backends.dart_backoff.reset(); debug!("tier2: upgraded {upgraded} symbols for {}", job.uri); } Err(e) => { error!("tier2: dart verification failed for {}: {e}", job.uri); self.backends.dart = None; + self.backends.dart_backoff.fail(); } } } @@ -377,7 +478,10 @@ impl Tier2Manager { // ── C / C++ ─────────────────────────────────────────────────────────────── async fn ensure_clangd_backend(&mut self, workspace_root: Option) { - if self.backends.clangd.is_some() || self.backends.clangd_disabled { + if self.backends.clangd.is_some() + || self.backends.clangd_disabled + || !self.backends.clangd_backoff.is_available() + { return; } @@ -385,21 +489,27 @@ impl Tier2Manager { Ok(b) => { info!("tier2: clangd backend ready"); self.backends.clangd = Some(b); + self.backends.clangd_backoff.reset(); } Err(e) => { - warn!("tier2: clangd 
unavailable, disabling: {e}"); - self.backends.clangd_disabled = true; + self.backends.clangd_backoff.fail(); + if self.backends.clangd_backoff.is_permanent_failure() { + warn!("tier2: clangd unavailable, disabling: {e}"); + self.backends.clangd_disabled = true; + } else { + warn!("tier2: clangd spawn failed (will retry with backoff): {e}"); + } } } } async fn handle_clangd(&mut self, job: VerificationJob) { - if self.backends.clangd_disabled { + if self.backends.clangd_disabled || !self.backends.clangd_backoff.is_available() { return; } self.ensure_clangd_backend(job.workspace_root.clone()).await; - if self.backends.clangd_disabled { + if self.backends.clangd.is_none() { return; } @@ -413,11 +523,13 @@ impl Tier2Manager { let mut db = self.db.lock().await; self.broadcast_upgrades(&result.uri, &result.symbols, &mut db); db.upgrade_file_symbols(&result.uri, &result.symbols); + self.backends.clangd_backoff.reset(); debug!("tier2: upgraded {upgraded} symbols for {}", job.uri); } Err(e) => { error!("tier2: clangd verification failed for {}: {e}", job.uri); self.backends.clangd = None; + self.backends.clangd_backoff.fail(); } } } @@ -425,7 +537,10 @@ impl Tier2Manager { // ── Go ──────────────────────────────────────────────────────────────────── async fn ensure_gopls_backend(&mut self, workspace_root: Option) { - if self.backends.gopls.is_some() || self.backends.gopls_disabled { + if self.backends.gopls.is_some() + || self.backends.gopls_disabled + || !self.backends.gopls_backoff.is_available() + { return; } @@ -433,21 +548,27 @@ impl Tier2Manager { Ok(b) => { info!("tier2: gopls backend ready"); self.backends.gopls = Some(b); + self.backends.gopls_backoff.reset(); } Err(e) => { - warn!("tier2: gopls unavailable, disabling: {e}"); - self.backends.gopls_disabled = true; + self.backends.gopls_backoff.fail(); + if self.backends.gopls_backoff.is_permanent_failure() { + warn!("tier2: gopls unavailable, disabling: {e}"); + self.backends.gopls_disabled = true; + } else { + 
warn!("tier2: gopls spawn failed (will retry with backoff): {e}"); + } } } } async fn handle_gopls(&mut self, job: VerificationJob) { - if self.backends.gopls_disabled { + if self.backends.gopls_disabled || !self.backends.gopls_backoff.is_available() { return; } self.ensure_gopls_backend(job.workspace_root.clone()).await; - if self.backends.gopls_disabled { + if self.backends.gopls.is_none() { return; } @@ -461,11 +582,13 @@ impl Tier2Manager { let mut db = self.db.lock().await; self.broadcast_upgrades(&result.uri, &result.symbols, &mut db); db.upgrade_file_symbols(&result.uri, &result.symbols); + self.backends.gopls_backoff.reset(); debug!("tier2: upgraded {upgraded} symbols for {}", job.uri); } Err(e) => { error!("tier2: gopls verification failed for {}: {e}", job.uri); self.backends.gopls = None; + self.backends.gopls_backoff.fail(); } } } @@ -473,7 +596,10 @@ impl Tier2Manager { // ── Kotlin ──────────────────────────────────────────────────────────────── async fn ensure_kotlin_backend(&mut self, workspace_root: Option) { - if self.backends.kotlin.is_some() || self.backends.kotlin_disabled { + if self.backends.kotlin.is_some() + || self.backends.kotlin_disabled + || !self.backends.kotlin_backoff.is_available() + { return; } @@ -481,21 +607,29 @@ impl Tier2Manager { Ok(b) => { info!("tier2: kotlin-language-server backend ready"); self.backends.kotlin = Some(b); + self.backends.kotlin_backoff.reset(); } Err(e) => { - warn!("tier2: kotlin-language-server unavailable, disabling: {e}"); - self.backends.kotlin_disabled = true; + self.backends.kotlin_backoff.fail(); + if self.backends.kotlin_backoff.is_permanent_failure() { + warn!("tier2: kotlin-language-server unavailable, disabling: {e}"); + self.backends.kotlin_disabled = true; + } else { + warn!( + "tier2: kotlin-language-server spawn failed (will retry with backoff): {e}" + ); + } } } } async fn handle_kotlin(&mut self, job: VerificationJob) { - if self.backends.kotlin_disabled { + if 
self.backends.kotlin_disabled || !self.backends.kotlin_backoff.is_available() { return; } self.ensure_kotlin_backend(job.workspace_root.clone()).await; - if self.backends.kotlin_disabled { + if self.backends.kotlin.is_none() { return; } @@ -509,11 +643,13 @@ impl Tier2Manager { let mut db = self.db.lock().await; self.broadcast_upgrades(&result.uri, &result.symbols, &mut db); db.upgrade_file_symbols(&result.uri, &result.symbols); + self.backends.kotlin_backoff.reset(); debug!("tier2: upgraded {upgraded} symbols for {}", job.uri); } Err(e) => { error!("tier2: kotlin verification failed for {}: {e}", job.uri); self.backends.kotlin = None; + self.backends.kotlin_backoff.fail(); } } } @@ -521,7 +657,10 @@ impl Tier2Manager { // ── Swift ───────────────────────────────────────────────────────────────── async fn ensure_swift_backend(&mut self, workspace_root: Option) { - if self.backends.swift.is_some() || self.backends.swift_disabled { + if self.backends.swift.is_some() + || self.backends.swift_disabled + || !self.backends.swift_backoff.is_available() + { return; } @@ -529,21 +668,27 @@ impl Tier2Manager { Ok(b) => { info!("tier2: sourcekit-lsp backend ready"); self.backends.swift = Some(b); + self.backends.swift_backoff.reset(); } Err(e) => { - warn!("tier2: sourcekit-lsp unavailable, disabling: {e}"); - self.backends.swift_disabled = true; + self.backends.swift_backoff.fail(); + if self.backends.swift_backoff.is_permanent_failure() { + warn!("tier2: sourcekit-lsp unavailable, disabling: {e}"); + self.backends.swift_disabled = true; + } else { + warn!("tier2: sourcekit-lsp spawn failed (will retry with backoff): {e}"); + } } } } async fn handle_swift(&mut self, job: VerificationJob) { - if self.backends.swift_disabled { + if self.backends.swift_disabled || !self.backends.swift_backoff.is_available() { return; } self.ensure_swift_backend(job.workspace_root.clone()).await; - if self.backends.swift_disabled { + if self.backends.swift.is_none() { return; } @@ -557,11 
+702,13 @@ impl Tier2Manager { let mut db = self.db.lock().await; self.broadcast_upgrades(&result.uri, &result.symbols, &mut db); db.upgrade_file_symbols(&result.uri, &result.symbols); + self.backends.swift_backoff.reset(); debug!("tier2: upgraded {upgraded} symbols for {}", job.uri); } Err(e) => { error!("tier2: swift verification failed for {}: {e}", job.uri); self.backends.swift = None; + self.backends.swift_backoff.fail(); } } } @@ -745,7 +892,8 @@ mod tests { workspace_root: None, version: 1, }; - tx.try_send(job).expect("channel should accept up to capacity"); + tx.try_send(job) + .expect("channel should accept up to capacity"); } // The next try_send must fail — this is the documented contract. @@ -1122,27 +1270,35 @@ mod tests { // ── Tier2Backends default state ────────────────────────────────────────── - /// Fresh `Tier2Backends` must have all backends as `None` and all - /// disabled flags as `false` — backends are lazily initialised. + /// Fresh `Tier2Backends` must have all backends as `None`, all + /// disabled flags as `false`, and all backoff states clear. 
#[test] fn backends_default_state() { let b = Tier2Backends::new(); assert!(b.rust.is_none()); assert!(!b.rust_disabled); + assert!(b.rust_backoff.is_available()); assert!(b.typescript.is_none()); assert!(!b.typescript_disabled); + assert!(b.typescript_backoff.is_available()); assert!(b.python.is_none()); assert!(!b.python_disabled); + assert!(b.python_backoff.is_available()); assert!(b.dart.is_none()); assert!(!b.dart_disabled); + assert!(b.dart_backoff.is_available()); assert!(b.clangd.is_none()); assert!(!b.clangd_disabled); + assert!(b.clangd_backoff.is_available()); assert!(b.gopls.is_none()); assert!(!b.gopls_disabled); + assert!(b.gopls_backoff.is_available()); assert!(b.kotlin.is_none()); assert!(!b.kotlin_disabled); + assert!(b.kotlin_backoff.is_available()); assert!(b.swift.is_none()); assert!(!b.swift_disabled); + assert!(b.swift_backoff.is_available()); } // ── Channel capacity constant ──────────────────────────────────────────── @@ -1151,4 +1307,50 @@ mod tests { fn channel_capacity_is_64() { assert_eq!(CHANNEL_CAPACITY, 64); } + + // ── BackoffState ───────────────────────────────────────────────────────── + + #[test] + fn backoff_fresh_is_available() { + let b = BackoffState::default(); + assert!(b.is_available()); + assert!(!b.is_permanent_failure()); + } + + #[test] + fn backoff_fail_makes_unavailable() { + let mut b = BackoffState::default(); + b.fail(); + assert!(!b.is_available()); + assert_eq!(b.failure_count, 1); + } + + #[test] + fn backoff_reset_clears_state() { + let mut b = BackoffState::default(); + b.fail(); + b.fail(); + b.reset(); + assert!(b.is_available()); + assert_eq!(b.failure_count, 0); + assert!(!b.is_permanent_failure()); + } + + #[test] + fn backoff_permanent_after_8_failures() { + let mut b = BackoffState::default(); + for _ in 0..8 { + b.fail(); + } + assert!(b.is_permanent_failure()); + } + + #[test] + fn backoff_not_permanent_before_8_failures() { + let mut b = BackoffState::default(); + for _ in 0..7 { + b.fail(); + } + 
assert!(!b.is_permanent_failure()); + } } diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index f3fe254..10e28f6 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -227,6 +227,153 @@ impl LipDatabase { out } + // ── ABI surface fingerprinting ──────────────────────────────────────── + + /// Compute a stable hash over the file's exported API surface. + /// + /// The hash is SHA-256 (hex) over the newline-joined list of + /// `"URI|kind|signature"` entries for all exported symbols in `uri`, + /// sorted by URI for determinism. Returns `None` when the file is not + /// in the daemon's index. + /// + /// A change in hash means the public interface changed — safe as a + /// downstream recompilation / re-verification trigger (Kotlin IC model). + pub fn abi_hash(&mut self, uri: &str) -> Option { + if !self.file_inputs.contains_key(uri) { + return None; + } + let syms = self.file_symbols(uri); + let mut surface: Vec = syms + .iter() + .filter(|s| s.is_exported) + .map(|s| { + format!( + "{}|{}|{}", + s.uri, + s.kind as u8, + s.signature.as_deref().unwrap_or("") + ) + }) + .collect(); + surface.sort(); + let payload = surface.join("\n"); + Some(sha256_hex(payload.as_bytes())) + } + + // ── Datalog Tier 1.5 inference ──────────────────────────────────────── + + /// Run a single fixed-point inference pass and return the number of + /// symbols whose confidence was raised. + /// + /// Rules applied (one iteration; caller loops to fixpoint): + /// + /// **Rule 1 — Callee elevation**: if every direct caller of a symbol + /// has confidence ≥ 80 (Tier 2 / SCIP quality), and the symbol itself + /// is below 65, raise it to 65 (Tier 1.5 level). The intuition: if + /// all callers have been verified to compiler accuracy, the callee is + /// unlikely to have been left dangling; the call site itself acts as + /// implicit type evidence. 
+ /// + /// **Rule 2 — Exported leaf stability**: an exported symbol with no + /// callers in the local graph is a stable leaf if its confidence is + /// ≥ 40. Raise it by 5 points (capped at 65) — exported with no + /// internal callers means it is part of the public API, which is + /// typically more carefully maintained than internal helpers. + /// + /// Both rules are conservative: they never lower confidence and never + /// exceed the Tier 1.5 ceiling (65), leaving room for Tier 2 / SCIP + /// to raise further. + fn inference_step(&mut self) -> usize { + const TIER2_THRESHOLD: u8 = 80; + const TIER1_5_CEILING: u8 = 65; + + // Snapshot caller confidence per symbol before mutating. + // Build: callee_uri → vec of caller confidence scores. + let mut callee_caller_confs: HashMap> = HashMap::new(); + let all_file_uris: Vec = self.file_inputs.keys().cloned().collect(); + for file_uri in &all_file_uris { + let syms = self.file_symbols(file_uri).to_vec(); + for sym in &syms { + // For each callee edge, record this caller's confidence. + if let Some(callers) = self.callee_to_callers.get(&sym.uri).cloned() { + for caller_uri in callers { + // Look up confidence of the caller symbol. + if let Some((caller_file, _)) = self.def_index.get(&caller_uri).cloned() { + let caller_syms = self.file_symbols(&caller_file.clone()).to_vec(); + if let Some(caller_sym) = + caller_syms.iter().find(|s| s.uri == caller_uri) + { + callee_caller_confs + .entry(sym.uri.clone()) + .or_default() + .push(caller_sym.confidence_score); + } + } + } + } + } + } + + // Apply rules and collect upgrades. 
+ let mut upgrades: Vec<(String, String, u8)> = Vec::new(); // (file_uri, sym_uri, new_conf) + for file_uri in &all_file_uris { + let syms = self.file_symbols(file_uri).to_vec(); + for sym in &syms { + if sym.confidence_score >= TIER1_5_CEILING { + continue; + } + let caller_confs = callee_caller_confs.get(&sym.uri); + let new_conf = if let Some(confs) = caller_confs { + if !confs.is_empty() + && confs.iter().all(|&c| c >= TIER2_THRESHOLD) + && sym.confidence_score < TIER1_5_CEILING + { + // Rule 1: all callers are Tier 2+. + Some(TIER1_5_CEILING) + } else { + None + } + } else if sym.is_exported && sym.confidence_score >= 40 { + // Rule 2: exported leaf, no local callers. + Some((sym.confidence_score + 5).min(TIER1_5_CEILING)) + } else { + None + }; + if let Some(conf) = new_conf { + if conf > sym.confidence_score { + upgrades.push((file_uri.clone(), sym.uri.clone(), conf)); + } + } + } + } + + let updated = upgrades.len(); + for (file_uri, sym_uri, new_conf) in upgrades { + let syms = self.file_symbols(&file_uri).to_vec(); + if let Some(sym) = syms.iter().find(|s| s.uri == sym_uri) { + let mut upgraded = sym.clone(); + upgraded.confidence_score = new_conf; + self.upgrade_file_symbols(&file_uri, &[upgraded]); + } + } + updated + } + + /// Run the Tier 1.5 Datalog inference loop to fixpoint. + /// + /// Returns the total number of symbol confidence scores raised. + pub fn run_tier1_5_inference(&mut self) -> usize { + let mut total = 0; + loop { + let changed = self.inference_step(); + total += changed; + if changed == 0 { + break; + } + } + total + } + // ── Mutations ───────────────────────────────────────────────────────── /// Register or update a file. Bumps the global revision and invalidates @@ -414,8 +561,7 @@ impl LipDatabase { // Seed sym_cache so file_symbols() returns the pre-computed symbols. 
let syms = Arc::new(symbols); - self.sym_cache - .insert(uri.clone(), Cached::new(syms, rev)); + self.sym_cache.insert(uri.clone(), Cached::new(syms, rev)); // Consumed-names index (same as upsert_file). { @@ -1067,27 +1213,49 @@ impl LipDatabase { let mut semantic_items = Vec::new(); if min_score.is_some() { - // NOTE: searches file-level embeddings, not per-symbol embeddings. - // Per-function chunked embeddings aren't integrated yet; when they - // land, this upgrades without a wire format change (semantic_items - // already carries symbol_uri, currently empty at file granularity). - if let Some(embedding) = self.file_embeddings.get(file_uri).cloned() { - let static_files: HashSet<&str> = static_result - .affected_files - .iter() - .map(|s| s.as_str()) - .collect(); - + let static_files: HashSet = + static_result.affected_files.iter().cloned().collect(); + + // Prefer per-symbol embeddings (function-level granularity) when + // available. Fall back to file-level embeddings when the symbol has + // no stored vector. This degrades gracefully for callers that have + // not yet run `EmbeddingBatch` with `lip://` URIs. + if let Some(sym_embedding) = self.symbol_embeddings.get(&sym.uri).cloned() { + let sym_neighbours = + self.nearest_symbol_by_vector(&sym_embedding, 20, Some(&sym.uri), None); + for n in sym_neighbours { + if n.score < threshold { + continue; + } + // Map symbol hit back to its defining file. 
+ let hit_file = self + .def_index + .get(&n.uri) + .map(|(f, _)| f.clone()) + .unwrap_or_else(|| n.uri.clone()); + let source = if static_files.contains(&hit_file) { + ImpactSource::Both + } else { + ImpactSource::Semantic + }; + semantic_items.push(SemanticImpactItem { + file_uri: hit_file, + symbol_uri: n.uri, + similarity: n.score, + source, + }); + } + } else if let Some(file_embedding) = self.file_embeddings.get(file_uri).cloned() + { let neighbours = self.nearest_by_vector( - &embedding, + &file_embedding, 20, Some(file_uri), None, Some(threshold), ); - for neighbour in neighbours { - let source = if static_files.contains(neighbour.uri.as_str()) { + let source = if static_files.contains(&neighbour.uri) { ImpactSource::Both } else { ImpactSource::Semantic @@ -1281,7 +1449,14 @@ impl LipDatabase { scored .into_iter() .take(top_k) - .map(|(uri, score)| crate::query_graph::types::NearestItem { uri, score }) + .map(|(uri, score)| { + let embedding_model = self.symbol_embedding_models.get(&uri).cloned(); + crate::query_graph::types::NearestItem { + uri, + score, + embedding_model, + } + }) .collect() } @@ -1352,7 +1527,14 @@ impl LipDatabase { scored .into_iter() .take(top_k) - .map(|(uri, score)| crate::query_graph::types::NearestItem { uri, score }) + .map(|(uri, score)| { + let embedding_model = self.file_embedding_models.get(&uri).cloned(); + crate::query_graph::types::NearestItem { + uri, + score, + embedding_model, + } + }) .collect() } @@ -1458,9 +1640,16 @@ impl LipDatabase { return vec![]; } if pairs.len() == 1 { + let uri = pairs[0].0.to_owned(); + let embedding_model = if uri.starts_with("lip://") { + self.symbol_embedding_models.get(&uri).cloned() + } else { + self.file_embedding_models.get(&uri).cloned() + }; return vec![NearestItem { - uri: pairs[0].0.to_owned(), + uri, score: 0.0, + embedding_model, }]; } @@ -1498,7 +1687,18 @@ impl LipDatabase { scores .into_iter() .take(top_k) - .map(|(uri, score)| NearestItem { uri, score }) + .map(|(uri, score)| 
{ + let embedding_model = if uri.starts_with("lip://") { + self.symbol_embedding_models.get(&uri).cloned() + } else { + self.file_embedding_models.get(&uri).cloned() + }; + NearestItem { + uri, + score, + embedding_model, + } + }) .collect() } @@ -3695,14 +3895,25 @@ impl Greeter { override_doc: None, }]; - db.upsert_file_precomputed(uri.clone(), "rust".into(), "hash123".into(), symbols, occurrences, vec![]); + db.upsert_file_precomputed( + uri.clone(), + "rust".into(), + "hash123".into(), + symbols, + occurrences, + vec![], + ); let syms = db.file_symbols(&uri); assert_eq!(syms.len(), 1); assert_eq!(syms[0].display_name, "MyStruct"); let results = db.workspace_symbols("MyStruct", 10); - assert_eq!(results.len(), 1, "pre-computed symbol must appear in workspace search"); + assert_eq!( + results.len(), + 1, + "pre-computed symbol must appear in workspace search" + ); assert!( db.symbol_definition_location(&sym_uri).is_some(), @@ -3730,14 +3941,33 @@ impl Greeter { }; let occ = OwnedOccurrence { symbol_uri: "lip://local/lib.rs#Foo".into(), - range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 3, + }, confidence_score: 90, role: Role::Definition, override_doc: None, }; - db.upsert_file_precomputed(uri.clone(), "rust".into(), "hash1".into(), vec![sym.clone()], vec![occ.clone()], vec![]); - db.upsert_file_precomputed(uri.clone(), "rust".into(), "hash1".into(), vec![sym], vec![occ], vec![]); + db.upsert_file_precomputed( + uri.clone(), + "rust".into(), + "hash1".into(), + vec![sym.clone()], + vec![occ.clone()], + vec![], + ); + db.upsert_file_precomputed( + uri.clone(), + "rust".into(), + "hash1".into(), + vec![sym], + vec![occ], + vec![], + ); let results = db.workspace_symbols("Foo", 10); assert_eq!(results.len(), 1, "re-upsert must not duplicate symbols"); @@ -3756,28 +3986,46 @@ impl Greeter { let sym_foo = OwnedSymbolInfo::new("lip://local/a.rs#foo", "foo"); 
let occ_def = OwnedOccurrence { symbol_uri: "lip://local/a.rs#foo".into(), - range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 3, + }, confidence_score: 90, role: Role::Definition, override_doc: None, }; db.upsert_file_precomputed( - uri_a.clone(), "rust".into(), "h1".into(), - vec![sym_foo], vec![occ_def], vec![], + uri_a.clone(), + "rust".into(), + "h1".into(), + vec![sym_foo], + vec![occ_def], + vec![], ); // File B: references foo (defined in A → external) let uri_b = "lip://local/b.rs".to_owned(); let occ_ref = OwnedOccurrence { symbol_uri: "lip://local/a.rs#foo".into(), - range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 3, + }, confidence_score: 80, role: Role::Reference, override_doc: None, }; db.upsert_file_precomputed( - uri_b.clone(), "rust".into(), "h2".into(), - vec![], vec![occ_ref], vec![], + uri_b.clone(), + "rust".into(), + "h2".into(), + vec![], + vec![occ_ref], + vec![], ); let invalidated = db.invalidated_files_for(&["lip://local/a.rs#foo".into()]); @@ -3794,18 +4042,30 @@ impl Greeter { let sym_bar = OwnedSymbolInfo::new("lip://local/c.rs#bar", "bar"); let occ_def = OwnedOccurrence { symbol_uri: "lip://local/c.rs#bar".into(), - range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 3, + }, confidence_score: 90, role: Role::Definition, override_doc: None, }; db.upsert_file_precomputed( - uri_c.clone(), "rust".into(), "h1".into(), - vec![sym_bar], vec![occ_def], vec![], + uri_c.clone(), + "rust".into(), + "h1".into(), + vec![sym_bar], + vec![occ_def], + vec![], ); let invalidated = db.invalidated_files_for(&["lip://local/c.rs#bar".into()]); - assert!(invalidated.is_empty(), "unreferenced symbol should invalidate nothing"); + 
assert!( + invalidated.is_empty(), + "unreferenced symbol should invalidate nothing" + ); } #[test] @@ -3819,28 +4079,46 @@ impl Greeter { let sym_foo = OwnedSymbolInfo::new("lip://local/a.rs#foo", "foo"); let occ_def = OwnedOccurrence { symbol_uri: "lip://local/a.rs#foo".into(), - range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 3, + }, confidence_score: 90, role: Role::Definition, override_doc: None, }; db.upsert_file_precomputed( - uri_a.clone(), "rust".into(), "h1".into(), - vec![sym_foo], vec![occ_def], vec![], + uri_a.clone(), + "rust".into(), + "h1".into(), + vec![sym_foo], + vec![occ_def], + vec![], ); // File B: references foo let uri_b = "lip://local/b.rs".to_owned(); let occ_ref = OwnedOccurrence { symbol_uri: "lip://local/a.rs#foo".into(), - range: OwnedRange { start_line: 0, start_char: 0, end_line: 0, end_char: 3 }, + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 3, + }, confidence_score: 80, role: Role::Reference, override_doc: None, }; db.upsert_file_precomputed( - uri_b.clone(), "rust".into(), "h2".into(), - vec![], vec![occ_ref], vec![], + uri_b.clone(), + "rust".into(), + "h2".into(), + vec![], + vec![occ_ref], + vec![], ); // Sanity: B is invalidated before removal @@ -3853,6 +4131,9 @@ impl Greeter { db.remove_file(&uri_b); let invalidated = db.invalidated_files_for(&["lip://local/a.rs#foo".into()]); - assert!(invalidated.is_empty(), "removed file must not appear in invalidation results"); + assert!( + invalidated.is_empty(), + "removed file must not appear in invalidation results" + ); } } diff --git a/bindings/rust/src/query_graph/types.rs b/bindings/rust/src/query_graph/types.rs index 0700eb8..a385fa5 100644 --- a/bindings/rust/src/query_graph/types.rs +++ b/bindings/rust/src/query_graph/types.rs @@ -104,6 +104,20 @@ pub struct NearestItem { pub uri: String, /// Cosine similarity in [0.0, 1.0] — 
higher is more similar. pub score: f32, + /// Model that produced the stored embedding for this item. + /// `None` when the item has no embedding or the model is unknown. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub embedding_model: Option, +} + +/// Per-file entry inside [`ServerMessage::BatchFileStatusResult`]. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FileStatusEntry { + pub uri: String, + pub indexed: bool, + pub has_embedding: bool, + pub age_seconds: Option, + pub embedding_model: Option, } /// A line-range chunk boundary returned by [`ServerMessage::BoundariesResult`]. @@ -560,6 +574,28 @@ pub enum ServerMessage { #[serde(skip_serializing_if = "Option::is_none")] error: Option, }, + + // ── v2.2 features ──────────────────────────────────────────────────── + /// Response to [`ClientMessage::ReindexStale`]. + ReindexStaleResult { + /// URIs that were re-indexed from disk. + reindexed: Vec, + /// URIs that were within the age threshold and were skipped. + skipped: Vec, + }, + /// Response to [`ClientMessage::BatchFileStatus`]. + BatchFileStatusResult { + entries: Vec, + }, + /// Response to [`ClientMessage::QueryAbiHash`]. + /// + /// The hash is a hex-encoded SHA-256 over the file's exported symbols + /// sorted by URI. A change in hash means the public interface changed. + AbiHashResult { + uri: String, + /// `None` when the file is not in the daemon's index. + hash: Option, + }, } /// Provenance record for a Tier 3 ingestion source (typically a SCIP @@ -1159,6 +1195,33 @@ pub enum ClientMessage { RegisterTier3Source { source: Tier3Source, }, + + // ── v2.2 features ──────────────────────────────────────────────────── + /// Re-index stale files atomically. For each URI, if the file is + /// not indexed or was last indexed more than `max_age_seconds` ago, + /// it is re-read from disk and re-indexed. URIs within the threshold + /// are skipped. 
Pass `max_age_seconds = 0` to force re-index of all + /// listed URIs regardless of age. Returns `ReindexStaleResult`. + ReindexStale { + uris: Vec, + /// Files older than this threshold are re-indexed. + max_age_seconds: u64, + }, + /// Query the index status of multiple files in a single round-trip. + /// Equivalent to issuing `QueryFileStatus` once per URI inside a + /// `Batch`, but without the overhead of individual messages. + /// Returns `BatchFileStatusResult`. + BatchFileStatus { + uris: Vec, + }, + /// Query the ABI surface hash for a file. The hash is a stable hex + /// string computed over the file's exported symbols sorted by URI, + /// including their signatures and kinds. A change in hash means the + /// public interface changed — useful as a recompilation trigger. + /// Returns `AbiHashResult`. + QueryAbiHash { + uri: String, + }, } impl ClientMessage { @@ -1222,6 +1285,9 @@ impl ClientMessage { "embed_text", "stream_context", "register_tier3_source", + "reindex_stale", + "batch_file_status", + "query_abi_hash", ] .iter() .map(|s| (*s).to_owned()) @@ -1293,6 +1359,9 @@ impl ClientMessage { ClientMessage::EmbedText { .. } => "embed_text", ClientMessage::StreamContext { .. } => "stream_context", ClientMessage::RegisterTier3Source { .. } => "register_tier3_source", + ClientMessage::ReindexStale { .. } => "reindex_stale", + ClientMessage::BatchFileStatus { .. } => "batch_file_status", + ClientMessage::QueryAbiHash { .. } => "query_abi_hash", } } @@ -1315,6 +1384,7 @@ impl ClientMessage { | ClientMessage::PruneDeleted | ClientMessage::QueryStaleEmbeddings { .. } | ClientMessage::ExplainMatch { .. } + | ClientMessage::ReindexStale { .. 
} ) } } @@ -1459,6 +1529,146 @@ mod tests { assert!(tier3_sources.is_empty()); } + // ── v2.2 round-trip tests ───────────────────────────────────────── + + #[test] + fn reindex_stale_round_trips() { + let msg = ClientMessage::ReindexStale { + uris: vec!["file:///src/main.rs".into()], + max_age_seconds: 300, + }; + let rt = round_trip_client(&msg); + let ClientMessage::ReindexStale { + uris, + max_age_seconds, + } = rt + else { + panic!("wrong variant"); + }; + assert_eq!(uris, ["file:///src/main.rs"]); + assert_eq!(max_age_seconds, 300); + } + + #[test] + fn reindex_stale_not_batchable() { + assert!(!ClientMessage::ReindexStale { + uris: vec![], + max_age_seconds: 0 + } + .is_batchable()); + } + + #[test] + fn batch_file_status_round_trips() { + let msg = ClientMessage::BatchFileStatus { + uris: vec!["file:///a.rs".into(), "file:///b.rs".into()], + }; + let rt = round_trip_client(&msg); + let ClientMessage::BatchFileStatus { uris } = rt else { + panic!("wrong variant"); + }; + assert_eq!(uris.len(), 2); + } + + #[test] + fn batch_file_status_is_batchable() { + assert!(ClientMessage::BatchFileStatus { uris: vec![] }.is_batchable()); + } + + #[test] + fn query_abi_hash_round_trips() { + let msg = ClientMessage::QueryAbiHash { + uri: "file:///src/lib.rs".into(), + }; + let rt = round_trip_client(&msg); + let ClientMessage::QueryAbiHash { uri } = rt else { + panic!("wrong variant"); + }; + assert_eq!(uri, "file:///src/lib.rs"); + } + + #[test] + fn query_abi_hash_is_batchable() { + assert!(ClientMessage::QueryAbiHash { uri: String::new() }.is_batchable()); + } + + #[test] + fn reindex_stale_result_round_trips() { + let msg = ServerMessage::ReindexStaleResult { + reindexed: vec!["file:///src/a.rs".into()], + skipped: vec!["file:///src/b.rs".into()], + }; + let rt = round_trip_server(&msg); + let ServerMessage::ReindexStaleResult { reindexed, skipped } = rt else { + panic!("wrong variant"); + }; + assert_eq!(reindexed, ["file:///src/a.rs"]); + assert_eq!(skipped, 
["file:///src/b.rs"]); + } + + #[test] + fn batch_file_status_result_round_trips() { + let msg = ServerMessage::BatchFileStatusResult { + entries: vec![FileStatusEntry { + uri: "file:///src/main.rs".into(), + indexed: true, + has_embedding: false, + age_seconds: Some(42), + embedding_model: None, + }], + }; + let rt = round_trip_server(&msg); + let ServerMessage::BatchFileStatusResult { entries } = rt else { + panic!("wrong variant"); + }; + assert_eq!(entries.len(), 1); + assert!(entries[0].indexed); + assert_eq!(entries[0].age_seconds, Some(42)); + } + + #[test] + fn abi_hash_result_round_trips() { + let msg = ServerMessage::AbiHashResult { + uri: "file:///src/lib.rs".into(), + hash: Some("deadbeef".into()), + }; + let rt = round_trip_server(&msg); + let ServerMessage::AbiHashResult { uri, hash } = rt else { + panic!("wrong variant"); + }; + assert_eq!(uri, "file:///src/lib.rs"); + assert_eq!(hash.as_deref(), Some("deadbeef")); + } + + #[test] + fn nearest_item_embedding_model_round_trips() { + let msg = ServerMessage::NearestResult { + results: vec![NearestItem { + uri: "file:///src/auth.rs".into(), + score: 0.95, + embedding_model: Some("text-embedding-3-small".into()), + }], + }; + let rt = round_trip_server(&msg); + let ServerMessage::NearestResult { results } = rt else { + panic!("wrong variant"); + }; + assert_eq!( + results[0].embedding_model.as_deref(), + Some("text-embedding-3-small") + ); + } + + #[test] + fn nearest_item_missing_embedding_model_deserializes_as_none() { + let json = r#"{"type":"nearest_result","results":[{"uri":"file:///a.rs","score":0.9}]}"#; + let msg: ServerMessage = serde_json::from_str(json).unwrap(); + let ServerMessage::NearestResult { results } = msg else { + panic!("wrong variant"); + }; + assert!(results[0].embedding_model.is_none()); + } + /// Drift guard: every tag produced by [`ClientMessage::variant_tag`] /// must also appear in [`ClientMessage::supported_messages`], and /// the two lists must be the same size. 
Combined with the @@ -1690,6 +1900,12 @@ mod tests { imported_at_ms: 0, }, }, + ClientMessage::ReindexStale { + uris: vec![], + max_age_seconds: 0, + }, + ClientMessage::BatchFileStatus { uris: vec![] }, + ClientMessage::QueryAbiHash { uri: String::new() }, ]; let supported = ClientMessage::supported_messages(); @@ -1832,6 +2048,7 @@ mod tests { vec![NearestItem { uri: "file:///a.rs".into(), score: 0.9, + embedding_model: None, }], vec![], ], @@ -2018,6 +2235,7 @@ mod tests { outliers: vec![NearestItem { uri: "file:///src/billing.go".into(), score: 0.12, + embedding_model: None, }], }; let rt = round_trip_server(&msg); @@ -2224,6 +2442,7 @@ mod tests { moving_toward: vec![NearestItem { uri: "file:///src/auth.rs".into(), score: 0.91, + embedding_model: None, }], }; let rt = round_trip_server(&msg); From fa3e8e49fd432a85475785184d4650892f838a02 Mon Sep 17 00:00:00 2001 From: Lisa Date: Tue, 21 Apr 2026 02:31:56 +0200 Subject: [PATCH 07/18] chore: bump version to 2.2.0, finalize CHANGELOG Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 4 ++++ Cargo.toml | 2 +- tools/lip-cli/Cargo.toml | 2 +- tools/lip-registry/Cargo.toml | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da33c77..1379e53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ All notable changes to this project are documented here. ## [Unreleased] +--- + +## [2.2.0] — 2026-04-21 + ### Added - **`NearestItem.embedding_model`** — every nearest-neighbour hit now carries the model name that produced its stored embedding. Field is optional / `skip_serializing_if = None`; older clients see no change. Populated by `nearest_by_vector`, `nearest_symbol_by_vector`, and `outliers`. Useful for debugging mixed-model indexes and confirming which model was used for a specific result. 
diff --git a/Cargo.toml b/Cargo.toml index 92c6f53..1e5eeb9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ ] [workspace.package] -version = "2.1.1" +version = "2.2.0" edition = "2021" rust-version = "1.78" authors = ["Lisa Welsch "] diff --git a/tools/lip-cli/Cargo.toml b/tools/lip-cli/Cargo.toml index 13fa242..fe17f30 100644 --- a/tools/lip-cli/Cargo.toml +++ b/tools/lip-cli/Cargo.toml @@ -18,7 +18,7 @@ name = "lip" path = "src/main.rs" [dependencies] -lip = { package = "lip-core", path = "../../bindings/rust", version = "2.1.1" } +lip = { package = "lip-core", path = "../../bindings/rust", version = "2.2.0" } clap = { version = "4", features = ["derive", "env"] } tokio = { version = "1", features = ["full"] } tower-lsp = "0.20" diff --git a/tools/lip-registry/Cargo.toml b/tools/lip-registry/Cargo.toml index ba4c695..72c5219 100644 --- a/tools/lip-registry/Cargo.toml +++ b/tools/lip-registry/Cargo.toml @@ -18,7 +18,7 @@ name = "lip-registry" path = "src/main.rs" [dependencies] -lip = { package = "lip-core", path = "../../bindings/rust", version = "2.1.1" } +lip = { package = "lip-core", path = "../../bindings/rust", version = "2.2.0" } axum = "0.7" tokio = { version = "1", features = ["full"] } tower-http = { version = "0.5", features = ["fs", "trace"] } From b9a27af504dd5b6251c5621e26bf2a57b5fa4204 Mon Sep 17 00:00:00 2001 From: Lisa Date: Tue, 21 Apr 2026 03:44:24 +0200 Subject: [PATCH 08/18] fix: blast_radius_batch wire gap + precomputed sym_cache fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit EnrichedBlastRadius gains file_uri so callers can trace results back to their input file. BlastRadiusBatchResult gains not_indexed_uris (skip_serializing_if empty, back-compat) to distinguish "URI not in index" from "URI has zero callers" — previously both were silent empty. blast_radius_batch now checks file_inputs before calling file_symbols. 
file_symbols guards against the cold-cache path for precomputed files: if sym_cache is cold and file is marked precomputed, return [] instead of falling through to Tier 1 parsing on empty text. Three new tests: blast_radius_batch_not_indexed_uris_reported, blast_radius_batch_file_uri_populated, file_symbols_precomputed_cold_cache_returns_empty. Docs: v2.1 + v2.2 roadmap sections added to spec.mdx and LIP_SPEC.mdx; ReindexStale/BatchFileStatus/QueryAbiHash added to daemon message table. Co-Authored-By: Claude Sonnet 4.6 --- bindings/rust/src/daemon/session.rs | 16 +++-- bindings/rust/src/query_graph/db.rs | 87 +++++++++++++++++++++++++- bindings/rust/src/query_graph/types.rs | 6 ++ docs/LIP_SPEC.mdx | 24 ++++++- website/src/pages/docs/daemon.mdx | 3 + website/src/pages/docs/spec.mdx | 20 ++++++ 6 files changed, 148 insertions(+), 8 deletions(-) diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index eba70e1..a7556a1 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -404,8 +404,12 @@ impl Session { min_score, } => { let mut db = self.db.lock().await; - let results = db.blast_radius_batch(&changed_file_uris, min_score); - ServerMessage::BlastRadiusBatchResult { results } + let (results, not_indexed_uris) = + db.blast_radius_batch(&changed_file_uris, min_score); + ServerMessage::BlastRadiusBatchResult { + results, + not_indexed_uris, + } } ClientMessage::QueryWorkspaceSymbols { query, limit } => { @@ -1968,8 +1972,12 @@ fn process_query_sync( changed_file_uris, min_score, } => { - let results = db.blast_radius_batch(&changed_file_uris, min_score); - ok(ServerMessage::BlastRadiusBatchResult { results }) + let (results, not_indexed_uris) = + db.blast_radius_batch(&changed_file_uris, min_score); + ok(ServerMessage::BlastRadiusBatchResult { + results, + not_indexed_uris, + }) } ClientMessage::QueryWorkspaceSymbols { query, limit } => { diff --git a/bindings/rust/src/query_graph/db.rs 
b/bindings/rust/src/query_graph/db.rs index 10e28f6..e2fc38f 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -842,6 +842,13 @@ impl LipDatabase { } } + // Precomputed files (SCIP imports) have no source text — Tier 1 parsing + // on empty text returns nothing anyway, but bail early to make the + // invariant explicit: precomputed symbols live only in sym_cache. + if self.file_inputs.get(uri).is_some_and(|f| f.precomputed) { + return Arc::new(vec![]); + } + let result = self.compute_symbols(uri); self.sym_cache .insert(uri.to_owned(), Cached::new(result.clone(), file_rev)); @@ -1179,8 +1186,9 @@ impl LipDatabase { &mut self, changed_file_uris: &[String], min_score: Option, - ) -> Vec { + ) -> (Vec, Vec) { let mut results = Vec::new(); + let mut not_indexed_uris = Vec::new(); let mut seen_symbols: HashSet = HashSet::new(); let threshold = min_score.unwrap_or(0.6); @@ -1201,6 +1209,10 @@ impl LipDatabase { }; for file_uri in changed_file_uris { + if !self.file_inputs.contains_key(file_uri.as_str()) { + not_indexed_uris.push(file_uri.clone()); + continue; + } let syms = self.file_symbols(file_uri); for sym in syms.iter() { if !interesting(sym.kind) { @@ -1271,12 +1283,13 @@ impl LipDatabase { } results.push(EnrichedBlastRadius { + file_uri: file_uri.clone(), static_result, semantic_items, }); } } - results + (results, not_indexed_uris) } /// Find the symbol URI whose occurrence range contains `(line, col)` in `uri`. 
@@ -3336,6 +3349,76 @@ impl Greeter { ); } + #[test] + fn blast_radius_batch_not_indexed_uris_reported() { + let mut db = LipDatabase::new(); + db.upsert_file( + "file:///project/lib.rs".to_owned(), + "pub fn f() {}".to_owned(), + "rust".to_owned(), + ); + let unknown = "file:///project/ghost.rs".to_owned(); + let (results, not_indexed) = + db.blast_radius_batch(&[unknown.clone()], None); + assert!(results.is_empty()); + assert_eq!(not_indexed, vec![unknown]); + } + + #[test] + fn blast_radius_batch_file_uri_populated() { + let mut db = LipDatabase::new(); + let lib_uri = "file:///project/lib.rs".to_owned(); + db.upsert_file( + lib_uri.clone(), + "pub fn exported() {}".to_owned(), + "rust".to_owned(), + ); + let (results, not_indexed) = + db.blast_radius_batch(&[lib_uri.clone()], None); + assert!(not_indexed.is_empty()); + for entry in &results { + assert_eq!(entry.file_uri, lib_uri, "file_uri must trace back to input"); + } + } + + #[test] + fn file_symbols_precomputed_cold_cache_returns_empty() { + let mut db = LipDatabase::new(); + let uri = "file:///project/imported.go".to_owned(); + use crate::schema::{OwnedSymbolInfo, SymbolKind}; + let sym = OwnedSymbolInfo { + uri: "lip://local/imported.go#Foo".to_owned(), + kind: SymbolKind::Function, + display_name: "Foo".to_owned(), + confidence_score: 90, + signature: None, + documentation: None, + relationships: vec![], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: true, + }; + db.upsert_file_precomputed( + uri.clone(), + "go".to_owned(), + "abc123".to_owned(), + vec![sym], + vec![], + vec![], + ); + // Warm path: sym_cache is populated — must return the precomputed symbol. + let syms = db.file_symbols(&uri); + assert_eq!(syms.len(), 1, "warm path must return precomputed symbol"); + + // Simulate cold cache; file_inputs still marks precomputed=true. 
+ db.sym_cache.remove(&uri); + let syms_cold = db.file_symbols(&uri); + // Must not fall through to Tier 1 (which would parse empty text). + assert!(syms_cold.is_empty(), "cold precomputed cache must not run Tier-1 parser"); + } + // ── WS3: name consumption index ─────────────────────────────────────────── #[test] diff --git a/bindings/rust/src/query_graph/types.rs b/bindings/rust/src/query_graph/types.rs index a385fa5..74048f8 100644 --- a/bindings/rust/src/query_graph/types.rs +++ b/bindings/rust/src/query_graph/types.rs @@ -78,6 +78,8 @@ pub enum ImpactSource { /// A single entry in a batch blast-radius result. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EnrichedBlastRadius { + /// The input file URI this result was computed for. + pub file_uri: String, /// The static blast-radius result. #[serde(flatten)] pub static_result: BlastRadiusResult, @@ -244,6 +246,10 @@ pub enum ServerMessage { BlastRadiusResult(BlastRadiusResult), BlastRadiusBatchResult { results: Vec, + /// Input URIs from `changed_file_uris` that were not present in the index. + /// Absent when empty (all inputs were indexed). + #[serde(default, skip_serializing_if = "Vec::is_empty")] + not_indexed_uris: Vec, }, WorkspaceSymbolsResult { symbols: Vec, diff --git a/docs/LIP_SPEC.mdx b/docs/LIP_SPEC.mdx index b36707d..639564f 100644 --- a/docs/LIP_SPEC.mdx +++ b/docs/LIP_SPEC.mdx @@ -1,6 +1,6 @@ --- title: "LIP — Linked Incremental Protocol" -subtitle: "Design Document & Specification v2.0" +subtitle: "Design Document & Specification v2.2" status: "Stable" license: "MIT" authors: ["Lisa Welsch"] @@ -1197,6 +1197,26 @@ lip-protocol/ - [x] **`QueryStaleEmbeddings { root }`** — report files under `root` whose stored embedding is older than their current mtime. Detects the case where LIP was offline during a batch of writes and search results are silently stale. Returns `StaleEmbeddingsResult { uris }`. Not permitted inside `BatchQuery`. 
- [x] 2 new MCP tools (`lip_get_centroid`, `lip_stale_embeddings`) + `filter`/`min_score` params on 5 existing tools. +### v2.1 — Streaming context + forward-compat primitives ✓ + +- [x] **`StreamContext { file_uri, cursor_position, max_tokens, model? }`** — streaming wire message. Daemon ranks symbols relevant to the cursor and emits one `SymbolInfo` frame at a time, terminating with exactly one `EndStream { reason, emitted, total_candidates, error? }` frame. Reasons: `budget_reached`, `exhausted`, `error`, `cursor_out_of_range`, `file_not_indexed`. Replaces the "fetch top-k, locally truncate" pattern. `protocol_version` bumped to `2`. Spec §9.2. +- [x] **`EmbedText { text, model? }`** — embed an arbitrary text string and return the raw vector. Closes the gap left by `EmbeddingBatch` (URI-only) and `QueryNearestByText` (discards the vector). Returns `EmbedTextResult { vector: Vec, embedding_model: String }`. Not permitted inside `BatchQuery`. +- [x] **`RegisterTier3Source { source: Tier3Source }`** — expose provenance for SCIP-import batches. `Tier3Source { source_id, tool_name, tool_version, project_root, imported_at_ms }` records what producer generated the symbols and when. `IndexStatusResult.tier3_sources` is `#[serde(default)]`; older daemons yield an empty vector. Not permitted inside `BatchQuery` (mutation). +- [x] **`HandshakeResult.supported_messages`** — handshake response now lists every `ClientMessage` type tag the daemon understands. Clients probe for individual messages without version-integer comparison. `#[serde(default)]`; older daemons yield empty vector. +- [x] **`ServerMessage::UnknownMessage { message_type, supported }`** — unknown type tag returns this response *and keeps the socket open*, enabling graceful client downgrade. 
+- [x] **`ErrorCode`** enum — stable machine-readable error categories: `unknown_message_type`, `unknown_model`, `embedding_not_configured`, `no_embedding`, `cursor_out_of_range`, `index_locked`, `invalid_request`, `internal` (default). +- [x] **`ClientMessage::variant_tag` + `supported_messages_covers_all_variants` test** — compile-time exhaustive match plus paired test that fails when a new variant is added without being advertised in `supported_messages()`. + +### v2.2 — Function-level blast radius + intelligence layer ✓ + +- [x] **`NearestItem.embedding_model`** — every nearest-neighbour hit now carries the model name that produced its stored embedding. Optional / `skip_serializing_if = None`; older clients see no change. Populated by `nearest_by_vector`, `nearest_symbol_by_vector`, and `outliers`. +- [x] **Function-level blast radius** — `QueryBlastRadiusBatch` semantic enrichment now uses per-symbol embeddings when available. `semantic_items[].symbol_uri` is populated when `EmbeddingBatch` has been called with `lip://` URIs (function-level chunks); falls back to file embeddings transparently. +- [x] **`ReindexStale { uris, max_age_seconds }`** — atomic check-then-reindex. Re-reads from disk only URIs that are unindexed or whose `last_indexed_at` exceeds the threshold. `max_age_seconds = 0` forces unconditional reindex. Returns `ReindexStaleResult { reindexed, skipped }`. Not permitted inside `BatchQuery`. +- [x] **`BatchFileStatus { uris }`** — query index and embedding status for multiple files in one round-trip. Returns `BatchFileStatusResult { entries: Vec }`. Safe inside `BatchQuery`. +- [x] **`QueryAbiHash { uri }`** — stable SHA-256 hex hash over a file's exported API surface (exported symbol URIs + kinds + signatures, sorted). Hash change ↔ public interface change — safe as a downstream recompilation or re-verification trigger. Returns `AbiHashResult { uri, hash: Option }`. Safe inside `BatchQuery`. 
+- [x] **Tier 1.5 Datalog inference** — `LipDatabase::run_tier1_5_inference()` fixed-point loop: (1) callee elevation when all callers ≥ 80 confidence → raise to 65; (2) exported leaf symbols with ≥ 40 confidence → +5 capped at 65. Never lowers confidence; ceiling 65 leaves headroom for Tier 2. +- [x] **Tier 2 backoff recovery** — language server backends recover from transient crashes with exponential backoff (2–300 s, permanent disable only after 8 failures). `BackoffState { failure_count, available_after }` per backend. Replaces immediate permanent disable on first crash. + ### v1.2 — In progress - [ ] FlatBuffers binary IPC — replace JSON wire framing with generated FlatBuffers tables @@ -1309,5 +1329,5 @@ via the `flatbuffers` crate, aligning with LIP's reference implementation langua --- -*LIP Specification v2.0.1 · April 2026 · MIT License* +*LIP Specification v2.2.0 · April 2026 · MIT License* *Lisa Welsch* diff --git a/website/src/pages/docs/daemon.mdx b/website/src/pages/docs/daemon.mdx index de908c1..07f22b2 100644 --- a/website/src/pages/docs/daemon.mdx +++ b/website/src/pages/docs/daemon.mdx @@ -103,6 +103,9 @@ Each connection handles one request/response pair. The protocol is synchronous p | `QueryNearestByText` | Top-K nearest files by text query | | `QueryIndexStatus` | Daemon health and embedding coverage | | `QueryFileStatus` | Per-file index and embedding status | +| `ReindexStale` | Atomic reindex if stale — re-reads only URIs that are unindexed or older than `max_age_seconds` | +| `BatchFileStatus` | Query index status for multiple files in one round-trip (batchable) | +| `QueryAbiHash` | SHA-256 over a file's exported API surface — stable recompilation trigger (batchable) | | `BatchQuery` | Multiple queries in one round-trip | **Acknowledgment:** Every `Delta` receives a `DeltaAck { seq, accepted }` response, eliminating the fire-and-forget drift that LSP is known for. 
diff --git a/website/src/pages/docs/spec.mdx b/website/src/pages/docs/spec.mdx index 3bd9ea0..1c35189 100644 --- a/website/src/pages/docs/spec.mdx +++ b/website/src/pages/docs/spec.mdx @@ -1148,6 +1148,26 @@ lip-protocol/ - [x] **`ExplainMatch { query, result_uri, top_k, chunk_lines, model }`** — explain *why* a result file ranked as a strong match. Chunks `result_uri`'s source into line-windows, batch-embeds each, and cosine-scores against the query embedding. Returns `ExplainMatchResult { chunks: Vec, query_model }`. Not permitted inside `BatchQuery`. New MCP tool: `lip_explain_match`. - [x] **Model provenance** — every embedding now records the model name that produced it. `QueryFileStatus` returns `embedding_model: Option`. `QueryIndexStatus` returns `mixed_models: bool` and `models_in_index: Vec` with a `⚠ MIXED MODELS` warning when cosine scores are unreliable across a model upgrade boundary. +### v2.1 — Streaming context + forward-compat primitives ✓ + +- [x] **`StreamContext { file_uri, cursor_position, max_tokens, model? }`** — streaming wire message. Daemon ranks symbols relevant to the cursor and emits one `SymbolInfo` frame at a time, terminating with exactly one `EndStream { reason, emitted, total_candidates, error? }` frame. Reasons: `budget_reached`, `exhausted`, `error`, `cursor_out_of_range`, `file_not_indexed`. Replaces the "fetch top-k, locally truncate" pattern. `protocol_version` bumped to `2`. Spec §9.2. +- [x] **`EmbedText { text, model? }`** — embed an arbitrary text string and return the raw vector. Closes the gap left by `EmbeddingBatch` (URI-only) and `QueryNearestByText` (discards the vector). Returns `EmbedTextResult { vector: Vec, embedding_model: String }`. Not permitted inside `BatchQuery`. +- [x] **`RegisterTier3Source { source: Tier3Source }`** — expose provenance for SCIP-import batches. `Tier3Source { source_id, tool_name, tool_version, project_root, imported_at_ms }` records what producer generated the symbols and when. 
`IndexStatusResult.tier3_sources` is `#[serde(default)]`; older daemons yield an empty vector. Not permitted inside `BatchQuery` (mutation). +- [x] **`HandshakeResult.supported_messages`** — handshake response now lists every `ClientMessage` type tag the daemon understands. Clients probe for individual messages (`stream_context`, `embed_text`, …) without version-integer comparison. `#[serde(default)]`; older daemons yield empty vector. +- [x] **`ServerMessage::UnknownMessage { message_type, supported }`** — unknown type tag returns this response *and keeps the socket open*, enabling graceful client downgrade. +- [x] **`ErrorCode`** enum — stable machine-readable error categories: `unknown_message_type`, `unknown_model`, `embedding_not_configured`, `no_embedding`, `cursor_out_of_range`, `index_locked`, `invalid_request`, `internal` (default). Clients branch on this instead of string-matching `message`. +- [x] **`ClientMessage::variant_tag` + `supported_messages_covers_all_variants` test** — drift guard: compile-time exhaustive match plus paired test that fails when a new `ClientMessage` variant is added without being advertised in `supported_messages()`. + +### v2.2 — Function-level blast radius + intelligence layer ✓ + +- [x] **`NearestItem.embedding_model`** — every nearest-neighbour hit now carries the model name that produced its stored embedding. Optional / `skip_serializing_if = None`; older clients see no change. Populated by `nearest_by_vector`, `nearest_symbol_by_vector`, and `outliers`. +- [x] **Function-level blast radius** — `QueryBlastRadiusBatch` semantic enrichment now uses per-symbol embeddings when available. `semantic_items[].symbol_uri` is populated when `EmbeddingBatch` has been called with `lip://` URIs (function-level chunks); falls back to file embeddings transparently. +- [x] **`ReindexStale { uris, max_age_seconds }`** — atomic check-then-reindex. 
Re-reads from disk only URIs that are unindexed or whose `last_indexed_at` timestamp exceeds the threshold. `max_age_seconds = 0` forces unconditional reindex. Returns `ReindexStaleResult { reindexed, skipped }`. Replaces the manual `QueryFileStatus` → `ReindexFiles` race. Not permitted inside `BatchQuery`. +- [x] **`BatchFileStatus { uris }`** — query index and embedding status for multiple files in one round-trip. Returns `BatchFileStatusResult { entries: Vec }`. Safe inside `BatchQuery`. +- [x] **`QueryAbiHash { uri }`** — stable SHA-256 hex hash over a file's exported API surface (exported symbol URIs + kinds + signatures, sorted). Hash change ↔ public interface change — safe as a downstream recompilation or re-verification trigger (Kotlin IC model). Returns `AbiHashResult { uri, hash: Option }`. Safe inside `BatchQuery`. +- [x] **Tier 1.5 Datalog inference** — `LipDatabase::run_tier1_5_inference()` runs a fixed-point loop applying two conservative rules: (1) if every direct caller of a symbol is at confidence ≥ 80, raise the callee to 65; (2) exported symbols with no local callers are raised by 5 points (capped at 65). Never lowers confidence; ceiling 65 leaves headroom for Tier 2. +- [x] **Tier 2 backoff recovery** — language server backends now recover from transient crashes with exponential backoff (2–300 s, permanent disable only after 8 consecutive failures). `BackoffState { failure_count, available_after }` tracks per-backend state. Replaces immediate permanent disable on first crash. 
+ ### v1.2 — In progress - [ ] FlatBuffers binary IPC — replace JSON wire framing with generated FlatBuffers tables From c4c28e76b66c01c96d28bbb8b9963ff32a2c849e Mon Sep 17 00:00:00 2001 From: Lisa Date: Tue, 21 Apr 2026 11:28:54 +0200 Subject: [PATCH 09/18] =?UTF-8?q?feat:=20v2.3.0=20=E2=80=94=20CKB=20struct?= =?UTF-8?q?ural-parity=20bundle?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ships 5 additive features so CKB can retire its duplicate SCIP parser. Protocol version stays at 2; every new field uses serde defaults + skip_serializing_if. Drift-guard test covers supported_messages and variant_tag for the two new client messages. #1 Rich symbol metadata — signature_normalized, modifiers, visibility + visibility_confidence, container_name, extraction_tier, modifiers_source on OwnedSymbolInfo. Tier-1 populates the structural fields; SCIP importer derives modifiers via prefix-parse and uses upstream-compatible enclosing_symbol=8. #2 Reference classification — ReferenceKind (Unknown/Call/Read/Write/ Type/Implements/Extends) + is_test on OwnedOccurrence. Tier-1 classifier uses tree-sitter parent/field lookup; SCIP import/export maps to SymbolRole::Read/WriteAccess and Test bits. #3 QueryBlastRadiusSymbol — single-symbol wrapper around blast_radius_for_symbol with semantic enrichment; returns None for unknown or unindexed symbols. #4 QueryOutgoingCalls — forward call-graph BFS. New caller_to_callees index mirrors the reverse map, populated in upsert paths and cleaned in remove_file_call_edges. Depth clamped [1,8]; NODE_LIMIT=200 with truncated flag. #5 Ranked workspace symbols — kind_filter, scope, modifier_filter on QueryWorkspaceSymbols; WorkspaceSymbolsResult gains ranked: Vec with tiered scoring (Exact=1.0 / Prefix=0.8 / Fuzzy=0.5). Empty query preserves pre-v2.3 behavior (ranked=[]). 21 integration tests green; new coverage for every feature. 
Co-Authored-By: Claude Sonnet 4.6 --- Cargo.lock | 6 +- bindings/rust/src/bridge/lsp_server.rs | 5 +- bindings/rust/src/bridge/translate.rs | 2 + bindings/rust/src/daemon/journal.rs | 7 +- bindings/rust/src/daemon/session.rs | 76 +- bindings/rust/src/daemon/tier2_manager.rs | 5 + bindings/rust/src/indexer/symbol_extractor.rs | 847 +++++++++++++++-- bindings/rust/src/indexer/tier1.rs | 319 +++++++ bindings/rust/src/indexer/tier2/clangd.rs | 33 +- bindings/rust/src/indexer/tier2/dart_ls.rs | 28 +- bindings/rust/src/indexer/tier2/enrich.rs | 238 +++++ bindings/rust/src/indexer/tier2/gopls.rs | 28 +- bindings/rust/src/indexer/tier2/kotlin.rs | 28 +- bindings/rust/src/indexer/tier2/mod.rs | 1 + bindings/rust/src/indexer/tier2/py_ls.rs | 19 +- .../rust/src/indexer/tier2/rust_analyzer.rs | 29 +- bindings/rust/src/indexer/tier2/swift_ls.rs | 28 +- bindings/rust/src/indexer/tier2/ts_server.rs | 38 +- bindings/rust/src/query_graph/db.rs | 303 ++++++- bindings/rust/src/query_graph/types.rs | 96 ++ bindings/rust/src/schema/mod.rs | 10 +- bindings/rust/src/schema/signature.rs | 344 +++++++ bindings/rust/src/schema/types.rs | 213 ++++- bindings/rust/src/schema/visibility.rs | 356 ++++++++ bindings/rust/tests/integration.rs | 851 +++++++++++++++++- tools/lip-cli/src/cmd/export.rs | 75 +- tools/lip-cli/src/cmd/import.rs | 397 +++++++- tools/lip-cli/src/cmd/mcp.rs | 5 +- tools/lip-cli/src/cmd/query.rs | 3 + tools/lip-cli/src/proto/scip.proto | 3 + 30 files changed, 4212 insertions(+), 181 deletions(-) create mode 100644 bindings/rust/src/indexer/tier2/enrich.rs create mode 100644 bindings/rust/src/schema/signature.rs create mode 100644 bindings/rust/src/schema/visibility.rs diff --git a/Cargo.lock b/Cargo.lock index ee812bf..31fbf39 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1074,7 +1074,7 @@ checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "lip-cli" -version = "2.1.1" +version = "2.2.0" dependencies = [ "anyhow", "clap", @@ -1093,7 
+1093,7 @@ dependencies = [ [[package]] name = "lip-core" -version = "2.1.1" +version = "2.2.0" dependencies = [ "anyhow", "criterion", @@ -1130,7 +1130,7 @@ dependencies = [ [[package]] name = "lip-registry" -version = "2.1.1" +version = "2.2.0" dependencies = [ "anyhow", "axum", diff --git a/bindings/rust/src/bridge/lsp_server.rs b/bindings/rust/src/bridge/lsp_server.rs index 69a2aa6..a8da2b5 100644 --- a/bindings/rust/src/bridge/lsp_server.rs +++ b/bindings/rust/src/bridge/lsp_server.rs @@ -350,12 +350,15 @@ impl LanguageServer for LipLspBackend { .rpc(ClientMessage::QueryWorkspaceSymbols { query: params.query, limit: Some(100), + kind_filter: None, + scope: None, + modifier_filter: None, }) .await .map_err(Self::to_rpc_error)?; let syms = match resp { - ServerMessage::WorkspaceSymbolsResult { symbols } => symbols, + ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols, _ => return Ok(None), }; diff --git a/bindings/rust/src/bridge/translate.rs b/bindings/rust/src/bridge/translate.rs index 41f2db9..07b4b01 100644 --- a/bindings/rust/src/bridge/translate.rs +++ b/bindings/rust/src/bridge/translate.rs @@ -338,6 +338,8 @@ mod tests { confidence_score: 20, role: crate::schema::Role::Reference, override_doc: None, + kind: crate::schema::ReferenceKind::Unknown, + is_test: false, }; let locs = occurrences_to_locations(&[valid], "file:///src/a.rs"); assert_eq!(locs.len(), 1); diff --git a/bindings/rust/src/daemon/journal.rs b/bindings/rust/src/daemon/journal.rs index f3c0e1c..515a5c5 100644 --- a/bindings/rust/src/daemon/journal.rs +++ b/bindings/rust/src/daemon/journal.rs @@ -398,7 +398,9 @@ mod tests { #[test] fn precomputed_survives_compact_replay() { - use crate::schema::{OwnedOccurrence, OwnedRange, OwnedSymbolInfo, Role, SymbolKind}; + use crate::schema::{ + OwnedOccurrence, OwnedRange, OwnedSymbolInfo, ReferenceKind, Role, SymbolKind, + }; let tmp = NamedTempFile::new().unwrap(); let path = tmp.path().to_owned(); @@ -416,6 +418,7 @@ mod tests { 
taint_labels: vec![], blast_radius: 0, is_exported: false, + ..Default::default() }; let occ = OwnedOccurrence { symbol_uri: "lip://local/lib.rs#Foo".into(), @@ -428,6 +431,8 @@ mod tests { confidence_score: 90, role: Role::Definition, override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, }; // Write a precomputed entry. diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index a7556a1..096e6ce 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -412,11 +412,44 @@ impl Session { } } - ClientMessage::QueryWorkspaceSymbols { query, limit } => { + ClientMessage::QueryBlastRadiusSymbol { + symbol_uri, + min_score, + } => { + let mut db = self.db.lock().await; + let result = db.blast_radius_for_symbol(&symbol_uri, min_score); + ServerMessage::BlastRadiusSymbolResult { result } + } + + ClientMessage::QueryOutgoingCalls { symbol_uri, depth } => { + let db = self.db.lock().await; + let (pairs, truncated) = db.outgoing_calls(&symbol_uri, depth); + let edges = pairs + .into_iter() + .map(|(from_uri, to_uri)| { + crate::query_graph::types::OutgoingCallEdge { from_uri, to_uri } + }) + .collect(); + ServerMessage::OutgoingCallsResult { edges, truncated } + } + + ClientMessage::QueryWorkspaceSymbols { + query, + limit, + kind_filter, + scope, + modifier_filter, + } => { let limit = limit.unwrap_or(100); let mut db = self.db.lock().await; - let syms = db.workspace_symbols(&query, limit); - ServerMessage::WorkspaceSymbolsResult { symbols: syms } + let (symbols, ranked) = db.workspace_symbols_ranked( + &query, + limit, + kind_filter.as_deref(), + scope.as_deref(), + modifier_filter.as_deref(), + ); + ServerMessage::WorkspaceSymbolsResult { symbols, ranked } } ClientMessage::QueryDocumentSymbols { uri } => { @@ -1980,10 +2013,41 @@ fn process_query_sync( }) } - ClientMessage::QueryWorkspaceSymbols { query, limit } => { + ClientMessage::QueryBlastRadiusSymbol { + symbol_uri, + min_score, + } 
=> { + let result = db.blast_radius_for_symbol(&symbol_uri, min_score); + ok(ServerMessage::BlastRadiusSymbolResult { result }) + } + + ClientMessage::QueryOutgoingCalls { symbol_uri, depth } => { + let (pairs, truncated) = db.outgoing_calls(&symbol_uri, depth); + let edges = pairs + .into_iter() + .map(|(from_uri, to_uri)| { + crate::query_graph::types::OutgoingCallEdge { from_uri, to_uri } + }) + .collect(); + ok(ServerMessage::OutgoingCallsResult { edges, truncated }) + } + + ClientMessage::QueryWorkspaceSymbols { + query, + limit, + kind_filter, + scope, + modifier_filter, + } => { let limit = limit.unwrap_or(100); - let syms = db.workspace_symbols(&query, limit); - ok(ServerMessage::WorkspaceSymbolsResult { symbols: syms }) + let (symbols, ranked) = db.workspace_symbols_ranked( + &query, + limit, + kind_filter.as_deref(), + scope.as_deref(), + modifier_filter.as_deref(), + ); + ok(ServerMessage::WorkspaceSymbolsResult { symbols, ranked }) } ClientMessage::QueryDocumentSymbols { uri } => { diff --git a/bindings/rust/src/daemon/tier2_manager.rs b/bindings/rust/src/daemon/tier2_manager.rs index e2e80e7..03744dc 100644 --- a/bindings/rust/src/daemon/tier2_manager.rs +++ b/bindings/rust/src/daemon/tier2_manager.rs @@ -814,6 +814,7 @@ mod tests { taint_labels: vec![], blast_radius: 0, is_exported: false, + ..Default::default() } } @@ -1121,6 +1122,7 @@ mod tests { taint_labels: vec![], blast_radius: 0, is_exported: false, + ..Default::default() }; db.upsert_file_precomputed( @@ -1152,6 +1154,7 @@ mod tests { taint_labels: vec![], blast_radius: 0, is_exported: true, + ..Default::default() }; db.upgrade_file_symbols(file_uri, &[upgrade]); @@ -1196,6 +1199,7 @@ mod tests { taint_labels: vec![], blast_radius: 0, is_exported: false, + ..Default::default() }; db.upsert_file_precomputed( @@ -1221,6 +1225,7 @@ mod tests { taint_labels: vec![], blast_radius: 0, is_exported: false, + ..Default::default() }; db.upgrade_file_symbols(file_uri, &[stale]); diff --git 
a/bindings/rust/src/indexer/symbol_extractor.rs b/bindings/rust/src/indexer/symbol_extractor.rs index 7a421a6..fd26b52 100644 --- a/bindings/rust/src/indexer/symbol_extractor.rs +++ b/bindings/rust/src/indexer/symbol_extractor.rs @@ -1,7 +1,8 @@ use tree_sitter::{Node, Tree}; use crate::schema::{ - EdgeKind, OwnedGraphEdge, OwnedOccurrence, OwnedRange, OwnedSymbolInfo, Role, SymbolKind, + normalize_signature, visibility, EdgeKind, ExtractionTier, OwnedGraphEdge, OwnedOccurrence, + OwnedRange, OwnedSymbolInfo, ReferenceKind, Role, SymbolKind, }; use super::language::Language; @@ -85,6 +86,119 @@ impl<'a> SymbolExtractor<'a> { format!("lip://local/{path}#{name}") } + /// Heuristic: is the current file a test file? + /// + /// Looks at path segments and filename patterns common across ecosystems. + /// Conservative — matches cleanly-named test files; misses configurable + /// test dirs (e.g. Python `conftest.py`) and inline `#[cfg(test)]` modules. + /// Tier-2 can refine per-file with compiler-level knowledge. + fn is_test_file(&self) -> bool { + let u = self.file_uri; + u.contains("/tests/") + || u.contains("/test/") + || u.contains("/__tests__/") + || u.contains("/spec/") + || u.contains(".test.") + || u.contains(".spec.") + || u.contains("_test.") + || u.ends_with("Test.java") + || u.ends_with("Test.kt") + || u.ends_with("Tests.swift") + } + + /// Classify a reference occurrence based on its tree-sitter parent context. + /// + /// Returns `Call` when the identifier is the callee of a call expression, + /// `Write` when it is the LHS of an assignment, otherwise `Read`. Type / + /// Implements / Extends classification requires Tier-2 type info and is + /// left to the LSP backends. Returns `Unknown` when the node has no + /// parent (a bare module). 
+ fn classify_ref_kind(&self, node: &Node) -> ReferenceKind { + let Some(parent) = node.parent() else { + return ReferenceKind::Unknown; + }; + let pk = parent.kind(); + + // Call site: the identifier is the function/method being invoked. + let is_call_parent = matches!( + (self.language, pk), + (Language::Rust, "call_expression" | "macro_invocation") + | ( + Language::TypeScript + | Language::JavaScript + | Language::JavaScriptReact, + "call_expression" | "new_expression" + ) + | (Language::Python, "call") + | (Language::Go, "call_expression") + | (Language::C | Language::Cpp, "call_expression") + | ( + Language::Dart, + "method_invocation" | "function_expression_invocation" + ) + | (Language::Kotlin, "call_expression") + | (Language::Swift, "call_expression") + ); + if is_call_parent { + // When the call has a receiver (`obj.method()`), only the method + // identifier is the callee — the receiver is still a Read. + let callee_field = parent + .child_by_field_name("function") + .or_else(|| parent.child_by_field_name("method")); + let is_callee = callee_field + .map(|f| { + f.id() == node.id() + || f.child_by_field_name("property") + .map(|p| p.id() == node.id()) + .unwrap_or(false) + || f.child_by_field_name("field") + .map(|p| p.id() == node.id()) + .unwrap_or(false) + }) + .unwrap_or(true); + if is_callee { + return ReferenceKind::Call; + } + } + + // Assignment LHS → Write. Covers Python/TS/JS `=`, augmented variants, + // and Rust assignment_expression. + let is_assign_lhs = matches!( + pk, + "assignment_expression" + | "assignment" + | "augmented_assignment_expression" + | "augmented_assignment" + | "compound_assignment_expr" + ) && parent + .child_by_field_name("left") + .map(|c| c.id() == node.id()) + .unwrap_or(false); + if is_assign_lhs { + return ReferenceKind::Write; + } + + ReferenceKind::Read + } + + /// Build a Tier-1 occurrence with v2.3 classification fields populated. 
+ fn make_occurrence(&self, node: &Node, name: &str, role: Role) -> OwnedOccurrence { + let kind = if matches!(role, Role::Reference) { + self.classify_ref_kind(node) + } else { + ReferenceKind::Unknown + }; + OwnedOccurrence { + symbol_uri: self.lip_uri(name), + range: Self::node_range(node), + confidence_score: 20, + role, + override_doc: None, + kind, + is_test: self.is_test_file(), + } + } + // ── Rust ───────────────────────────────────────────────────────────────── fn walk_symbols(&self, node: Node, out: &mut Vec) { @@ -143,19 +257,29 @@ impl<'a> SymbolExtractor<'a> { if let Some(name_node) = node.child_by_field_name(name_field) { let name = self.node_text(&name_node); if !name.is_empty() { - // `pub` keyword is a `visibility_modifier` child; check node text as a - // fast heuristic. Covers `pub fn`, `pub struct`, `pub(crate)` etc. - let is_exported = (0..node.child_count()).any(|i| { - node.child(i) - .map(|c| c.kind() == "visibility_modifier") - .unwrap_or(false) - }); + let modifiers = self.rust_modifiers(&node); + let is_exported = + modifiers.iter().any(|m| m == "pub" || m.starts_with("pub(")); + let (vis, vc) = visibility::infer(name, &modifiers, self.language); + let container = self.rust_container(&node); + let signature = self.rust_signature(&node); + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, self.language)); + out.push(OwnedSymbolInfo { uri: self.lip_uri(name), display_name: name.to_owned(), kind, confidence_score: 30, is_exported, + modifiers, + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + container_name: container, + signature, + signature_normalized, + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -169,6 +293,74 @@ impl<'a> SymbolExtractor<'a> { } } + /// Collect Rust modifier keywords adjacent to a definition node. 
+ /// + /// Picks up `visibility_modifier` children (`pub`, `pub(crate)`, …) plus + /// `function_modifiers` tokens (`async`, `unsafe`, `const`, `extern`). + fn rust_modifiers(&self, node: &Node) -> Vec { + let mut mods = Vec::new(); + for i in 0..node.child_count() { + let Some(child) = node.child(i) else { continue }; + match child.kind() { + "visibility_modifier" => { + let text = self.node_text(&child).trim().to_owned(); + if !text.is_empty() { + mods.push(text); + } + } + "function_modifiers" => { + for j in 0..child.child_count() { + if let Some(m) = child.child(j) { + let t = self.node_text(&m).trim(); + if !t.is_empty() { + mods.push(t.to_owned()); + } + } + } + } + _ => {} + } + } + mods + } + + /// Walk up from a definition node to find the enclosing container name. + /// + /// Recognizes `impl`/`trait`/`struct`/`enum`/`mod` ancestors; returns the + /// first container found, or `None` at top level. + fn rust_container(&self, node: &Node) -> Option { + let mut cur = node.parent(); + while let Some(n) = cur { + match n.kind() { + "impl_item" => { + if let Some(ty) = n.child_by_field_name("type") { + return Some(self.node_text(&ty).trim().to_owned()); + } + } + "trait_item" | "struct_item" | "enum_item" | "mod_item" | "union_item" => { + if let Some(name) = n.child_by_field_name("name") { + return Some(self.node_text(&name).trim().to_owned()); + } + } + _ => {} + } + cur = n.parent(); + } + None + } + + /// Declaration head for a Rust function (everything before the body block). + /// Returns `None` for non-function items. 
+ fn rust_signature(&self, node: &Node) -> Option { + if node.kind() != "function_item" { + return None; + } + let body = node.child_by_field_name("body")?; + let text = + std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + Some(text.trim().to_owned()) + } + fn rust_occurrences(&self, node: Node, out: &mut Vec) { if matches!(node.kind(), "identifier" | "type_identifier") { let name = self.node_text(&node); @@ -198,13 +390,7 @@ impl<'a> SymbolExtractor<'a> { Role::Reference } }); - out.push(OwnedOccurrence { - symbol_uri: self.lip_uri(name), - range: Self::node_range(&node), - confidence_score: 20, - role, - override_doc: None, - }); + out.push(self.make_occurrence(&node, name, role)); } } for i in 0..node.child_count() { @@ -273,12 +459,30 @@ impl<'a> SymbolExtractor<'a> { .parent() .map(|p| p.kind() == "export_statement") .unwrap_or(false); + let mut modifiers = self.ts_modifiers(&node); + if is_exported { + modifiers.push("export".to_owned()); + } + let (vis, vc) = visibility::infer(name, &modifiers, self.language); + let container = self.ts_container(&node); + let signature = self.ts_signature(&node); + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, self.language)); + out.push(OwnedSymbolInfo { uri: self.lip_uri(name), display_name: name.to_owned(), kind, confidence_score: 30, is_exported, + modifiers, + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + container_name: container, + signature, + signature_normalized, + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -291,6 +495,67 @@ impl<'a> SymbolExtractor<'a> { } } + /// Collect TypeScript/JavaScript modifier keywords from a declaration node. 
+ fn ts_modifiers(&self, node: &Node) -> Vec { + const KEYWORDS: &[&str] = &[ + "async", + "static", + "readonly", + "abstract", + "override", + "public", + "private", + "protected", + "declare", + ]; + let mut mods = Vec::new(); + // Direct children may carry keyword tokens or an `accessibility_modifier`. + for i in 0..node.child_count() { + let Some(child) = node.child(i) else { continue }; + if KEYWORDS.contains(&child.kind()) { + mods.push(child.kind().to_owned()); + } + if child.kind() == "accessibility_modifier" { + let text = self.node_text(&child).trim().to_owned(); + if !text.is_empty() { + mods.push(text); + } + } + } + mods + } + + /// Walk up for the enclosing class/interface container. + fn ts_container(&self, node: &Node) -> Option { + let mut cur = node.parent(); + while let Some(n) = cur { + match n.kind() { + "class_declaration" + | "interface_declaration" + | "enum_declaration" + | "abstract_class_declaration" => { + if let Some(name) = n.child_by_field_name("name") { + return Some(self.node_text(&name).trim().to_owned()); + } + } + _ => {} + } + cur = n.parent(); + } + None + } + + /// Declaration head for a TS/JS function or method (text before body). 
+ fn ts_signature(&self, node: &Node) -> Option { + if !matches!(node.kind(), "function_declaration" | "method_definition") { + return None; + } + let body = node.child_by_field_name("body")?; + let text = + std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + Some(text.trim().to_owned()) + } + fn ts_occurrences(&self, node: Node, out: &mut Vec) { if matches!(node.kind(), "identifier" | "type_identifier") { let name = self.node_text(&node); @@ -316,13 +581,7 @@ impl<'a> SymbolExtractor<'a> { Role::Reference } }); - out.push(OwnedOccurrence { - symbol_uri: self.lip_uri(name), - range: Self::node_range(&node), - confidence_score: 20, - role, - override_doc: None, - }); + out.push(self.make_occurrence(&node, name, role)); } } for i in 0..node.child_count() { @@ -361,12 +620,27 @@ impl<'a> SymbolExtractor<'a> { if !name.is_empty() { // Python convention: names starting with _ are private. let is_exported = !name.starts_with('_'); + let modifiers: Vec = Vec::new(); + let (vis, vc) = visibility::infer(name, &modifiers, self.language); + let container = self.py_container(&node); + let signature = self.py_signature(&node); + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, self.language)); + out.push(OwnedSymbolInfo { uri: self.lip_uri(name), display_name: name.to_owned(), kind, confidence_score: 30, is_exported, + modifiers, + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + container_name: container, + signature, + signature_normalized, + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -379,6 +653,33 @@ impl<'a> SymbolExtractor<'a> { } } + /// Walk up for the enclosing Python class container. 
+ fn py_container(&self, node: &Node) -> Option { + let mut cur = node.parent(); + while let Some(n) = cur { + if n.kind() == "class_definition" { + if let Some(name) = n.child_by_field_name("name") { + return Some(self.node_text(&name).trim().to_owned()); + } + } + cur = n.parent(); + } + None + } + + /// Declaration head for a Python function (text before body). + fn py_signature(&self, node: &Node) -> Option { + if node.kind() != "function_definition" { + return None; + } + let body = node.child_by_field_name("body")?; + let text = + std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + // Strip the trailing `:` that separates signature from body. + let trimmed = text.trim().trim_end_matches(':').trim_end(); + Some(trimmed.to_owned()) + } + fn py_occurrences(&self, node: Node, out: &mut Vec) { if node.kind() == "identifier" { let name = self.node_text(&node); @@ -396,13 +697,7 @@ impl<'a> SymbolExtractor<'a> { Role::Reference } }); - out.push(OwnedOccurrence { - symbol_uri: self.lip_uri(name), - range: Self::node_range(&node), - confidence_score: 20, - role, - override_doc: None, - }); + out.push(self.make_occurrence(&node, name, role)); } } for i in 0..node.child_count() { @@ -421,16 +716,31 @@ impl<'a> SymbolExtractor<'a> { // the field "name" of that signature node. // - Classes use `class_definition` (not `class_declaration`), field "name" works. // - `mixin_declaration` exists but its identifier child has no named field. 
- let push = |name: &str, kind: SymbolKind, out: &mut Vec| { + let push = |name: &str, kind: SymbolKind, decl: &Node, out: &mut Vec| { if name.is_empty() { return; } + let modifiers = self.dart_modifiers(decl); + let (vis, vc) = visibility::infer(name, &modifiers, self.language); + let container = self.dart_container(decl); + let signature = self.dart_signature(decl); + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, self.language)); + out.push(OwnedSymbolInfo { uri: self.lip_uri(name), display_name: name.to_owned(), kind, confidence_score: 30, is_exported: !name.starts_with('_'), + modifiers, + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + container_name: container, + signature, + signature_normalized, + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); }; @@ -444,12 +754,12 @@ impl<'a> SymbolExtractor<'a> { .and_then(|sig| sig.child_by_field_name("name")) .map(|n| self.node_text(&n).to_owned()) { - push(&name, SymbolKind::Function, out); + push(&name, SymbolKind::Function, &node, out); } } "class_definition" => { if let Some(name_node) = node.child_by_field_name("name") { - push(self.node_text(&name_node), SymbolKind::Class, out); + push(self.node_text(&name_node), SymbolKind::Class, &node, out); } } "mixin_declaration" => { @@ -459,7 +769,7 @@ impl<'a> SymbolExtractor<'a> { .find(|c| c.kind() == "identifier") .map(|c| self.node_text(&c).to_owned()) .unwrap_or_default(); - push(&name, SymbolKind::Class, out); + push(&name, SymbolKind::Class, &node, out); } "method_declaration" | "constructor_declaration" @@ -471,12 +781,17 @@ impl<'a> SymbolExtractor<'a> { SymbolKind::Method }; if let Some(name_node) = node.child_by_field_name("name") { - push(self.node_text(&name_node), kind, out); + push(self.node_text(&name_node), kind, &node, out); } } "extension_declaration" => { if let Some(name_node) = node.child_by_field_name("name") { - push(self.node_text(&name_node), 
SymbolKind::Namespace, out); + push( + self.node_text(&name_node), + SymbolKind::Namespace, + &node, + out, + ); } } _ => {} @@ -489,6 +804,50 @@ impl<'a> SymbolExtractor<'a> { } } + /// Collect Dart modifier keywords from a declaration node's direct children. + fn dart_modifiers(&self, node: &Node) -> Vec { + const KEYWORDS: &[&str] = &[ + "static", "abstract", "final", "const", "external", "factory", "late", "covariant", + ]; + collect_matching_keywords(*node, KEYWORDS) + } + + /// Walk up for the enclosing Dart class/mixin/extension container. + fn dart_container(&self, node: &Node) -> Option { + let mut cur = node.parent(); + while let Some(n) = cur { + match n.kind() { + "class_definition" | "extension_declaration" => { + if let Some(name) = n.child_by_field_name("name") { + return Some(self.node_text(&name).trim().to_owned()); + } + } + "mixin_declaration" => { + let name = (0..n.named_child_count()) + .filter_map(|i| n.named_child(i)) + .find(|c| c.kind() == "identifier") + .map(|c| self.node_text(&c).trim().to_owned()); + if name.as_deref().map_or(false, |s| !s.is_empty()) { + return name; + } + } + _ => {} + } + cur = n.parent(); + } + None + } + + /// Declaration head for a Dart function or method (text before body). 
+ fn dart_signature(&self, node: &Node) -> Option { + let body = node + .child_by_field_name("body") + .or_else(|| node.child_by_field_name("function_body"))?; + let text = + std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + Some(text.trim().to_owned()) + } + fn dart_occurrences(&self, node: Node, out: &mut Vec) { if node.kind() == "identifier" { let name = self.node_text(&node); @@ -525,13 +884,7 @@ impl<'a> SymbolExtractor<'a> { Role::Reference } }); - out.push(OwnedOccurrence { - symbol_uri: self.lip_uri(name), - range: Self::node_range(&node), - confidence_score: 20, - role, - override_doc: None, - }); + out.push(self.make_occurrence(&node, name, role)); } } for i in 0..node.child_count() { @@ -725,7 +1078,7 @@ impl<'a> SymbolExtractor<'a> { /// nodes: `function_declarator → pointer_declarator → identifier`. fn c_declarator_name(&self, node: Node) -> Option { match node.kind() { - "identifier" => { + "identifier" | "field_identifier" => { let name = self.node_text(&node); if name.is_empty() { None @@ -738,9 +1091,31 @@ impl<'a> SymbolExtractor<'a> { | "array_declarator" | "abstract_function_declarator" | "parenthesized_declarator" - | "reference_declarator" => node - .child_by_field_name("declarator") - .and_then(|child| self.c_declarator_name(child)), + | "reference_declarator" => { + // Try the "declarator" field first; if absent (e.g. C++ member + // functions have the field_identifier as a direct child without + // a named field), fall back to scanning named children. 
+ if let Some(child) = node.child_by_field_name("declarator") { + return self.c_declarator_name(child); + } + for i in 0..node.named_child_count() { + let Some(c) = node.named_child(i) else { continue }; + if matches!( + c.kind(), + "identifier" + | "field_identifier" + | "function_declarator" + | "pointer_declarator" + | "reference_declarator" + | "parenthesized_declarator" + ) { + if let Some(n) = self.c_declarator_name(c) { + return Some(n); + } + } + } + None + } _ => None, } } @@ -764,12 +1139,24 @@ impl<'a> SymbolExtractor<'a> { "function_definition" => { if let Some(name) = self.c_function_name(&node) { let is_exported = !self.c_has_static_storage(&node); + let modifiers = self.c_modifiers(&node); + let (vis, vc) = visibility::infer(&name, &modifiers, self.language); + let signature = self.c_signature(&node); + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, self.language)); out.push(OwnedSymbolInfo { uri: self.lip_uri(&name), display_name: name, kind: SymbolKind::Function, confidence_score: 30, is_exported, + modifiers, + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + signature, + signature_normalized, + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -784,6 +1171,9 @@ impl<'a> SymbolExtractor<'a> { kind: SymbolKind::Class, confidence_score: 30, is_exported: true, + visibility: Some(crate::schema::Visibility::Public), + visibility_confidence: Some(0.5), + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -799,6 +1189,9 @@ impl<'a> SymbolExtractor<'a> { kind: SymbolKind::Enum, confidence_score: 30, is_exported: true, + visibility: Some(crate::schema::Visibility::Public), + visibility_confidence: Some(0.5), + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -815,6 +1208,9 @@ impl<'a> SymbolExtractor<'a> { kind: SymbolKind::TypeAlias, confidence_score: 30, is_exported: true, + visibility: 
Some(crate::schema::Visibility::Public), + visibility_confidence: Some(0.5), + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -830,6 +1226,68 @@ impl<'a> SymbolExtractor<'a> { } } + /// Collect C/C++ modifier keywords from a function_definition's direct children. + fn c_modifiers(&self, node: &Node) -> Vec { + const KEYWORDS: &[&str] = &[ + "static", + "extern", + "inline", + "const", + "virtual", + "override", + "final", + "explicit", + "constexpr", + ]; + let mut mods = Vec::new(); + for i in 0..node.child_count() { + let Some(child) = node.child(i) else { continue }; + match child.kind() { + "storage_class_specifier" | "type_qualifier" | "function_specifier" + | "virtual_function_specifier" | "explicit_function_specifier" => { + let t = self.node_text(&child).trim().to_owned(); + if !t.is_empty() { + mods.push(t); + } + } + k if KEYWORDS.contains(&k) => mods.push(k.to_owned()), + _ => {} + } + } + mods + } + + /// Declaration head for a C/C++ function (text before body block). + fn c_signature(&self, node: &Node) -> Option { + if node.kind() != "function_definition" { + return None; + } + let body = node.child_by_field_name("body")?; + let text = + std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + Some(text.trim().to_owned()) + } + + /// Walk up for the enclosing C++ class/struct/namespace container. 
+ fn cpp_container(&self, node: &Node) -> Option { + let mut cur = node.parent(); + while let Some(n) = cur { + match n.kind() { + "class_specifier" + | "struct_specifier" + | "union_specifier" + | "namespace_definition" => { + if let Some(name) = n.child_by_field_name("name") { + return Some(self.node_text(&name).trim().to_owned()); + } + } + _ => {} + } + cur = n.parent(); + } + None + } + fn c_occurrences(&self, node: Node, out: &mut Vec) { if matches!( node.kind(), @@ -860,13 +1318,7 @@ impl<'a> SymbolExtractor<'a> { Role::Reference } }); - out.push(OwnedOccurrence { - symbol_uri: self.lip_uri(name), - range: Self::node_range(&node), - confidence_score: 20, - role, - override_doc: None, - }); + out.push(self.make_occurrence(&node, name, role)); } } for i in 0..node.child_count() { @@ -923,12 +1375,32 @@ impl<'a> SymbolExtractor<'a> { "function_definition" => { if let Some(name) = self.c_function_name(&node) { let is_exported = !self.c_has_static_storage(&node); + let modifiers = self.c_modifiers(&node); + let (vis, vc) = visibility::infer(&name, &modifiers, self.language); + let container = self.cpp_container(&node); + let signature = self.c_signature(&node); + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, self.language)); + // Inside a class body, treat as Method. 
+ let kind = if container.is_some() { + SymbolKind::Method + } else { + SymbolKind::Function + }; out.push(OwnedSymbolInfo { uri: self.lip_uri(&name), display_name: name, - kind: SymbolKind::Function, + kind, confidence_score: 30, is_exported, + modifiers, + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + container_name: container, + signature, + signature_normalized, + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -943,6 +1415,9 @@ impl<'a> SymbolExtractor<'a> { kind: SymbolKind::Class, confidence_score: 30, is_exported: true, + visibility: Some(crate::schema::Visibility::Public), + visibility_confidence: Some(0.5), + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -958,6 +1433,9 @@ impl<'a> SymbolExtractor<'a> { kind: SymbolKind::Enum, confidence_score: 30, is_exported: true, + visibility: Some(crate::schema::Visibility::Public), + visibility_confidence: Some(0.5), + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -974,6 +1452,9 @@ impl<'a> SymbolExtractor<'a> { kind: SymbolKind::TypeAlias, confidence_score: 30, is_exported: true, + visibility: Some(crate::schema::Visibility::Public), + visibility_confidence: Some(0.5), + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -991,6 +1472,9 @@ impl<'a> SymbolExtractor<'a> { kind: SymbolKind::Class, confidence_score: 30, is_exported: true, + visibility: Some(crate::schema::Visibility::Public), + visibility_confidence: Some(0.5), + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -1006,6 +1490,9 @@ impl<'a> SymbolExtractor<'a> { kind: SymbolKind::Namespace, confidence_score: 30, is_exported: true, + visibility: Some(crate::schema::Visibility::Public), + visibility_confidence: Some(0.5), + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -1052,13 +1539,7 @@ impl<'a> SymbolExtractor<'a> { Role::Reference } 
}); - out.push(OwnedOccurrence { - symbol_uri: self.lip_uri(name), - range: Self::node_range(&node), - confidence_score: 20, - role, - override_doc: None, - }); + out.push(self.make_occurrence(&node, name, role)); } } for i in 0..node.child_count() { @@ -1119,12 +1600,22 @@ impl<'a> SymbolExtractor<'a> { if let Some(name_node) = node.child_by_field_name("name") { let name = self.node_text(&name_node).to_owned(); if !name.is_empty() { + let (vis, vc) = visibility::infer(&name, &[], self.language); + let signature = self.go_signature(&node); + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, self.language)); out.push(OwnedSymbolInfo { uri: self.lip_uri(&name), display_name: name.clone(), kind: SymbolKind::Function, confidence_score: 30, is_exported: go_is_exported(&name), + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + signature, + signature_normalized, + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -1134,12 +1625,24 @@ impl<'a> SymbolExtractor<'a> { if let Some(name_node) = node.child_by_field_name("name") { let name = self.node_text(&name_node).to_owned(); if !name.is_empty() { + let (vis, vc) = visibility::infer(&name, &[], self.language); + let container = self.go_receiver_type(&node); + let signature = self.go_signature(&node); + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, self.language)); out.push(OwnedSymbolInfo { uri: self.lip_uri(&name), display_name: name.clone(), kind: SymbolKind::Method, confidence_score: 30, is_exported: go_is_exported(&name), + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + container_name: container, + signature, + signature_normalized, + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -1161,12 +1664,17 @@ impl<'a> SymbolExtractor<'a> { _ => SymbolKind::TypeAlias, }) .unwrap_or(SymbolKind::TypeAlias); + let (vis, vc) = + 
visibility::infer(&name, &[], self.language); out.push(OwnedSymbolInfo { uri: self.lip_uri(&name), display_name: name.clone(), kind, confidence_score: 30, is_exported: go_is_exported(&name), + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -1183,12 +1691,17 @@ impl<'a> SymbolExtractor<'a> { if let Some(name_node) = spec.child_by_field_name("name") { let name = self.node_text(&name_node).to_owned(); if !name.is_empty() { + let (vis, vc) = + visibility::infer(&name, &[], self.language); out.push(OwnedSymbolInfo { uri: self.lip_uri(&name), display_name: name.clone(), kind: SymbolKind::Variable, confidence_score: 25, is_exported: go_is_exported(&name), + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -1207,6 +1720,41 @@ impl<'a> SymbolExtractor<'a> { } } + /// Declaration head for a Go function/method (text before body block). + fn go_signature(&self, node: &Node) -> Option { + if !matches!(node.kind(), "function_declaration" | "method_declaration") { + return None; + } + let body = node.child_by_field_name("body")?; + let text = + std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + Some(text.trim().to_owned()) + } + + /// Receiver-type name for a Go method declaration, e.g. `Foo` in + /// `func (f *Foo) Bar() {}`. + fn go_receiver_type(&self, node: &Node) -> Option { + let recv = node.child_by_field_name("receiver")?; + // `receiver` is a parameter_list; the first parameter_declaration has a + // `type` field that may be `type_identifier` or `pointer_type → type_identifier`. 
+ for i in 0..recv.named_child_count() { + let p = recv.named_child(i)?; + if p.kind() != "parameter_declaration" { + continue; + } + let ty = p.child_by_field_name("type")?; + let ident = match ty.kind() { + "type_identifier" => Some(ty), + "pointer_type" => ty + .named_child(0) + .filter(|c| c.kind() == "type_identifier"), + _ => None, + }; + return ident.map(|n| self.node_text(&n).trim().to_owned()); + } + None + } + fn go_occurrences(&self, node: Node, out: &mut Vec) { if matches!( node.kind(), @@ -1232,13 +1780,7 @@ impl<'a> SymbolExtractor<'a> { Role::Reference } }); - out.push(OwnedOccurrence { - symbol_uri: self.lip_uri(name), - range: Self::node_range(&node), - confidence_score: 20, - role, - override_doc: None, - }); + out.push(self.make_occurrence(&node, name, role)); } } for i in 0..node.child_count() { @@ -1339,12 +1881,26 @@ impl<'a> SymbolExtractor<'a> { let name = self.node_text(&name_node).to_owned(); if !name.is_empty() { let is_exported = kotlin_is_exported(node); + let modifiers = kotlin_modifiers(node); + let (vis, vc) = visibility::infer(&name, &modifiers, self.language); + let container = self.kotlin_container(&node); + let signature = self.kotlin_signature(&node); + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, self.language)); out.push(OwnedSymbolInfo { uri: self.lip_uri(&name), display_name: name, kind: k, confidence_score: 30, is_exported, + modifiers, + visibility: Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + container_name: container, + signature, + signature_normalized, + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -1360,6 +1916,34 @@ impl<'a> SymbolExtractor<'a> { } } + /// Walk up for the enclosing Kotlin class/object container. 
+ fn kotlin_container(&self, node: &Node) -> Option { + let mut cur = node.parent(); + while let Some(n) = cur { + if matches!(n.kind(), "class_declaration" | "object_declaration") { + if let Some(name) = kotlin_first_name_child(n) { + return Some(self.node_text(&name).trim().to_owned()); + } + } + cur = n.parent(); + } + None + } + + /// Declaration head for a Kotlin function (text before body block). + fn kotlin_signature(&self, node: &Node) -> Option { + if node.kind() != "function_declaration" { + return None; + } + // Kotlin bodies come via field "body" (block or expression). + let body = node.child_by_field_name("body")?; + let text = + std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + // Strip trailing `=` that introduces an expression body. + let trimmed = text.trim().trim_end_matches('=').trim_end(); + Some(trimmed.to_owned()) + } + fn kotlin_occurrences(&self, node: Node, out: &mut Vec) { if matches!(node.kind(), "simple_identifier" | "type_identifier") { let name = self.node_text(&node); @@ -1380,13 +1964,7 @@ impl<'a> SymbolExtractor<'a> { Role::Reference } }); - out.push(OwnedOccurrence { - symbol_uri: self.lip_uri(name), - range: Self::node_range(&node), - confidence_score: 20, - role, - override_doc: None, - }); + out.push(self.make_occurrence(&node, name, role)); } } for i in 0..node.child_count() { @@ -1471,12 +2049,26 @@ impl<'a> SymbolExtractor<'a> { let name = self.node_text(&name_node).to_owned(); if !name.is_empty() { let is_exported = swift_is_exported(node); + let modifiers = swift_modifiers(node); + let (vis, vc) = visibility::infer(&name, &modifiers, self.language); + let container = self.swift_container(&node); + let signature = self.swift_signature(&node); + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, self.language)); out.push(OwnedSymbolInfo { uri: self.lip_uri(&name), display_name: name, kind: k, confidence_score: 30, is_exported, + modifiers, + visibility: 
Some(vis), + visibility_confidence: Some(vc as f32 / 100.0), + container_name: container, + signature, + signature_normalized, + extraction_tier: ExtractionTier::Tier1, ..OwnedSymbolInfo::new("", "") }); } @@ -1490,6 +2082,34 @@ impl<'a> SymbolExtractor<'a> { } } + /// Walk up for the enclosing Swift class/protocol/extension container. + fn swift_container(&self, node: &Node) -> Option { + let mut cur = node.parent(); + while let Some(n) = cur { + if matches!( + n.kind(), + "class_declaration" | "protocol_declaration" | "extension_declaration" + ) { + if let Some(name) = n.child_by_field_name("name") { + return Some(self.node_text(&name).trim().to_owned()); + } + } + cur = n.parent(); + } + None + } + + /// Declaration head for a Swift function (text before body block). + fn swift_signature(&self, node: &Node) -> Option { + if node.kind() != "function_declaration" { + return None; + } + let body = node.child_by_field_name("body")?; + let text = + std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + Some(text.trim().to_owned()) + } + fn swift_occurrences(&self, node: Node, out: &mut Vec) { if matches!(node.kind(), "simple_identifier" | "type_identifier") { let name = self.node_text(&node); @@ -1511,13 +2131,7 @@ impl<'a> SymbolExtractor<'a> { Role::Reference } }); - out.push(OwnedOccurrence { - symbol_uri: self.lip_uri(name), - range: Self::node_range(&node), - confidence_score: 20, - role, - override_doc: None, - }); + out.push(self.make_occurrence(&node, name, role)); } } for i in 0..node.child_count() { @@ -1594,6 +2208,27 @@ fn kotlin_first_name_child(node: Node) -> Option { None } +/// Collect every node in `node`'s subtree whose `kind()` matches one of +/// `keywords`. Duplicates are preserved — callers that want unique values +/// must deduplicate. Tree-sitter anonymous nodes use their source text as +/// their kind, so matching on kind is reliable without reading source bytes. 
+fn collect_matching_keywords(node: Node, keywords: &[&str]) -> Vec { + let mut out = Vec::new(); + walk_collect_keywords(node, keywords, &mut out); + out +} + +fn walk_collect_keywords(node: Node, keywords: &[&str], out: &mut Vec) { + if keywords.contains(&node.kind()) { + out.push(node.kind().to_owned()); + } + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + walk_collect_keywords(child, keywords, out); + } + } +} + /// Returns true if any node in the subtree has a kind matching one of the given keywords. /// /// Tree-sitter anonymous nodes (keywords) use their source text as their kind, so @@ -1626,6 +2261,23 @@ fn kotlin_is_exported(node: Node) -> bool { true } +/// Collect Kotlin modifier keywords from the `modifiers` child node. +fn kotlin_modifiers(node: Node) -> Vec { + const KEYWORDS: &[&str] = &[ + "private", "protected", "internal", "public", "abstract", "final", "open", "override", + "suspend", "inline", "external", "data", "sealed", "enum", "companion", "lateinit", + "const", "operator", "infix", "tailrec", + ]; + for i in 0..node.named_child_count() { + if let Some(child) = node.named_child(i) { + if child.kind() == "modifiers" { + return collect_matching_keywords(child, KEYWORDS); + } + } + } + Vec::new() +} + /// A Swift declaration is exported unless it has a `private` or `fileprivate` /// access modifier. fn swift_is_exported(node: Node) -> bool { @@ -1640,3 +2292,34 @@ fn swift_is_exported(node: Node) -> bool { } true } + +/// Collect Swift modifier keywords from any `modifiers`/`modifier` child. 
+fn swift_modifiers(node: Node) -> Vec { + const KEYWORDS: &[&str] = &[ + "private", + "fileprivate", + "internal", + "public", + "open", + "static", + "final", + "override", + "mutating", + "nonmutating", + "class", + "required", + "convenience", + "lazy", + "weak", + "unowned", + ]; + let mut out = Vec::new(); + for i in 0..node.named_child_count() { + if let Some(child) = node.named_child(i) { + if matches!(child.kind(), "modifiers" | "modifier") { + out.extend(collect_matching_keywords(child, KEYWORDS)); + } + } + } + out +} diff --git a/bindings/rust/src/indexer/tier1.rs b/bindings/rust/src/indexer/tier1.rs index bf494ed..510c9ff 100644 --- a/bindings/rust/src/indexer/tier1.rs +++ b/bindings/rust/src/indexer/tier1.rs @@ -233,6 +233,96 @@ mod tests { assert!(s.is_exported); } + // ── Rust: v2.3 structural metadata ──────────────────────────────────────── + + #[test] + fn rust_pub_fn_visibility_public() { + use crate::schema::{ExtractionTier, Visibility}; + let syms = sym("pub fn bar(x: i32) {}", Language::Rust); + let s = find(&syms, "bar"); + assert_eq!(s.visibility, Some(Visibility::Public)); + assert!(s.modifiers.iter().any(|m| m == "pub")); + assert_eq!(s.extraction_tier, ExtractionTier::Tier1); + // Confidence from explicit keyword → 1.0. + assert_eq!(s.visibility_confidence, Some(1.0)); + } + + #[test] + fn rust_pub_crate_fn_visibility_internal() { + use crate::schema::Visibility; + let syms = sym("pub(crate) fn helper() {}", Language::Rust); + let s = find(&syms, "helper"); + assert_eq!(s.visibility, Some(Visibility::Internal)); + assert!(s.modifiers.iter().any(|m| m.starts_with("pub("))); + } + + #[test] + fn rust_private_fn_visibility_private() { + use crate::schema::Visibility; + let syms = sym("fn hidden() {}", Language::Rust); + let s = find(&syms, "hidden"); + assert_eq!(s.visibility, Some(Visibility::Private)); + // No modifier keyword → 0.5 confidence. 
+ assert_eq!(s.visibility_confidence, Some(0.5)); + } + + #[test] + fn rust_async_unsafe_modifiers_collected() { + let src = "pub async unsafe fn io() {}"; + let syms = sym(src, Language::Rust); + let s = find(&syms, "io"); + assert!(s.modifiers.iter().any(|m| m == "pub")); + assert!(s.modifiers.iter().any(|m| m == "async")); + assert!(s.modifiers.iter().any(|m| m == "unsafe")); + } + + #[test] + fn rust_container_name_from_impl() { + let src = "impl Foo { pub fn bar(&self) {} }"; + let syms = sym(src, Language::Rust); + let s = find(&syms, "bar"); + assert_eq!(s.container_name.as_deref(), Some("Foo")); + } + + #[test] + fn rust_container_name_from_trait() { + // Default method has a body and parses as `function_item`, so it is + // extracted. Abstract trait methods (`function_signature_item`) are + // not extracted today — orthogonal gap. + let src = "pub trait Render { fn draw(&self) {} }"; + let syms = sym(src, Language::Rust); + let s = find(&syms, "draw"); + assert_eq!(s.container_name.as_deref(), Some("Render")); + } + + #[test] + fn rust_no_container_at_top_level() { + let syms = sym("pub fn top() {}", Language::Rust); + assert_eq!(find(&syms, "top").container_name, None); + } + + #[test] + fn rust_signature_and_normalized() { + let syms = sym("pub fn add(x: i32, y: i32) -> i32 { x + y }", Language::Rust); + let s = find(&syms, "add"); + assert_eq!( + s.signature.as_deref(), + Some("pub fn add(x: i32, y: i32) -> i32") + ); + assert_eq!( + s.signature_normalized.as_deref(), + Some("pub fn add(_: i32, _: i32) -> i32") + ); + } + + #[test] + fn rust_non_function_has_no_signature() { + let syms = sym("pub struct Point { x: i32 }", Language::Rust); + let s = find(&syms, "Point"); + assert_eq!(s.signature, None); + assert_eq!(s.signature_normalized, None); + } + #[test] fn rust_macro_definition() { let syms = sym("macro_rules! 
vec_of { () => {} }", Language::Rust); @@ -807,4 +897,233 @@ mod tests { fn empty_source_returns_empty_swift() { assert!(sym("", Language::Swift).is_empty()); } + + // ── v2.3 structural metadata: smoke tests per language ─────────────────── + + #[test] + fn ts_method_visibility_and_container() { + use crate::schema::{ExtractionTier, Visibility}; + let src = "class Svc { private handle(x: number): boolean { return true; } }"; + let syms = sym(src, Language::TypeScript); + let s = find(&syms, "handle"); + assert_eq!(s.visibility, Some(Visibility::Private)); + assert_eq!(s.container_name.as_deref(), Some("Svc")); + assert!(s.modifiers.iter().any(|m| m == "private")); + assert_eq!(s.extraction_tier, ExtractionTier::Tier1); + assert_eq!( + s.signature_normalized.as_deref(), + Some("private handle(_: number): boolean") + ); + } + + #[test] + fn ts_exported_function_modifier() { + use crate::schema::Visibility; + let syms = sym("export function send(x: number): void {}", Language::TypeScript); + let s = find(&syms, "send"); + assert!(s.modifiers.iter().any(|m| m == "export")); + assert_eq!(s.visibility, Some(Visibility::Public)); + } + + #[test] + fn py_method_container_and_visibility() { + use crate::schema::{ExtractionTier, Visibility}; + let src = "class C:\n def _private(self, x: int) -> None:\n pass\n"; + let syms = sym(src, Language::Python); + let s = find(&syms, "_private"); + assert_eq!(s.visibility, Some(Visibility::Private)); + assert_eq!(s.container_name.as_deref(), Some("C")); + assert_eq!(s.extraction_tier, ExtractionTier::Tier1); + // `self` has no `:` and is left as-is; only the typed param is normalized. 
+ assert_eq!( + s.signature_normalized.as_deref(), + Some("def _private(self, _: int) -> None") + ); + } + + #[test] + fn go_func_visibility_from_name() { + use crate::schema::Visibility; + let syms = sym("package p\nfunc Exported(x int) bool { return true }", Language::Go); + let s = find(&syms, "Exported"); + assert_eq!(s.visibility, Some(Visibility::Public)); + assert_eq!( + s.signature.as_deref(), + Some("func Exported(x int) bool") + ); + } + + #[test] + fn go_method_receiver_as_container() { + let src = "package p\nfunc (f *Foo) Bar() {}"; + let syms = sym(src, Language::Go); + let s = find(&syms, "Bar"); + assert_eq!(s.container_name.as_deref(), Some("Foo")); + } + + #[test] + fn dart_private_underscore_visibility_top_level() { + // Note: Dart class-body methods (`class_member_definition` → + // `method_signature`) are a pre-existing extractor gap — we only + // match `method_declaration` today. Validate the underscore-private + // convention on a top-level function instead. + use crate::schema::Visibility; + let src = "void _priv(int x) {}"; + let syms = sym(src, Language::Dart); + let s = find(&syms, "_priv"); + assert_eq!(s.visibility, Some(Visibility::Private)); + assert!(!s.is_exported); + } + + #[test] + fn c_static_modifier_and_signature() { + use crate::schema::Visibility; + let syms = sym("static int helper(int n) { return n; }", Language::C); + let s = find(&syms, "helper"); + assert!(s.modifiers.iter().any(|m| m == "static")); + // Signature at minimum covers the visible declarator; normalized form is whitespace-collapsed. 
+ assert!(s.signature.as_deref().unwrap_or("").contains("helper")); + assert_eq!(s.visibility, Some(Visibility::Public)); + } + + #[test] + fn cpp_method_container_in_class() { + use crate::schema::SymbolKind; + let src = "class Svc { public: int run() { return 0; } };"; + let syms = sym(src, Language::Cpp); + let s = find(&syms, "run"); + assert_eq!(s.container_name.as_deref(), Some("Svc")); + assert_eq!(s.kind, SymbolKind::Method); + } + + #[test] + fn kotlin_private_modifier_and_visibility() { + use crate::schema::Visibility; + let src = "class Svc { private fun hidden(x: Int): Boolean = true }"; + let syms = sym(src, Language::Kotlin); + let s = find(&syms, "hidden"); + assert!(s.modifiers.iter().any(|m| m == "private")); + assert_eq!(s.visibility, Some(Visibility::Private)); + assert_eq!(s.container_name.as_deref(), Some("Svc")); + } + + #[test] + fn swift_fileprivate_modifier_and_visibility() { + use crate::schema::Visibility; + let src = "class Svc {\n fileprivate func hidden() {}\n}"; + let syms = sym(src, Language::Swift); + let s = find(&syms, "hidden"); + assert!(s.modifiers.iter().any(|m| m == "fileprivate")); + assert_eq!(s.visibility, Some(Visibility::Private)); + assert_eq!(s.container_name.as_deref(), Some("Svc")); + } + + // ── v2.3 reference classification (Call/Read/Write + is_test) ──────────── + + fn occs_at(uri: &str, source: &str, lang: Language) -> Vec { + Tier1Indexer::new().occurrences_for_source(uri, source, lang) + } + + #[test] + fn ref_kind_call_rust() { + use crate::schema::ReferenceKind; + let occs_list = occs("fn a() { b(); } fn b() {}", Language::Rust); + let call = occs_list + .iter() + .find(|o| o.symbol_uri.contains("#b") && o.role == Role::Reference) + .expect("b() should be a reference"); + assert_eq!(call.kind, ReferenceKind::Call); + } + + #[test] + fn ref_kind_call_typescript_method_property() { + use crate::schema::ReferenceKind; + // `obj.method()` — the property identifier is the callee. 
+ let src = "function demo(obj: any) { obj.method(); }"; + let occs_list = occs(src, Language::TypeScript); + let callee = occs_list + .iter() + .find(|o| o.symbol_uri.contains("#method")); + if let Some(c) = callee { + assert_eq!( + c.kind, + ReferenceKind::Call, + "obj.method() callee must be classified as Call, got {:?}", + c.kind + ); + } + } + + #[test] + fn ref_kind_read_rust_local_variable_use() { + use crate::schema::ReferenceKind; + let src = "fn f(x: i32) -> i32 { x + 1 }"; + let occs_list = occs(src, Language::Rust); + // `x` on the RHS of the expression is a Read. + let read = occs_list + .iter() + .find(|o| o.symbol_uri.contains("#x") && o.role == Role::Reference); + if let Some(r) = read { + assert_eq!(r.kind, ReferenceKind::Read); + } + } + + #[test] + fn ref_kind_write_python_assignment() { + use crate::schema::ReferenceKind; + let src = "x = 1\nx = 2\n"; + let occs_list = occs(src, Language::Python); + // At least one Write occurrence for `x`. + let has_write = occs_list + .iter() + .any(|o| o.symbol_uri.contains("#x") && o.kind == ReferenceKind::Write); + assert!( + has_write, + "expected at least one Write occurrence on `x = ...`; got {:?}", + occs_list + .iter() + .filter(|o| o.symbol_uri.contains("#x")) + .collect::>() + ); + } + + #[test] + fn is_test_file_detects_common_paths() { + let cases = [ + ("file:///proj/tests/foo.rs", true), + ("file:///proj/src/foo_test.go", true), + ("file:///proj/src/foo.test.ts", true), + ("file:///proj/src/foo.spec.js", true), + ("file:///proj/__tests__/foo.ts", true), + ("file:///proj/src/MyServiceTest.java", true), + ("file:///proj/src/lib.rs", false), + ("file:///proj/src/foo.rs", false), + ]; + for (uri, expected) in cases { + let occs_list = occs_at(uri, "fn foo() {}", Language::Rust); + let any = occs_list.first(); + if let Some(o) = any { + assert_eq!( + o.is_test, expected, + "wrong is_test for uri {uri}: got {}", + o.is_test + ); + } + } + } + + #[test] + fn definition_role_leaves_kind_unknown() { + use 
crate::schema::ReferenceKind; + let occs_list = occs("pub fn defined() {}", Language::Rust); + let def = occs_list + .iter() + .find(|o| o.symbol_uri.contains("#defined") && o.role == Role::Definition) + .expect("definition occurrence"); + assert_eq!( + def.kind, + ReferenceKind::Unknown, + "definitions must leave kind as Unknown — ReferenceKind only classifies references" + ); + } } diff --git a/bindings/rust/src/indexer/tier2/clangd.rs b/bindings/rust/src/indexer/tier2/clangd.rs index d0eb42a..798c36f 100644 --- a/bindings/rust/src/indexer/tier2/clangd.rs +++ b/bindings/rust/src/indexer/tier2/clangd.rs @@ -28,8 +28,10 @@ use serde_json::{json, Value}; use tokio::process::{Child, Command}; use tracing::{debug, info}; +use crate::indexer::language::Language; use crate::schema::{OwnedSymbolInfo, SymbolKind}; +use super::enrich::enrich_v23; use super::lsp_client::LspClient; use super::rust_analyzer::VerificationResult; @@ -172,7 +174,7 @@ impl ClangdBackend { }; let mut out = vec![]; - collect_symbols(&items, &mut out); + collect_symbols(&items, None, &mut out); Ok(out) } @@ -242,13 +244,18 @@ impl ClangdBackend { // At Tier 2 we trust clangd's symbol list, so mark all as exported // unless the name starts with an underscore (internal convention). 
let is_exported = !sym.name.starts_with('_'); + let c_lang = if language_id == "c" { + Language::C + } else { + Language::Cpp + }; - symbols.push(OwnedSymbolInfo { + let mut info = OwnedSymbolInfo { uri: sym_uri, display_name: sym.name.clone(), kind: lsp_kind_to_lip(sym.kind), documentation: None, - signature: sig, + signature: sig.clone(), confidence_score: 90, relationships: vec![], runtime_p99_ms: None, @@ -256,7 +263,10 @@ impl ClangdBackend { taint_labels: vec![], blast_radius: 0, is_exported, - }); + ..Default::default() + }; + enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), c_lang); + symbols.push(info); } Ok(VerificationResult { @@ -273,12 +283,13 @@ struct RawSymbol { kind: u64, line: u32, col: u32, + container: Option, } /// Recursively collect symbols from `DocumentSymbol[]` (nested) or /// `SymbolInformation[]` (flat). clangd returns the hierarchical form when /// `hierarchicalDocumentSymbolSupport` is true. -fn collect_symbols(items: &[Value], out: &mut Vec) { +fn collect_symbols(items: &[Value], parent: Option<&str>, out: &mut Vec) { for item in items { let name = item .get("name") @@ -304,15 +315,23 @@ fn collect_symbols(items: &[Value], out: &mut Vec) { .and_then(|v| v.as_u64()) .unwrap_or(0) as u32; + let container = item + .get("containerName") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_owned) + .or_else(|| parent.map(str::to_owned)); + out.push(RawSymbol { - name, + name: name.clone(), kind, line, col, + container, }); if let Some(Value::Array(children)) = item.get("children") { - collect_symbols(children, out); + collect_symbols(children, Some(&name), out); } } } diff --git a/bindings/rust/src/indexer/tier2/dart_ls.rs b/bindings/rust/src/indexer/tier2/dart_ls.rs index 5ef1682..de96aec 100644 --- a/bindings/rust/src/indexer/tier2/dart_ls.rs +++ b/bindings/rust/src/indexer/tier2/dart_ls.rs @@ -25,8 +25,10 @@ use serde_json::{json, Value}; use tokio::process::{Child, Command}; use tracing::{debug, info}; 
+use crate::indexer::language::Language; use crate::schema::{OwnedRelationship, OwnedSymbolInfo, SymbolKind}; +use super::enrich::enrich_v23; use super::lsp_client::LspClient; use super::rust_analyzer::{file_uri_to_lip_uri, VerificationResult}; @@ -162,7 +164,7 @@ impl DartBackend { }; let mut out = vec![]; - collect_symbols(&items, &mut out); + collect_symbols(&items, None, &mut out); Ok(out) } @@ -276,12 +278,12 @@ impl DartBackend { // Dart convention: names starting with _ are library-private. let is_exported = !sym.name.starts_with('_'); - symbols.push(OwnedSymbolInfo { + let mut info = OwnedSymbolInfo { uri: sym_uri, display_name: sym.name.clone(), kind: lsp_kind_to_lip(sym.kind), documentation: None, - signature: sig, + signature: sig.clone(), confidence_score: 90, relationships: type_rel.into_iter().collect(), runtime_p99_ms: None, @@ -289,7 +291,10 @@ impl DartBackend { taint_labels: vec![], blast_radius: 0, is_exported, - }); + ..Default::default() + }; + enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Dart); + symbols.push(info); } Ok(VerificationResult { @@ -306,12 +311,13 @@ struct RawSymbol { kind: u64, line: u32, col: u32, + container: Option, } /// Recursively collect symbols from `DocumentSymbol[]` (nested) or /// `SymbolInformation[]` (flat). The Dart analysis server returns the /// hierarchical form when `hierarchicalDocumentSymbolSupport` is true. 
-fn collect_symbols(items: &[Value], out: &mut Vec) { +fn collect_symbols(items: &[Value], parent: Option<&str>, out: &mut Vec) { for item in items { let name = item .get("name") @@ -338,16 +344,24 @@ fn collect_symbols(items: &[Value], out: &mut Vec) { .and_then(|v| v.as_u64()) .unwrap_or(0) as u32; + let container = item + .get("containerName") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_owned) + .or_else(|| parent.map(str::to_owned)); + out.push(RawSymbol { - name, + name: name.clone(), kind, line, col, + container, }); // Recurse into nested children (classes contain methods, etc.). if let Some(Value::Array(children)) = item.get("children") { - collect_symbols(children, out); + collect_symbols(children, Some(&name), out); } } } diff --git a/bindings/rust/src/indexer/tier2/enrich.rs b/bindings/rust/src/indexer/tier2/enrich.rs new file mode 100644 index 0000000..9ce29fb --- /dev/null +++ b/bindings/rust/src/indexer/tier2/enrich.rs @@ -0,0 +1,238 @@ +//! Shared v2.3 structural-metadata enrichment for Tier-2 LSP backends. +//! +//! Every backend calls [`enrich_v23`] once per symbol before pushing it into the +//! result set. The helper: +//! +//! 1. Tags the record with `extraction_tier = Tier2`. +//! 2. Extracts modifier keywords from the LSP-provided signature prefix. +//! 3. Resolves canonical `visibility` + `visibility_confidence` via +//! [`crate::schema::visibility::infer`] — the same oracle Tier 1 uses. +//! 4. Records `container_name` from LSP `SymbolInformation.containerName`. +//! 5. Normalises the signature via [`crate::schema::normalize_signature`]. +//! +//! `modifiers_source` stays `None` — that field is reserved for SCIP imports +//! (spec §v2.3 C.5). Tier-2 native paths are self-evidently LSP-verified. + +use crate::indexer::language::Language; +use crate::schema::{normalize_signature, visibility, ExtractionTier, OwnedSymbolInfo}; + +/// Populate the v2.3 structural-metadata fields on `sym` in place. 
+/// +/// `signature` is the raw hover text (may be `None` when the server returned no +/// hover). `container` is the LSP `containerName` for the symbol, if any. +pub fn enrich_v23( + sym: &mut OwnedSymbolInfo, + signature: Option<&str>, + container: Option, + lang: Language, +) { + sym.extraction_tier = ExtractionTier::Tier2; + + let modifiers = signature + .map(|s| extract_modifiers(s, lang)) + .unwrap_or_default(); + + let (vis, confidence) = visibility::infer(&sym.display_name, &modifiers, lang); + sym.visibility = Some(vis); + sym.visibility_confidence = Some(confidence as f32 / 100.0); + sym.modifiers = modifiers; + + if let Some(name) = container.filter(|n| !n.is_empty()) { + sym.container_name = Some(name); + } + + if let Some(sig) = signature { + sym.signature_normalized = Some(normalize_signature(sig, lang)); + } +} + +/// Extract modifier keywords appearing at the start of a hover signature. +/// +/// Scans leading whitespace-separated tokens that match the language's known +/// modifier vocabulary and stops at the first token that is not a modifier. +/// Order is preserved. Rust `pub(crate)` / `pub(super)` style visibility tokens +/// are emitted verbatim so [`visibility::infer`] can recognise them. +pub fn extract_modifiers(signature: &str, lang: Language) -> Vec { + let keywords = modifier_keywords(lang); + let mut out = Vec::new(); + let mut rest = signature.trim_start(); + + loop { + // Rust `pub(...)` — consume as one token even though it has parens. 
+ if matches!(lang, Language::Rust) && rest.starts_with("pub(") { + if let Some(close) = rest.find(')') { + out.push(rest[..=close].to_owned()); + rest = rest[close + 1..].trim_start(); + continue; + } + } + + let end = rest + .find(|c: char| c.is_whitespace() || c == '(' || c == '<' || c == ':') + .unwrap_or(rest.len()); + if end == 0 { + break; + } + let tok = &rest[..end]; + if keywords.contains(&tok) { + out.push(tok.to_owned()); + rest = rest[end..].trim_start(); + } else { + break; + } + } + out +} + +fn modifier_keywords(lang: Language) -> &'static [&'static str] { + match lang { + Language::Rust => &[ + "pub", "const", "async", "unsafe", "extern", "static", "mut", "default", "move", + ], + Language::TypeScript + | Language::JavaScript + | Language::JavaScriptReact => &[ + "export", "default", "async", "static", "readonly", "public", "private", "protected", + "abstract", "declare", "override", "const", "let", "var", + ], + Language::Python => &["async", "def"], + Language::Dart => &[ + "static", "abstract", "final", "const", "external", "factory", "late", "covariant", + "async", + ], + Language::Go => &["func"], + Language::Kotlin => &[ + "private", "protected", "internal", "public", "abstract", "final", "open", "override", + "suspend", "inline", "external", "data", "sealed", "enum", "companion", "lateinit", + "const", "operator", "infix", "tailrec", + ], + Language::Swift => &[ + "private", "fileprivate", "internal", "public", "open", "static", "final", "override", + "mutating", "nonmutating", "class", "required", "convenience", "lazy", "weak", + "unowned", "dynamic", + ], + Language::C | Language::Cpp => &[ + "static", "extern", "const", "virtual", "override", "explicit", "inline", "constexpr", + "private", "protected", "public", "friend", "mutable", "volatile", "register", + "typedef", + ], + Language::Unknown => &[], + } +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + 
use crate::schema::{SymbolKind, Visibility}; + + fn sym(name: &str) -> OwnedSymbolInfo { + let mut s = OwnedSymbolInfo::new("lip://local/f#x", name); + s.kind = SymbolKind::Function; + s + } + + #[test] + fn rust_extracts_pub_async() { + let mods = extract_modifiers("pub async fn foo(x: i32) -> Bar", Language::Rust); + assert_eq!(mods, vec!["pub", "async"]); + } + + #[test] + fn rust_extracts_pub_crate_verbatim() { + let mods = extract_modifiers("pub(crate) fn foo()", Language::Rust); + assert_eq!(mods, vec!["pub(crate)"]); + } + + #[test] + fn rust_extracts_pub_in_path() { + let mods = extract_modifiers("pub(in crate::x) fn foo()", Language::Rust); + assert_eq!(mods, vec!["pub(in crate::x)"]); + } + + #[test] + fn rust_no_modifiers() { + assert!(extract_modifiers("fn foo()", Language::Rust).is_empty()); + } + + #[test] + fn ts_export_async() { + let mods = extract_modifiers("export async function foo(): Promise", Language::TypeScript); + assert_eq!(mods, vec!["export", "async"]); + } + + #[test] + fn kotlin_private_suspend() { + let mods = extract_modifiers("private suspend fun foo(): Int", Language::Kotlin); + assert_eq!(mods, vec!["private", "suspend"]); + } + + #[test] + fn swift_public_final() { + let mods = extract_modifiers("public final func foo() -> Int", Language::Swift); + assert_eq!(mods, vec!["public", "final"]); + } + + #[test] + fn cpp_static_inline() { + let mods = extract_modifiers("static inline int foo()", Language::Cpp); + assert_eq!(mods, vec!["static", "inline"]); + } + + #[test] + fn enrich_sets_tier_and_visibility() { + let mut s = sym("foo"); + enrich_v23(&mut s, Some("pub fn foo() -> i32"), None, Language::Rust); + assert_eq!(s.extraction_tier, ExtractionTier::Tier2); + assert_eq!(s.visibility, Some(Visibility::Public)); + assert_eq!(s.visibility_confidence, Some(1.0)); + assert_eq!(s.modifiers, vec!["pub".to_owned()]); + assert_eq!(s.signature_normalized.as_deref(), Some("pub fn foo() -> i32")); + } + + #[test] + fn 
enrich_without_signature_still_runs_inference() { + // No hover → no modifiers, but visibility still inferred from name (Go rule). + let mut s = sym("Foo"); + enrich_v23(&mut s, None, None, Language::Go); + assert_eq!(s.extraction_tier, ExtractionTier::Tier2); + assert_eq!(s.visibility, Some(Visibility::Public)); + assert_eq!(s.signature_normalized, None); + assert!(s.modifiers.is_empty()); + } + + #[test] + fn enrich_records_container() { + let mut s = sym("method"); + enrich_v23( + &mut s, + Some("public int run()"), + Some("Svc".to_owned()), + Language::Cpp, + ); + assert_eq!(s.container_name.as_deref(), Some("Svc")); + } + + #[test] + fn enrich_skips_empty_container() { + let mut s = sym("foo"); + enrich_v23(&mut s, Some("fn foo()"), Some(String::new()), Language::Rust); + assert!(s.container_name.is_none()); + } + + #[test] + fn enrich_python_name_convention() { + let mut s = sym("_helper"); + enrich_v23(&mut s, Some("def _helper() -> None"), None, Language::Python); + assert_eq!(s.visibility, Some(Visibility::Private)); + } + + #[test] + fn enrich_ts_no_modifier_is_low_conf_internal() { + let mut s = sym("foo"); + enrich_v23(&mut s, Some("function foo(): void"), None, Language::TypeScript); + assert_eq!(s.visibility, Some(Visibility::Internal)); + assert_eq!(s.visibility_confidence, Some(0.5)); + } +} diff --git a/bindings/rust/src/indexer/tier2/gopls.rs b/bindings/rust/src/indexer/tier2/gopls.rs index 7555d81..956f26f 100644 --- a/bindings/rust/src/indexer/tier2/gopls.rs +++ b/bindings/rust/src/indexer/tier2/gopls.rs @@ -25,8 +25,10 @@ use serde_json::{json, Value}; use tokio::process::{Child, Command}; use tracing::{debug, info}; +use crate::indexer::language::Language; use crate::schema::{OwnedSymbolInfo, SymbolKind}; +use super::enrich::enrich_v23; use super::lsp_client::LspClient; use super::rust_analyzer::VerificationResult; @@ -160,7 +162,7 @@ impl GoplsBackend { }; let mut out = vec![]; - collect_symbols(&items, &mut out); + collect_symbols(&items, 
None, &mut out); Ok(out) } @@ -229,12 +231,12 @@ impl GoplsBackend { .map(|c| c.is_uppercase()) .unwrap_or(false); - symbols.push(OwnedSymbolInfo { + let mut info = OwnedSymbolInfo { uri: sym_uri, display_name: sym.name.clone(), kind: lsp_kind_to_lip(sym.kind), documentation: None, - signature: sig, + signature: sig.clone(), confidence_score: 90, relationships: vec![], runtime_p99_ms: None, @@ -242,7 +244,10 @@ impl GoplsBackend { taint_labels: vec![], blast_radius: 0, is_exported, - }); + ..Default::default() + }; + enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Go); + symbols.push(info); } Ok(VerificationResult { @@ -259,9 +264,10 @@ struct RawSymbol { kind: u64, line: u32, col: u32, + container: Option, } -fn collect_symbols(items: &[Value], out: &mut Vec) { +fn collect_symbols(items: &[Value], parent: Option<&str>, out: &mut Vec) { for item in items { let name = item .get("name") @@ -287,15 +293,23 @@ fn collect_symbols(items: &[Value], out: &mut Vec) { .and_then(|v| v.as_u64()) .unwrap_or(0) as u32; + let container = item + .get("containerName") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_owned) + .or_else(|| parent.map(str::to_owned)); + out.push(RawSymbol { - name, + name: name.clone(), kind, line, col, + container, }); if let Some(Value::Array(children)) = item.get("children") { - collect_symbols(children, out); + collect_symbols(children, Some(&name), out); } } } diff --git a/bindings/rust/src/indexer/tier2/kotlin.rs b/bindings/rust/src/indexer/tier2/kotlin.rs index 8b488df..8575f0e 100644 --- a/bindings/rust/src/indexer/tier2/kotlin.rs +++ b/bindings/rust/src/indexer/tier2/kotlin.rs @@ -26,8 +26,10 @@ use serde_json::{json, Value}; use tokio::process::{Child, Command}; use tracing::{debug, info}; +use crate::indexer::language::Language; use crate::schema::{OwnedSymbolInfo, SymbolKind}; +use super::enrich::enrich_v23; use super::lsp_client::LspClient; use super::rust_analyzer::VerificationResult; @@ 
-163,7 +165,7 @@ impl KotlinBackend { }; let mut out = vec![]; - collect_symbols(&items, &mut out); + collect_symbols(&items, None, &mut out); Ok(out) } @@ -233,12 +235,12 @@ impl KotlinBackend { .map(|c| c.is_uppercase()) .unwrap_or(false); - symbols.push(OwnedSymbolInfo { + let mut info = OwnedSymbolInfo { uri: sym_uri, display_name: sym.name.clone(), kind: lsp_kind_to_lip(sym.kind), documentation: None, - signature: sig, + signature: sig.clone(), confidence_score: 90, relationships: vec![], runtime_p99_ms: None, @@ -246,7 +248,10 @@ impl KotlinBackend { taint_labels: vec![], blast_radius: 0, is_exported, - }); + ..Default::default() + }; + enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Kotlin); + symbols.push(info); } Ok(VerificationResult { @@ -263,9 +268,10 @@ struct RawSymbol { kind: u64, line: u32, col: u32, + container: Option, } -fn collect_symbols(items: &[Value], out: &mut Vec) { +fn collect_symbols(items: &[Value], parent: Option<&str>, out: &mut Vec) { for item in items { let name = item .get("name") @@ -291,15 +297,23 @@ fn collect_symbols(items: &[Value], out: &mut Vec) { .and_then(|v| v.as_u64()) .unwrap_or(0) as u32; + let container = item + .get("containerName") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_owned) + .or_else(|| parent.map(str::to_owned)); + out.push(RawSymbol { - name, + name: name.clone(), kind, line, col, + container, }); if let Some(Value::Array(children)) = item.get("children") { - collect_symbols(children, out); + collect_symbols(children, Some(&name), out); } } } diff --git a/bindings/rust/src/indexer/tier2/mod.rs b/bindings/rust/src/indexer/tier2/mod.rs index 31bf531..7303fe9 100644 --- a/bindings/rust/src/indexer/tier2/mod.rs +++ b/bindings/rust/src/indexer/tier2/mod.rs @@ -27,6 +27,7 @@ pub mod clangd; pub mod dart_ls; +mod enrich; pub mod gopls; pub mod kotlin; pub mod lsp_client; diff --git a/bindings/rust/src/indexer/tier2/py_ls.rs 
b/bindings/rust/src/indexer/tier2/py_ls.rs index 50402ee..a2b5232 100644 --- a/bindings/rust/src/indexer/tier2/py_ls.rs +++ b/bindings/rust/src/indexer/tier2/py_ls.rs @@ -22,8 +22,10 @@ use serde_json::{json, Value}; use tokio::process::{Child, Command}; use tracing::{debug, info, warn}; +use crate::indexer::language::Language; use crate::schema::{OwnedRelationship, OwnedSymbolInfo, SymbolKind}; +use super::enrich::enrich_v23; use super::lsp_client::LspClient; use super::rust_analyzer::{file_uri_to_lip_uri, VerificationResult}; @@ -181,11 +183,18 @@ impl PythonBackend { .and_then(|v| v.as_u64()) .unwrap_or(0) as u32; + let container = item + .get("containerName") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_owned); + out.push(RawSymbol { name, kind, line, col, + container, }); } Ok(out) @@ -299,12 +308,12 @@ impl PythonBackend { // Python convention: names starting with _ are private. let is_exported = !sym.name.starts_with('_'); - symbols.push(OwnedSymbolInfo { + let mut info = OwnedSymbolInfo { uri: sym_uri, display_name: sym.name.clone(), kind: lsp_kind_to_lip(sym.kind), documentation: None, - signature: sig, + signature: sig.clone(), confidence_score: 90, relationships: type_rel.into_iter().collect(), runtime_p99_ms: None, @@ -312,7 +321,10 @@ impl PythonBackend { taint_labels: vec![], blast_radius: 0, is_exported, - }); + ..Default::default() + }; + enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Python); + symbols.push(info); } Ok(VerificationResult { @@ -329,6 +341,7 @@ struct RawSymbol { kind: u64, line: u32, col: u32, + container: Option, } /// Try to spawn `pyright-langserver --stdio`; if not found, try `pylsp`. 
diff --git a/bindings/rust/src/indexer/tier2/rust_analyzer.rs b/bindings/rust/src/indexer/tier2/rust_analyzer.rs index 7984a47..7a7a606 100644 --- a/bindings/rust/src/indexer/tier2/rust_analyzer.rs +++ b/bindings/rust/src/indexer/tier2/rust_analyzer.rs @@ -23,8 +23,10 @@ use serde_json::{json, Value}; use tokio::process::{Child, Command}; use tracing::{debug, info}; +use crate::indexer::language::Language; use crate::schema::{OwnedRelationship, OwnedSymbolInfo, SymbolKind}; +use super::enrich::enrich_v23; use super::lsp_client::LspClient; // ─── Public types ───────────────────────────────────────────────────────────── @@ -199,11 +201,18 @@ impl RustAnalyzerBackend { .and_then(|v| v.as_u64()) .unwrap_or(0) as u32; + let container = item + .get("containerName") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_owned); + out.push(RawSymbol { name, kind, line, col, + container, }); } Ok(out) @@ -389,12 +398,12 @@ impl RustAnalyzerBackend { .as_deref() .map(|s| s.starts_with("pub")) .unwrap_or(false); - symbols.push(OwnedSymbolInfo { + let mut info = OwnedSymbolInfo { uri: sym_uri, display_name: sym.name.clone(), kind: lsp_kind_to_lip(sym.kind), documentation: None, - signature: sig, + signature: sig.clone(), confidence_score: 90, relationships: type_rel.into_iter().collect(), runtime_p99_ms: None, @@ -402,7 +411,10 @@ impl RustAnalyzerBackend { taint_labels: vec![], blast_radius: 0, is_exported, - }); + ..Default::default() + }; + enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Rust); + symbols.push(info); } // Collect local variable types from inlay hints — these are bindings @@ -440,12 +452,13 @@ impl RustAnalyzerBackend { }; // @line:col suffix makes the URI unique for same-name locals. 
let sym_uri = format!("lip://local/{path}#{name}@{hint_line}:{hint_col}"); - symbols.push(OwnedSymbolInfo { + let local_sig = format!("{name}: {label}"); + let mut info = OwnedSymbolInfo { uri: sym_uri, display_name: name.to_owned(), kind: SymbolKind::Variable, documentation: None, - signature: Some(format!("{name}: {label}")), + signature: Some(local_sig.clone()), confidence_score: 90, relationships: vec![], runtime_p99_ms: None, @@ -453,7 +466,10 @@ impl RustAnalyzerBackend { taint_labels: vec![], blast_radius: 0, is_exported: false, - }); + ..Default::default() + }; + enrich_v23(&mut info, Some(&local_sig), None, Language::Rust); + symbols.push(info); } Ok(VerificationResult { @@ -470,6 +486,7 @@ struct RawSymbol { kind: u64, line: u32, col: u32, + container: Option, } /// Convert a `file://` URI to a LIP `lip://local/` URI. diff --git a/bindings/rust/src/indexer/tier2/swift_ls.rs b/bindings/rust/src/indexer/tier2/swift_ls.rs index 15f76ff..f43087a 100644 --- a/bindings/rust/src/indexer/tier2/swift_ls.rs +++ b/bindings/rust/src/indexer/tier2/swift_ls.rs @@ -30,8 +30,10 @@ use serde_json::{json, Value}; use tokio::process::{Child, Command}; use tracing::{debug, info}; +use crate::indexer::language::Language; use crate::schema::{OwnedSymbolInfo, SymbolKind}; +use super::enrich::enrich_v23; use super::lsp_client::LspClient; use super::rust_analyzer::VerificationResult; @@ -166,7 +168,7 @@ impl SwiftBackend { }; let mut out = vec![]; - collect_symbols(&items, &mut out); + collect_symbols(&items, None, &mut out); Ok(out) } @@ -232,12 +234,12 @@ impl SwiftBackend { .map(|s| !s.starts_with("private") && !s.starts_with("fileprivate")) .unwrap_or(true); - symbols.push(OwnedSymbolInfo { + let mut info = OwnedSymbolInfo { uri: sym_uri, display_name: sym.name.clone(), kind: lsp_kind_to_lip(sym.kind), documentation: None, - signature: sig, + signature: sig.clone(), confidence_score: 90, relationships: vec![], runtime_p99_ms: None, @@ -245,7 +247,10 @@ impl SwiftBackend { 
taint_labels: vec![], blast_radius: 0, is_exported, - }); + ..Default::default() + }; + enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Swift); + symbols.push(info); } Ok(VerificationResult { @@ -262,9 +267,10 @@ struct RawSymbol { kind: u64, line: u32, col: u32, + container: Option, } -fn collect_symbols(items: &[Value], out: &mut Vec) { +fn collect_symbols(items: &[Value], parent: Option<&str>, out: &mut Vec) { for item in items { let name = item .get("name") @@ -290,15 +296,23 @@ fn collect_symbols(items: &[Value], out: &mut Vec) { .and_then(|v| v.as_u64()) .unwrap_or(0) as u32; + let container = item + .get("containerName") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_owned) + .or_else(|| parent.map(str::to_owned)); + out.push(RawSymbol { - name, + name: name.clone(), kind, line, col, + container, }); if let Some(Value::Array(children)) = item.get("children") { - collect_symbols(children, out); + collect_symbols(children, Some(&name), out); } } } diff --git a/bindings/rust/src/indexer/tier2/ts_server.rs b/bindings/rust/src/indexer/tier2/ts_server.rs index 5056da0..b99b9bb 100644 --- a/bindings/rust/src/indexer/tier2/ts_server.rs +++ b/bindings/rust/src/indexer/tier2/ts_server.rs @@ -21,8 +21,10 @@ use serde_json::{json, Value}; use tokio::process::{Child, Command}; use tracing::{debug, info}; +use crate::indexer::language::Language; use crate::schema::{OwnedRelationship, OwnedSymbolInfo, SymbolKind}; +use super::enrich::enrich_v23; use super::lsp_client::LspClient; use super::rust_analyzer::{file_uri_to_lip_uri, VerificationResult}; @@ -161,7 +163,7 @@ impl TypeScriptBackend { }; let mut out = vec![]; - collect_symbols(&items, &mut out); + collect_symbols(&items, None, &mut out); Ok(out) } @@ -287,12 +289,19 @@ impl TypeScriptBackend { .as_deref() .map(|s| s.starts_with("export")) .unwrap_or(false); - symbols.push(OwnedSymbolInfo { + let ts_lang = if uri.ends_with(".js") || uri.ends_with(".mjs") || 
uri.ends_with(".cjs") { + Language::JavaScript + } else if uri.ends_with(".jsx") { + Language::JavaScriptReact + } else { + Language::TypeScript + }; + let mut info = OwnedSymbolInfo { uri: sym_uri, display_name: sym.name.clone(), kind: lsp_kind_to_lip(sym.kind), documentation: None, - signature: sig, + signature: sig.clone(), confidence_score: 90, relationships: type_rel.into_iter().collect(), runtime_p99_ms: None, @@ -300,7 +309,10 @@ impl TypeScriptBackend { taint_labels: vec![], blast_radius: 0, is_exported, - }); + ..Default::default() + }; + enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), ts_lang); + symbols.push(info); } Ok(VerificationResult { @@ -317,11 +329,12 @@ struct RawSymbol { kind: u64, line: u32, col: u32, + container: Option, } /// Recursively collect symbols from either `DocumentSymbol[]` (nested, with /// `selectionRange`) or `SymbolInformation[]` (flat, with `location.range`). -fn collect_symbols(items: &[Value], out: &mut Vec) { +fn collect_symbols(items: &[Value], parent: Option<&str>, out: &mut Vec) { for item in items { let name = item .get("name") @@ -349,16 +362,27 @@ fn collect_symbols(items: &[Value], out: &mut Vec) { .and_then(|v| v.as_u64()) .unwrap_or(0) as u32; + // SymbolInformation has `containerName`; for nested DocumentSymbol + // trees we propagate the parent's name so children see the enclosing + // class / interface / namespace. + let container = item + .get("containerName") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_owned) + .or_else(|| parent.map(str::to_owned)); + out.push(RawSymbol { - name, + name: name.clone(), kind, line, col, + container, }); // Recurse into nested DocumentSymbol children. 
if let Some(Value::Array(children)) = item.get("children") { - collect_symbols(children, out); + collect_symbols(children, Some(&name), out); } } } diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index e2fc38f..34d13f2 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -144,6 +144,11 @@ pub struct LipDatabase { /// CPG reverse call graph: callee_uri → [caller_uris]. /// Populated eagerly in `upsert_file`; used by `blast_radius_for`. callee_to_callers: HashMap>, + /// CPG forward call graph: caller_uri → [callee_uris] (v2.3 Feature #4). + /// Symmetric mirror of `callee_to_callers`; populated from the same edges + /// during upsert, cleared during `remove_file_call_edges`. Used by + /// `QueryOutgoingCalls` to answer "what does this symbol call?". + caller_to_callees: HashMap>, /// Per-file call edge index for cleanup on re-upsert or remove. file_call_edges: HashMap>, /// Global name index: display_name → [definition symbol_uris]. @@ -197,6 +202,7 @@ impl LipDatabase { workspace_root: None, annotations: HashMap::new(), callee_to_callers: HashMap::new(), + caller_to_callees: HashMap::new(), file_call_edges: HashMap::new(), name_to_symbols: HashMap::new(), callee_name_to_callers: HashMap::new(), @@ -447,6 +453,10 @@ impl LipDatabase { .entry(edge.to_uri.clone()) .or_default() .push(edge.from_uri.clone()); + self.caller_to_callees + .entry(edge.from_uri.clone()) + .or_default() + .push(edge.to_uri.clone()); // Name-based index: enables cross-file resolution in blast_radius_for. 
let callee_name = extract_name(&edge.to_uri).to_owned(); if !callee_name.is_empty() { @@ -591,6 +601,10 @@ impl LipDatabase { .entry(edge.to_uri.clone()) .or_default() .push(edge.from_uri.clone()); + self.caller_to_callees + .entry(edge.from_uri.clone()) + .or_default() + .push(edge.to_uri.clone()); let callee_name = extract_name(&edge.to_uri).to_owned(); if !callee_name.is_empty() { self.callee_name_to_callers @@ -646,6 +660,12 @@ impl LipDatabase { if let Some(callers) = self.callee_to_callers.get_mut(&to) { callers.retain(|c| *c != from); } + if let Some(callees) = self.caller_to_callees.get_mut(&from) { + callees.retain(|c| *c != to); + if callees.is_empty() { + self.caller_to_callees.remove(&from); + } + } let callee_name = extract_name(&to); if let Some(callers) = self.callee_name_to_callers.get_mut(callee_name) { callers.retain(|c| *c != from); @@ -1292,6 +1312,134 @@ impl LipDatabase { (results, not_indexed_uris) } + /// Symbol-scoped blast radius with optional semantic enrichment (v2.3). + /// + /// Returns `None` when the symbol has no known defining file (either the + /// URI doesn't resolve or the file isn't indexed). The semantic-enrichment + /// path mirrors [`blast_radius_batch`]: per-symbol embeddings preferred, + /// file-level fallback. When `min_score` is `None`, enrichment is skipped. 
+ pub fn blast_radius_for_symbol( + &mut self, + symbol_uri: &str, + min_score: Option, + ) -> Option { + let file_uri = self.def_index.get(symbol_uri).map(|(f, _)| f.clone())?; + if !self.file_inputs.contains_key(file_uri.as_str()) { + return None; + } + let threshold = min_score.unwrap_or(0.6); + let static_result = self.blast_radius_for(symbol_uri); + + let mut semantic_items = Vec::new(); + if min_score.is_some() { + let static_files: HashSet = + static_result.affected_files.iter().cloned().collect(); + + if let Some(sym_embedding) = self.symbol_embeddings.get(symbol_uri).cloned() { + let neighbours = + self.nearest_symbol_by_vector(&sym_embedding, 20, Some(symbol_uri), None); + for n in neighbours { + if n.score < threshold { + continue; + } + let hit_file = self + .def_index + .get(&n.uri) + .map(|(f, _)| f.clone()) + .unwrap_or_else(|| n.uri.clone()); + let source = if static_files.contains(&hit_file) { + ImpactSource::Both + } else { + ImpactSource::Semantic + }; + semantic_items.push(SemanticImpactItem { + file_uri: hit_file, + symbol_uri: n.uri, + similarity: n.score, + source, + }); + } + } else if let Some(file_embedding) = self.file_embeddings.get(&file_uri).cloned() { + let neighbours = self.nearest_by_vector( + &file_embedding, + 20, + Some(&file_uri), + None, + Some(threshold), + ); + for neighbour in neighbours { + let source = if static_files.contains(&neighbour.uri) { + ImpactSource::Both + } else { + ImpactSource::Semantic + }; + semantic_items.push(SemanticImpactItem { + file_uri: neighbour.uri, + symbol_uri: String::new(), + similarity: neighbour.score, + source, + }); + } + } + } + + Some(EnrichedBlastRadius { + file_uri, + static_result, + semantic_items, + }) + } + + /// Forward-call BFS starting at `symbol_uri` (v2.3 Feature #4). + /// + /// Walks `caller_to_callees` up to `depth` hops. Returns a flat + /// `(caller, callee)` edge list and a `truncated` flag that is `true` + /// when the node cap was hit. 
+ pub fn outgoing_calls( + &self, + symbol_uri: &str, + depth: u32, + ) -> (Vec<(String, String)>, bool) { + const NODE_LIMIT: usize = 200; + let depth = depth.clamp(1, 8); + + let mut edges: Vec<(String, String)> = Vec::new(); + let mut seen_edges: HashSet<(String, String)> = HashSet::new(); + let mut visited: HashSet = HashSet::new(); + visited.insert(symbol_uri.to_owned()); + + let mut frontier: Vec = vec![symbol_uri.to_owned()]; + let mut truncated = false; + + for _ in 0..depth { + let mut next: Vec = Vec::new(); + for caller in &frontier { + let Some(callees) = self.caller_to_callees.get(caller) else { + continue; + }; + for callee in callees { + let edge = (caller.clone(), callee.clone()); + if seen_edges.insert(edge.clone()) { + if edges.len() >= NODE_LIMIT { + truncated = true; + return (edges, truncated); + } + edges.push(edge); + } + if visited.insert(callee.clone()) { + next.push(callee.clone()); + } + } + } + if next.is_empty() { + break; + } + frontier = next; + } + + (edges, truncated) + } + /// Find the symbol URI whose occurrence range contains `(line, col)` in `uri`. /// /// Returns `None` if no occurrence covers the given position. @@ -2163,30 +2311,131 @@ impl LipDatabase { /// Symbol search across all tracked files and mounted slices. pub fn workspace_symbols(&mut self, query: &str, limit: usize) -> Vec { + let (symbols, _) = self.workspace_symbols_ranked(query, limit, None, None, None); + symbols + } + + /// Filtered + ranked workspace symbol search (v2.3 Feature #5). + /// + /// - `kind_filter`: if `Some`, only symbols whose kind is in the slice pass. + /// - `scope`: if `Some`, only symbols whose def-file URI starts with the prefix. + /// - `modifier_filter`: if `Some`, symbols must carry at least one modifier + /// from the slice. + /// - Ranking tiers: `Exact` (case-sensitive equality) = 1.0; + /// case-insensitive prefix = 0.8; case-insensitive substring = 0.5. 
+ /// An empty query is treated as "match all" with score 0.2 (no ranking + /// intent) and produces an empty `ranked` list. + /// + /// The two returned vecs are parallel: `ranked[i]` describes `symbols[i]`. + /// When `query` is empty, `ranked` is empty (pre-v2.3 callers' behavior). + pub fn workspace_symbols_ranked( + &mut self, + query: &str, + limit: usize, + kind_filter: Option<&[crate::schema::SymbolKind]>, + scope: Option<&str>, + modifier_filter: Option<&[String]>, + ) -> (Vec, Vec) { + use crate::query_graph::types::{MatchType, RankedSymbol}; + + let q_lower = query.to_lowercase(); + let has_query = !query.is_empty(); + + let passes_filters = |sym: &OwnedSymbolInfo, def_file: Option<&str>| -> bool { + if let Some(kinds) = kind_filter { + if !kinds.contains(&sym.kind) { + return false; + } + } + if let Some(prefix) = scope { + let file = def_file.unwrap_or(""); + if !file.starts_with(prefix) { + return false; + } + } + if let Some(mods) = modifier_filter { + if !mods.iter().any(|m| sym.modifiers.iter().any(|sm| sm == m)) { + return false; + } + } + true + }; + + let classify = |name: &str| -> Option<(f32, MatchType)> { + if !has_query { + return Some((0.2, MatchType::Fuzzy)); + } + if name == query { + Some((1.0, MatchType::Exact)) + } else if name.to_lowercase().starts_with(&q_lower) { + Some((0.8, MatchType::Prefix)) + } else if name.to_lowercase().contains(&q_lower) { + Some((0.5, MatchType::Fuzzy)) + } else { + None + } + }; + + #[derive(Clone)] + struct Hit { + sym: OwnedSymbolInfo, + score: f32, + match_type: MatchType, + } + let uris: Vec = self.file_inputs.keys().cloned().collect(); - let q = query.to_lowercase(); - let mut matches = vec![]; - 'outer: for uri in &uris { + let mut hits: Vec = Vec::new(); + for uri in &uris { for sym in self.file_symbols(uri).iter() { - if sym.display_name.to_lowercase().contains(&q) { - matches.push(sym.clone()); - if matches.len() >= limit { - break 'outer; - } + if !passes_filters(sym, Some(uri.as_str())) { + 
continue; + } + if let Some((score, match_type)) = classify(&sym.display_name) { + hits.push(Hit { + sym: sym.clone(), + score, + match_type, + }); } } } - if matches.len() < limit { - for sym in self.mounted_symbols.values() { - if sym.display_name.to_lowercase().contains(&q) { - matches.push(sym.clone()); - if matches.len() >= limit { - break; - } - } + for sym in self.mounted_symbols.values() { + let def_file = self.def_index.get(&sym.uri).map(|(f, _)| f.as_str()); + if !passes_filters(sym, def_file) { + continue; + } + if let Some((score, match_type)) = classify(&sym.display_name) { + hits.push(Hit { + sym: sym.clone(), + score, + match_type, + }); + } + } + + // Sort by score desc, then by display_name asc as a stable tiebreaker. + hits.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| a.sym.display_name.cmp(&b.sym.display_name)) + }); + hits.truncate(limit); + + let emit_ranked = has_query; + let mut symbols = Vec::with_capacity(hits.len()); + let mut ranked = Vec::with_capacity(if emit_ranked { hits.len() } else { 0 }); + for h in hits { + if emit_ranked { + ranked.push(RankedSymbol { + symbol_uri: h.sym.uri.clone(), + score: h.score, + match_type: h.match_type, + }); } + symbols.push(h.sym); } - matches + (symbols, ranked) } /// Trigram fuzzy search across all tracked symbols and mounted slices. 
@@ -2278,7 +2527,7 @@ impl Default for LipDatabase { #[cfg(test)] mod tests { use super::*; - use crate::schema::SymbolKind; + use crate::schema::{ReferenceKind, SymbolKind}; fn make_rust_file(content: &str) -> (String, String, String) { ( @@ -3103,6 +3352,7 @@ impl Greeter { taint_labels: vec![], blast_radius: 0, is_exported: true, + ..Default::default() }) .collect(), slice_url: String::new(), @@ -3399,6 +3649,7 @@ impl Greeter { taint_labels: vec![], blast_radius: 0, is_exported: true, + ..Default::default() }; db.upsert_file_precomputed( uri.clone(), @@ -3964,6 +4215,7 @@ impl Greeter { taint_labels: vec![], blast_radius: 0, is_exported: false, + ..Default::default() }]; let occurrences = vec![OwnedOccurrence { symbol_uri: sym_uri.clone(), @@ -3976,6 +4228,8 @@ impl Greeter { confidence_score: 90, role: Role::Definition, override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, }]; db.upsert_file_precomputed( @@ -4021,6 +4275,7 @@ impl Greeter { taint_labels: vec![], blast_radius: 0, is_exported: false, + ..Default::default() }; let occ = OwnedOccurrence { symbol_uri: "lip://local/lib.rs#Foo".into(), @@ -4033,6 +4288,8 @@ impl Greeter { confidence_score: 90, role: Role::Definition, override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, }; db.upsert_file_precomputed( @@ -4078,6 +4335,8 @@ impl Greeter { confidence_score: 90, role: Role::Definition, override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, }; db.upsert_file_precomputed( uri_a.clone(), @@ -4101,6 +4360,8 @@ impl Greeter { confidence_score: 80, role: Role::Reference, override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, }; db.upsert_file_precomputed( uri_b.clone(), @@ -4134,6 +4395,8 @@ impl Greeter { confidence_score: 90, role: Role::Definition, override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, }; db.upsert_file_precomputed( uri_c.clone(), @@ -4171,6 +4434,8 @@ impl Greeter { confidence_score: 90, role: Role::Definition, 
override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, }; db.upsert_file_precomputed( uri_a.clone(), @@ -4194,6 +4459,8 @@ impl Greeter { confidence_score: 80, role: Role::Reference, override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, }; db.upsert_file_precomputed( uri_b.clone(), diff --git a/bindings/rust/src/query_graph/types.rs b/bindings/rust/src/query_graph/types.rs index 74048f8..4b9b18b 100644 --- a/bindings/rust/src/query_graph/types.rs +++ b/bindings/rust/src/query_graph/types.rs @@ -88,6 +88,34 @@ pub struct EnrichedBlastRadius { pub semantic_items: Vec, } +/// A single forward call edge returned by `QueryOutgoingCalls` (v2.3). +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +pub struct OutgoingCallEdge { + pub from_uri: String, + pub to_uri: String, +} + +/// How the client's query matched a workspace symbol's display name (v2.3 #5). +/// Discriminator only — not a ranking signal; the numeric `score` on +/// [`RankedSymbol`] is what callers sort by. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum MatchType { + Exact, + Prefix, + Fuzzy, +} + +/// Per-symbol ranking metadata for [`ServerMessage::WorkspaceSymbolsResult`] +/// (v2.3 Feature #5). Parallel to `symbols`: `ranked[i]` describes `symbols[i]`. +/// Tiered scoring — Exact=1.0, Prefix=0.8, Fuzzy=0.5 — not BM25. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct RankedSymbol { + pub symbol_uri: String, + pub score: f32, + pub match_type: MatchType, +} + /// An impact item discovered through embedding similarity rather than /// static call-graph edges. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -251,8 +279,27 @@ pub enum ServerMessage { #[serde(default, skip_serializing_if = "Vec::is_empty")] not_indexed_uris: Vec, }, + /// Response to [`ClientMessage::QueryBlastRadiusSymbol`]. + /// `result` is `None` when the symbol's defining file is not indexed. 
+ BlastRadiusSymbolResult { + #[serde(default, skip_serializing_if = "Option::is_none")] + result: Option, + }, + /// Response to [`ClientMessage::QueryOutgoingCalls`]. `edges` is a flat + /// list of `(caller, callee)` pairs collected during BFS, with no + /// guaranteed ordering. `truncated = true` means the configured node + /// cap stopped the BFS short. + OutgoingCallsResult { + edges: Vec, + truncated: bool, + }, WorkspaceSymbolsResult { symbols: Vec, + /// v2.3 Feature #5: per-symbol ranking information. + /// Empty when the client did not provide any filter/ranking cues + /// (pre-v2.3 callers get an empty vec and ignore it via serde default). + #[serde(default, skip_serializing_if = "Vec::is_empty")] + ranked: Vec, }, DocumentSymbolsResult { symbols: Vec, @@ -769,9 +816,43 @@ pub enum ClientMessage { #[serde(default)] min_score: Option, }, + /// Symbol-scoped blast radius with optional semantic enrichment (v2.3). + /// Single-symbol analogue of `QueryBlastRadiusBatch`. Delegates to the + /// file-level blast-radius computation for the symbol's defining file + /// and returns an `EnrichedBlastRadius`. + /// + /// When `min_score` is present, semantic enrichment runs for that file's + /// embedding against the index; absent means structural-only. + /// Returns `BlastRadiusSymbolResult`. + QueryBlastRadiusSymbol { + symbol_uri: String, + #[serde(default)] + min_score: Option, + }, + /// Outgoing call graph starting at `symbol_uri` (v2.3 Feature #4). + /// Returns the transitive forward call edges up to `depth` hops. + /// `truncated = true` when the BFS hit the configured node limit. + QueryOutgoingCalls { + symbol_uri: String, + /// BFS depth (>=1). Values <1 are treated as 1; >8 are clamped to 8 + /// to bound response size on pathological graphs. + depth: u32, + }, QueryWorkspaceSymbols { query: String, limit: Option, + /// v2.3 Feature #5: only return symbols whose `kind` is in this set. + /// Omit for no filter. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + kind_filter: Option>, + /// v2.3 Feature #5: only return symbols whose defining file URI + /// starts with this prefix. Omit to search all files. + #[serde(default, skip_serializing_if = "Option::is_none")] + scope: Option, + /// v2.3 Feature #5: only return symbols that carry at least one of + /// these modifier strings (e.g. "pub", "async"). Omit for no filter. + #[serde(default, skip_serializing_if = "Option::is_none")] + modifier_filter: Option>, }, QueryDocumentSymbols { uri: String, @@ -1246,6 +1327,8 @@ impl ClientMessage { "query_hover", "query_blast_radius", "query_blast_radius_batch", + "query_blast_radius_symbol", + "query_outgoing_calls", "query_workspace_symbols", "query_document_symbols", "query_dead_symbols", @@ -1320,6 +1403,8 @@ impl ClientMessage { ClientMessage::QueryHover { .. } => "query_hover", ClientMessage::QueryBlastRadius { .. } => "query_blast_radius", ClientMessage::QueryBlastRadiusBatch { .. } => "query_blast_radius_batch", + ClientMessage::QueryBlastRadiusSymbol { .. } => "query_blast_radius_symbol", + ClientMessage::QueryOutgoingCalls { .. } => "query_outgoing_calls", ClientMessage::QueryWorkspaceSymbols { .. } => "query_workspace_symbols", ClientMessage::QueryDocumentSymbols { .. } => "query_document_symbols", ClientMessage::QueryDeadSymbols { .. 
} => "query_dead_symbols", @@ -1728,9 +1813,20 @@ mod tests { changed_file_uris: vec![], min_score: None, }, + ClientMessage::QueryBlastRadiusSymbol { + symbol_uri: String::new(), + min_score: None, + }, + ClientMessage::QueryOutgoingCalls { + symbol_uri: String::new(), + depth: 1, + }, ClientMessage::QueryWorkspaceSymbols { query: String::new(), limit: None, + kind_filter: None, + scope: None, + modifier_filter: None, }, ClientMessage::QueryDocumentSymbols { uri: String::new() }, ClientMessage::QueryDeadSymbols { limit: None }, diff --git a/bindings/rust/src/schema/mod.rs b/bindings/rust/src/schema/mod.rs index c1e3eba..6966e1d 100644 --- a/bindings/rust/src/schema/mod.rs +++ b/bindings/rust/src/schema/mod.rs @@ -17,10 +17,14 @@ //! | [`OwnedDependencySlice`] | Pre-built index fragment from the registry | //! | [`OwnedEventStream`] | Batch of [`OwnedDelta`]s emitted by the indexer | +pub mod signature; pub mod types; +pub mod visibility; +pub use signature::normalize_signature; pub use types::{ - sha256_hex, Action, EdgeKind, IndexingState, LipUri, OwnedAnnotationEntry, OwnedDelta, - OwnedDependencySlice, OwnedDocument, OwnedEventStream, OwnedGraphEdge, OwnedOccurrence, - OwnedRange, OwnedRelationship, OwnedSymbolInfo, Role, SymbolKind, + sha256_hex, Action, Completeness, EdgeKind, ExtractionTier, IndexingState, LipUri, + ModifiersSource, OwnedAnnotationEntry, OwnedDelta, OwnedDependencySlice, OwnedDocument, + OwnedEventStream, OwnedGraphEdge, OwnedOccurrence, OwnedRange, OwnedRelationship, + OwnedSymbolInfo, ReferenceKind, Role, SymbolKind, Visibility, }; diff --git a/bindings/rust/src/schema/signature.rs b/bindings/rust/src/schema/signature.rs new file mode 100644 index 0000000..ddda146 --- /dev/null +++ b/bindings/rust/src/schema/signature.rs @@ -0,0 +1,344 @@ +//! Signature normalization for ABI-hash-stable comparisons. +//! +//! `normalize_signature` produces a canonical form of a symbol's signature +//! 
so that churn in parameter names or whitespace does not flip the hash. +//! The output is idempotent: `normalize(normalize(s)) == normalize(s)`. +//! +//! Strategy: +//! - Trim and collapse internal whitespace runs to a single space. +//! - Drop trailing documentation tails (`// …`, `# …` line comments). +//! - For languages that use `name: Type` params (Rust, TypeScript, Python, +//! Kotlin, Swift), replace the parameter name with `_` at paren depth 1. +//! - Languages that use different orderings (Go: `name Type`, C/C++/Dart: +//! `Type name`) get whitespace-only normalization for now. + +use crate::indexer::language::Language; + +/// Produce a canonical, param-name-agnostic form of `raw`. +pub fn normalize_signature(raw: &str, lang: Language) -> String { + let trimmed = strip_doc_tail(raw); + let collapsed = collapse_whitespace(trimmed); + if uses_colon_params(lang) { + strip_colon_param_names(&collapsed) + } else { + collapsed + } +} + +fn uses_colon_params(lang: Language) -> bool { + matches!( + lang, + Language::Rust + | Language::TypeScript + | Language::Python + | Language::Kotlin + | Language::Swift + ) +} + +/// Drop the first `//`-line comment or `#`-comment tail, if present. +/// +/// We only look at top-level (non-string) context for the line comment. +/// A simple heuristic: split at the first `\n`, then look for the comment +/// marker on that first line after any closing brace of the signature. +fn strip_doc_tail(s: &str) -> &str { + // Take only the first line — signatures are single-line after LSP hover + // code-block extraction, so any trailing documentation will be on a + // separate line anyway. + let first_line = s.split_once('\n').map(|(a, _)| a).unwrap_or(s); + + // Remove inline `//` comment if present. 
+ let without_slash = match first_line.find("//") { + Some(idx) => &first_line[..idx], + None => first_line, + }; + + // Remove inline `#` comment (Python) — conservative: only when + // preceded by whitespace, so `#[derive(...)]` attributes are preserved. + if let Some(idx) = find_hash_comment(without_slash) { + &without_slash[..idx] + } else { + without_slash + } +} + +fn find_hash_comment(s: &str) -> Option { + let bytes = s.as_bytes(); + for (i, &c) in bytes.iter().enumerate() { + if c == b'#' && i > 0 && bytes[i - 1].is_ascii_whitespace() { + return Some(i); + } + } + None +} + +/// Trim and collapse runs of ASCII whitespace to a single space. +fn collapse_whitespace(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + let mut in_ws = false; + for c in s.chars() { + if c.is_ascii_whitespace() { + in_ws = true; + } else { + if in_ws && !out.is_empty() { + out.push(' '); + } + in_ws = false; + out.push(c); + } + } + out +} + +/// Replace `ident:` with `_:` for parameters at paren depth 1 +/// (and angle depth 0). Handles optional `?` markers (TS) and +/// `*` / `**` prefixes (Python varargs). +fn strip_colon_param_names(s: &str) -> String { + let bytes = s.as_bytes(); + let mut out = String::with_capacity(bytes.len()); + let mut i = 0usize; + let mut paren: i32 = 0; + let mut angle: i32 = 0; + let mut expect_param = false; + + while i < bytes.len() { + let c = bytes[i]; + + if expect_param { + // Skip leading whitespace inside the param slot. + if c == b' ' { + out.push(' '); + i += 1; + continue; + } + if let Some((prefix_end, ident_end, colon_pos)) = try_match_named_param(bytes, i) { + // Emit any `*` / `**` prefix verbatim, then `_`, then any + // `?` / whitespace between the identifier and the colon. 
+ out.push_str(slice_str(bytes, i, prefix_end)); + out.push('_'); + out.push_str(slice_str(bytes, ident_end, colon_pos)); + i = colon_pos; + expect_param = false; + continue; + } + // Not a `name: type` pattern — emit chars normally and clear flag. + expect_param = false; + } + + match c { + b'(' => { + paren += 1; + if paren == 1 && angle == 0 { + expect_param = true; + } + } + b')' => { + paren -= 1; + } + b'<' => { + angle += 1; + } + b'>' => { + angle -= 1; + } + b',' if paren == 1 && angle == 0 => { + expect_param = true; + } + _ => {} + } + out.push(c as char); + i += 1; + } + out +} + +/// Try to match `[*]{1,2} ident (\s* \?)? \s* :` starting at `start`. +/// Returns `(prefix_end, ident_end, colon_pos)` on success, else `None`. +fn try_match_named_param(bytes: &[u8], start: usize) -> Option<(usize, usize, usize)> { + let mut j = start; + // Optional `*` / `**` prefix (Python varargs). + while j < bytes.len() && bytes[j] == b'*' && (j - start) < 2 { + j += 1; + } + let prefix_end = j; + // Identifier: first char must be [A-Za-z_], then [A-Za-z0-9_]. + if j >= bytes.len() || !is_ident_start(bytes[j]) { + return None; + } + j += 1; + while j < bytes.len() && is_ident_continue(bytes[j]) { + j += 1; + } + let ident_end = j; + // Optional `?` (TS optional param marker). + if j < bytes.len() && bytes[j] == b'?' { + j += 1; + } + // Optional whitespace. + while j < bytes.len() && bytes[j] == b' ' { + j += 1; + } + if j < bytes.len() && bytes[j] == b':' { + Some((prefix_end, ident_end, j)) + } else { + None + } +} + +fn is_ident_start(b: u8) -> bool { + b == b'_' || b.is_ascii_alphabetic() +} + +fn is_ident_continue(b: u8) -> bool { + b == b'_' || b.is_ascii_alphanumeric() +} + +fn slice_str(bytes: &[u8], a: usize, b: usize) -> &str { + // Safe: callers only pass indices at ASCII boundaries. 
+ std::str::from_utf8(&bytes[a..b]).unwrap_or("") +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn rust_strips_param_names() { + let got = normalize_signature("fn foo(x: i32, y: &str) -> bool", Language::Rust); + assert_eq!(got, "fn foo(_: i32, _: &str) -> bool"); + } + + #[test] + fn rust_preserves_self_variants() { + // &self / &mut self have no colon, so nothing to replace. + let got = normalize_signature("fn bar(&self, x: i32)", Language::Rust); + assert_eq!(got, "fn bar(&self, _: i32)"); + } + + #[test] + fn rust_generic_params_untouched() { + // Angle-bracket generics must not be treated as params. + let got = normalize_signature("fn baz(x: Vec) -> T", Language::Rust); + assert_eq!(got, "fn baz(_: Vec) -> T"); + } + + #[test] + fn typescript_optional_marker_preserved() { + let got = normalize_signature("foo(x?: number, y: string): boolean", Language::TypeScript); + assert_eq!(got, "foo(_?: number, _: string): boolean"); + } + + #[test] + fn python_defaults_and_varargs() { + let got = normalize_signature("def f(*args: int, **kwargs: str) -> None", Language::Python); + assert_eq!(got, "def f(*_: int, **_: str) -> None"); + } + + #[test] + fn kotlin_strips_names() { + let got = normalize_signature("fun foo(x: Int, y: String): Boolean", Language::Kotlin); + assert_eq!(got, "fun foo(_: Int, _: String): Boolean"); + } + + #[test] + fn swift_strips_names() { + let got = normalize_signature("func foo(x: Int, y: String) -> Bool", Language::Swift); + assert_eq!(got, "func foo(_: Int, _: String) -> Bool"); + } + + #[test] + fn go_whitespace_only() { + // Go uses `name Type`; we don't (yet) strip names, just collapse WS. 
+ let got = normalize_signature("func foo(x int, y string) bool", Language::Go); + assert_eq!(got, "func foo(x int, y string) bool"); + } + + #[test] + fn c_whitespace_only() { + let got = normalize_signature("int foo(const char *s, int n)", Language::C); + assert_eq!(got, "int foo(const char *s, int n)"); + } + + #[test] + fn dart_whitespace_only() { + let got = normalize_signature("bool foo(int x, String y)", Language::Dart); + assert_eq!(got, "bool foo(int x, String y)"); + } + + #[test] + fn whitespace_collapse() { + let got = normalize_signature("fn foo(x: i32) -> bool", Language::Rust); + assert_eq!(got, "fn foo(_: i32) -> bool"); + } + + #[test] + fn doc_tail_slash_comment_dropped() { + let got = normalize_signature("fn foo(x: i32) -> bool // frobnicate", Language::Rust); + assert_eq!(got, "fn foo(_: i32) -> bool"); + } + + #[test] + fn doc_tail_hash_comment_dropped_python() { + let got = normalize_signature("def f(x: int) -> None # legacy", Language::Python); + assert_eq!(got, "def f(_: int) -> None"); + } + + #[test] + fn rust_attribute_preserved() { + // `#[derive(...)]` uses `#` but no preceding whitespace, so it survives. 
+ let got = normalize_signature("#[inline] fn foo(x: i32)", Language::Rust); + assert_eq!(got, "#[inline] fn foo(_: i32)"); + } + + #[test] + fn idempotent_rust() { + let once = normalize_signature("fn foo(xs: Vec, y: &str) -> T", Language::Rust); + let twice = normalize_signature(&once, Language::Rust); + assert_eq!(once, twice); + } + + #[test] + fn idempotent_typescript() { + let once = normalize_signature("foo(x?: number): Promise", Language::TypeScript); + let twice = normalize_signature(&once, Language::TypeScript); + assert_eq!(once, twice); + } + + #[test] + fn idempotent_go() { + let once = normalize_signature("func foo(x int, y string) bool", Language::Go); + let twice = normalize_signature(&once, Language::Go); + assert_eq!(once, twice); + } + + #[test] + fn param_name_change_normalizes_equal() { + let a = normalize_signature("fn foo(x: i32, y: i32) -> i32", Language::Rust); + let b = normalize_signature("fn foo(a: i32, b: i32) -> i32", Language::Rust); + assert_eq!(a, b); + } + + #[test] + fn empty_and_no_params() { + assert_eq!(normalize_signature("", Language::Rust), ""); + assert_eq!( + normalize_signature("fn foo() -> bool", Language::Rust), + "fn foo() -> bool" + ); + } + + #[test] + fn nested_parens_in_fn_type_arg() { + // Callback params inside generic arg: outer paren strip only + // at depth 1; inner should still get name stripped. + let got = normalize_signature( + "fn foo(cb: fn(x: i32) -> bool) -> ()", + Language::Rust, + ); + // Inner `x:` is at paren depth 2, so NOT stripped by the current + // depth-1-only rule. Record this limitation as the expected output. + assert_eq!(got, "fn foo(_: fn(x: i32) -> bool) -> ()"); + } +} diff --git a/bindings/rust/src/schema/types.rs b/bindings/rust/src/schema/types.rs index ca5451d..6f14542 100644 --- a/bindings/rust/src/schema/types.rs +++ b/bindings/rust/src/schema/types.rs @@ -95,6 +95,36 @@ pub enum Action { Delete, } +/// Fine-grained classification of a reference occurrence (v2.3). 
+/// +/// Populated when `role != Definition`. `Role` marks whether an occurrence is +/// a definition/reference/read/write; `ReferenceKind` adds the *reason* the +/// reference exists — call site vs. type position vs. inheritance clause. CKB +/// uses it to distinguish "X is called from Y" from "X is the return type of +/// Y" without re-parsing. +/// +/// Tier-1 can classify Call/Read/Write from the tree-sitter parent node; +/// Type/Implements/Extends require Tier-2 type information to distinguish +/// reliably. Leave as `Unknown` when the extractor cannot decide. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[serde(rename_all = "snake_case")] +pub enum ReferenceKind { + #[default] + Unknown, + Call, + Read, + Write, + Type, + Implements, + Extends, +} + +impl ReferenceKind { + pub fn is_unknown(&self) -> bool { + matches!(self, ReferenceKind::Unknown) + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] #[serde(rename_all = "snake_case")] pub enum Role { @@ -137,6 +167,46 @@ pub enum IndexingState { WarmFull, } +/// Symbol visibility (spec §v2.3 Feature #1). +/// +/// LIP owns inference — see `schema::visibility::infer`. Derived from language +/// rules + modifiers at ingest time and carried on `OwnedSymbolInfo`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Visibility { + Public, + Private, + Internal, + Protected, +} + +/// Which extraction tier produced a symbol record (spec §v2.3 Feature #1). +/// +/// Telemetry: NOT included in `OwnedSymbolInfo::PartialEq` so that a Tier-1 → +/// Tier-2 upgrade with no structural change does not invalidate the salsa +/// early-cutoff. Clients tier-gate confidence at query time. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[serde(rename_all = "snake_case")] +pub enum ExtractionTier { + #[default] + Tier1, + Tier1p5, + Tier2, + Tier3Scip, +} + +/// Provenance of the `modifiers` field on SCIP-imported symbols. +/// +/// `Proto` = the vendored SCIP `SymbolInformation.modifiers` field was present. +/// `PrefixParse` = fell back to parsing the signature prefix (older `.scip` +/// blobs predate upstream field 7). CKB discounts confidence on `PrefixParse`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ModifiersSource { + Proto, + PrefixParse, +} + // ─── Owned heap types ──────────────────────────────────────────────────────── #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] @@ -156,12 +226,33 @@ pub struct OwnedRelationship { pub is_override: bool, } +/// How fully the indexer resolved a symbol (spec §v2.3 Feature #1). +/// +/// `score` in [0.0, 1.0]. `reason` is a short stable tag (e.g. `"tier1_syntactic"`, +/// `"lsp_verified"`, `"scip_precomputed"`, `"scip_unresolved_local"`). +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Completeness { + pub score: f32, + pub reason: String, +} + /// Heap-allocated SymbolInfo. /// /// `runtime_p99_ms` and `call_rate_per_s` are advisory telemetry fields /// (spec §8.3). They are excluded from `PartialEq`/`Eq`/`Hash` so that /// salsa's early-cutoff can fire purely on the structural intelligence fields. -#[derive(Debug, Clone, Serialize, Deserialize)] +/// +/// v2.3 adds rich metadata (spec §v2.3 Feature #1). Eq split per decision C.5: +/// `modifiers`, `visibility`, `container_name`, `signature_normalized` are +/// structural (a flip is an ABI change) and participate in Eq. 
`completeness`, +/// `visibility_confidence`, `extraction_tier`, `modifiers_source` are telemetry +/// and are excluded — a Tier-1 → Tier-2 upgrade with no structural change must +/// not invalidate the salsa early-cutoff. +/// +/// `Default` is derived so construction sites can use `..Default::default()` +/// for telemetry/v2.3 fields. Note: the derived default has `confidence_score +/// = 0`; prefer `OwnedSymbolInfo::new` when you want the `30` baseline. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct OwnedSymbolInfo { pub uri: String, pub display_name: String, @@ -179,6 +270,35 @@ pub struct OwnedSymbolInfo { /// Set by the Tier 1 extractor using language-specific visibility rules; /// used by `file_api_surface()` for stable ABI hash computation. pub is_exported: bool, + + // ── v2.3 rich metadata — structural (in Eq) ────────────────────────────── + /// Whitespace- and param-name-stripped signature, for API-compat comparison. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub signature_normalized: Option, + /// Raw modifier list from SCIP or tree-sitter (`public`, `async`, `static`, + /// `deprecated`, `export`, `test`, …). Empty = extractor ran and saw none. + /// Use `extraction_tier` to tell "none" from "not yet extracted". + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub modifiers: Vec, + /// Canonical visibility, inferred by `schema::visibility::infer`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub visibility: Option, + /// Enclosing class / namespace / module name. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub container_name: Option, + + // ── v2.3 rich metadata — telemetry (NOT in Eq) ─────────────────────────── + #[serde(default, skip_serializing_if = "Option::is_none")] + pub visibility_confidence: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub completeness: Option, + /// Which tier produced this record. 
Defaults to `Tier1` on deserialize of + /// v2.2 payloads. + #[serde(default)] + pub extraction_tier: ExtractionTier, + /// Only set on SCIP-imported symbols; `None` on Tier-1/Tier-2 native paths. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub modifiers_source: Option, } impl PartialEq for OwnedSymbolInfo { @@ -193,7 +313,13 @@ impl PartialEq for OwnedSymbolInfo { && self.taint_labels == other.taint_labels && self.blast_radius == other.blast_radius && self.is_exported == other.is_exported - // runtime_p99_ms / call_rate_per_s intentionally omitted + // v2.3 structural fields — ABI-bearing, participate in early-cutoff + && self.signature_normalized == other.signature_normalized + && self.modifiers == other.modifiers + && self.visibility == other.visibility + && self.container_name == other.container_name + // Excluded telemetry: runtime_p99_ms, call_rate_per_s, + // visibility_confidence, completeness, extraction_tier, modifiers_source } } impl Eq for OwnedSymbolInfo {} @@ -221,6 +347,14 @@ impl OwnedSymbolInfo { taint_labels: vec![], blast_radius: 0, is_exported: false, + signature_normalized: None, + modifiers: vec![], + visibility: None, + container_name: None, + visibility_confidence: None, + completeness: None, + extraction_tier: ExtractionTier::Tier1, + modifiers_source: None, } } } @@ -232,6 +366,23 @@ pub struct OwnedOccurrence { pub confidence_score: u8, pub role: Role, pub override_doc: Option, + + // ── v2.3 reference classification (additive) ───────────────────────────── + /// Fine-grained reason this reference exists (call site vs. type position + /// vs. inheritance clause). Defaults to `Unknown` on v2.2 payloads and + /// when the extractor cannot classify. Skipped on wire when `Unknown` so + /// v2.2 clients see the exact same JSON they used to. 
+ #[serde(default, skip_serializing_if = "ReferenceKind::is_unknown")] + pub kind: ReferenceKind, + /// True when the enclosing file or function is recognised as a test + /// (file path under a test dir, `#[test]` attribute, `@Test` annotation, + /// etc.). Lets CKB down-rank test-only references in production queries. + #[serde(default, skip_serializing_if = "is_false")] + pub is_test: bool, +} + +fn is_false(b: &bool) -> bool { + !*b } /// Edge kind in the Code Property Graph (spec §4.1, §8.5). @@ -440,4 +591,62 @@ mod tests { assert_eq!(sha256_hex(b"hello"), sha256_hex(b"hello")); assert_ne!(sha256_hex(b"hello"), sha256_hex(b"world")); } + + // ── v2.3 reference classification ───────────────────────────────────────── + + #[test] + fn v22_occurrence_json_deserializes_with_default_ref_fields() { + // A v2.2 client sends no `kind` or `is_test`; v2.3 code must accept + // the payload and fill defaults (Unknown / false). + let json = r#"{ + "symbol_uri": "lip://local/a.rs#foo", + "range": {"start_line":0,"start_char":0,"end_line":0,"end_char":3}, + "confidence_score": 80, + "role": "reference", + "override_doc": null + }"#; + let occ: OwnedOccurrence = serde_json::from_str(json).expect("v2.2 payload"); + assert_eq!(occ.kind, ReferenceKind::Unknown); + assert!(!occ.is_test); + } + + #[test] + fn v23_unknown_and_is_test_false_skipped_on_wire() { + // Defaults must not bloat the wire — v2.2 clients should see identical + // JSON to what they produce. 
+ let occ = OwnedOccurrence { + symbol_uri: "lip://local/a.rs#foo".into(), + range: OwnedRange::default(), + confidence_score: 50, + role: Role::Reference, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }; + let json = serde_json::to_string(&occ).unwrap(); + assert!(!json.contains("\"kind\""), "kind:unknown must be skipped: {json}"); + assert!( + !json.contains("\"is_test\""), + "is_test:false must be skipped: {json}" + ); + } + + #[test] + fn v23_non_default_ref_fields_roundtrip() { + let occ = OwnedOccurrence { + symbol_uri: "lip://local/a.rs#foo".into(), + range: OwnedRange::default(), + confidence_score: 50, + role: Role::Reference, + override_doc: None, + kind: ReferenceKind::Call, + is_test: true, + }; + let json = serde_json::to_string(&occ).unwrap(); + assert!(json.contains("\"kind\":\"call\""), "got {json}"); + assert!(json.contains("\"is_test\":true"), "got {json}"); + let back: OwnedOccurrence = serde_json::from_str(&json).unwrap(); + assert_eq!(back.kind, ReferenceKind::Call); + assert!(back.is_test); + } } diff --git a/bindings/rust/src/schema/visibility.rs b/bindings/rust/src/schema/visibility.rs new file mode 100644 index 0000000..c5a3eba --- /dev/null +++ b/bindings/rust/src/schema/visibility.rs @@ -0,0 +1,356 @@ +//! Visibility inference from modifier keywords and name conventions. +//! +//! `infer(name, modifiers, lang)` returns `(Visibility, confidence)` where +//! confidence is a 0-100 score: +//! - `100` — explicit modifier keyword determined visibility (e.g. `pub`, +//! `private`, `export`). +//! - `80` — naming convention determined visibility (Go capital-letter +//! export, Python/Dart `_` prefix, language default). +//! - `50` — fallback when neither modifiers nor conventions are decisive. +//! +//! This helper is the single source of truth so Tier 1, Tier 1.5 and SCIP +//! import paths produce consistent visibility values. 
+ +use crate::indexer::language::Language; +use crate::schema::Visibility; + +/// Infer a symbol's visibility and a 0-100 confidence score. +pub fn infer(name: &str, modifiers: &[String], lang: Language) -> (Visibility, u8) { + match lang { + Language::Rust => infer_rust(modifiers), + Language::TypeScript + | Language::JavaScript + | Language::JavaScriptReact => infer_ts_js(modifiers), + Language::Python => infer_python(name), + Language::Dart => infer_dart(name), + Language::Go => infer_go(name), + Language::Kotlin => infer_kotlin(modifiers), + Language::Swift => infer_swift(modifiers), + Language::C | Language::Cpp => infer_cpp(modifiers), + Language::Unknown => (Visibility::Public, 50), + } +} + +fn has_mod(modifiers: &[String], keyword: &str) -> bool { + modifiers.iter().any(|m| m == keyword) +} + +fn infer_rust(modifiers: &[String]) -> (Visibility, u8) { + // `pub` → Public; `pub(crate)`, `pub(super)`, `pub(in …)` → Internal. + if has_mod(modifiers, "pub") { + return (Visibility::Public, 100); + } + if modifiers.iter().any(|m| m.starts_with("pub(")) { + return (Visibility::Internal, 100); + } + (Visibility::Private, 50) +} + +fn infer_ts_js(modifiers: &[String]) -> (Visibility, u8) { + if has_mod(modifiers, "private") { + return (Visibility::Private, 100); + } + if has_mod(modifiers, "protected") { + return (Visibility::Protected, 100); + } + if has_mod(modifiers, "export") || has_mod(modifiers, "public") { + return (Visibility::Public, 100); + } + // No explicit keyword: a top-level symbol without `export` is module-private + // but a class member without a modifier is public. Without more context we + // return `Internal` at low confidence. + (Visibility::Internal, 50) +} + +fn infer_python(name: &str) -> (Visibility, u8) { + // PEP 8: dunder (`__x__`) is public by convention; `__x` is + // name-mangled (private to class); `_x` is module-private. 
+ if name.starts_with("__") && name.ends_with("__") && name.len() > 4 { + return (Visibility::Public, 80); + } + if name.starts_with('_') { + return (Visibility::Private, 80); + } + (Visibility::Public, 80) +} + +fn infer_dart(name: &str) -> (Visibility, u8) { + // Dart: `_`-prefixed identifiers are library-private. + if name.starts_with('_') { + return (Visibility::Private, 80); + } + (Visibility::Public, 80) +} + +fn infer_go(name: &str) -> (Visibility, u8) { + // Go export rule: first rune uppercase → exported. + match name.chars().next() { + Some(c) if c.is_ascii_uppercase() => (Visibility::Public, 80), + Some(_) => (Visibility::Private, 80), + None => (Visibility::Public, 50), + } +} + +fn infer_kotlin(modifiers: &[String]) -> (Visibility, u8) { + if has_mod(modifiers, "private") { + return (Visibility::Private, 100); + } + if has_mod(modifiers, "protected") { + return (Visibility::Protected, 100); + } + if has_mod(modifiers, "internal") { + return (Visibility::Internal, 100); + } + if has_mod(modifiers, "public") { + return (Visibility::Public, 100); + } + (Visibility::Public, 80) // Kotlin default is public +} + +fn infer_swift(modifiers: &[String]) -> (Visibility, u8) { + if has_mod(modifiers, "private") || has_mod(modifiers, "fileprivate") { + return (Visibility::Private, 100); + } + if has_mod(modifiers, "public") || has_mod(modifiers, "open") { + return (Visibility::Public, 100); + } + if has_mod(modifiers, "internal") { + return (Visibility::Internal, 100); + } + (Visibility::Internal, 80) // Swift default is internal +} + +fn infer_cpp(modifiers: &[String]) -> (Visibility, u8) { + if has_mod(modifiers, "private") { + return (Visibility::Private, 100); + } + if has_mod(modifiers, "protected") { + return (Visibility::Protected, 100); + } + if has_mod(modifiers, "public") { + return (Visibility::Public, 100); + } + // C / free C++ functions are public; class-body defaults (private for + // `class`, public for `struct`) need call-site context we don't have 
here. + (Visibility::Public, 50) +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + fn mods(ms: &[&str]) -> Vec { + ms.iter().map(|s| (*s).to_owned()).collect() + } + + // Rust + #[test] + fn rust_pub_is_public() { + assert_eq!( + infer("foo", &mods(&["pub"]), Language::Rust), + (Visibility::Public, 100) + ); + } + + #[test] + fn rust_pub_crate_is_internal() { + assert_eq!( + infer("foo", &mods(&["pub(crate)"]), Language::Rust), + (Visibility::Internal, 100) + ); + } + + #[test] + fn rust_pub_super_is_internal() { + assert_eq!( + infer("foo", &mods(&["pub(super)"]), Language::Rust), + (Visibility::Internal, 100) + ); + } + + #[test] + fn rust_no_modifier_is_private() { + assert_eq!( + infer("foo", &[], Language::Rust), + (Visibility::Private, 50) + ); + } + + // TypeScript / JavaScript + #[test] + fn ts_export_is_public() { + assert_eq!( + infer("foo", &mods(&["export"]), Language::TypeScript), + (Visibility::Public, 100) + ); + } + + #[test] + fn ts_private_keyword_is_private() { + assert_eq!( + infer("foo", &mods(&["private"]), Language::TypeScript), + (Visibility::Private, 100) + ); + } + + #[test] + fn ts_protected_is_protected() { + assert_eq!( + infer("foo", &mods(&["protected"]), Language::TypeScript), + (Visibility::Protected, 100) + ); + } + + #[test] + fn ts_no_modifier_is_internal_lowconf() { + assert_eq!( + infer("foo", &[], Language::TypeScript), + (Visibility::Internal, 50) + ); + } + + // Python + #[test] + fn python_underscore_is_private() { + assert_eq!( + infer("_helper", &[], Language::Python), + (Visibility::Private, 80) + ); + } + + #[test] + fn python_double_underscore_is_private() { + assert_eq!( + infer("__mangled", &[], Language::Python), + (Visibility::Private, 80) + ); + } + + #[test] + fn python_dunder_is_public() { + assert_eq!( + infer("__init__", &[], Language::Python), + (Visibility::Public, 80) + ); + } + + #[test] + fn 
python_plain_name_is_public() { + assert_eq!( + infer("foo", &[], Language::Python), + (Visibility::Public, 80) + ); + } + + // Dart + #[test] + fn dart_underscore_is_private() { + assert_eq!( + infer("_internal", &[], Language::Dart), + (Visibility::Private, 80) + ); + } + + #[test] + fn dart_plain_is_public() { + assert_eq!( + infer("public", &[], Language::Dart), + (Visibility::Public, 80) + ); + } + + // Go + #[test] + fn go_capital_is_public() { + assert_eq!( + infer("Foo", &[], Language::Go), + (Visibility::Public, 80) + ); + } + + #[test] + fn go_lowercase_is_private() { + assert_eq!( + infer("foo", &[], Language::Go), + (Visibility::Private, 80) + ); + } + + // Kotlin + #[test] + fn kotlin_private_keyword() { + assert_eq!( + infer("foo", &mods(&["private"]), Language::Kotlin), + (Visibility::Private, 100) + ); + } + + #[test] + fn kotlin_internal_keyword() { + assert_eq!( + infer("foo", &mods(&["internal"]), Language::Kotlin), + (Visibility::Internal, 100) + ); + } + + #[test] + fn kotlin_default_is_public() { + assert_eq!( + infer("foo", &[], Language::Kotlin), + (Visibility::Public, 80) + ); + } + + // Swift + #[test] + fn swift_fileprivate_is_private() { + assert_eq!( + infer("foo", &mods(&["fileprivate"]), Language::Swift), + (Visibility::Private, 100) + ); + } + + #[test] + fn swift_open_is_public() { + assert_eq!( + infer("foo", &mods(&["open"]), Language::Swift), + (Visibility::Public, 100) + ); + } + + #[test] + fn swift_default_is_internal() { + assert_eq!( + infer("foo", &[], Language::Swift), + (Visibility::Internal, 80) + ); + } + + // C / C++ + #[test] + fn cpp_explicit_private() { + assert_eq!( + infer("foo", &mods(&["private"]), Language::Cpp), + (Visibility::Private, 100) + ); + } + + #[test] + fn c_no_modifier_is_public_lowconf() { + assert_eq!( + infer("foo", &[], Language::C), + (Visibility::Public, 50) + ); + } + + // Unknown + #[test] + fn unknown_language_fallback() { + assert_eq!( + infer("foo", &[], Language::Unknown), + 
(Visibility::Public, 50) + ); + } +} diff --git a/bindings/rust/tests/integration.rs b/bindings/rust/tests/integration.rs index 19a1e8d..7a40d20 100644 --- a/bindings/rust/tests/integration.rs +++ b/bindings/rust/tests/integration.rs @@ -7,8 +7,8 @@ use tokio::net::UnixStream; use lip_core::daemon::LipDaemon; use lip_core::query_graph::{ClientMessage, ErrorCode, ServerMessage}; use lip_core::schema::{ - Action, IndexingState, OwnedDocument, OwnedOccurrence, OwnedRange, OwnedSymbolInfo, Role, - SymbolKind, + Action, ExtractionTier, IndexingState, ModifiersSource, OwnedDocument, OwnedOccurrence, + OwnedRange, OwnedSymbolInfo, ReferenceKind, Role, SymbolKind, Visibility, }; // ─── Framing helpers (client side) ─────────────────────────────────────────── @@ -216,6 +216,9 @@ async fn daemon_workspace_symbols() { send( &mut client, &ClientMessage::QueryWorkspaceSymbols { + kind_filter: None, + scope: None, + modifier_filter: None, query: "Widget".to_owned(), limit: Some(50), }, @@ -225,7 +228,7 @@ async fn daemon_workspace_symbols() { let resp = recv(&mut client).await.expect("recv workspace symbols"); match resp { - ServerMessage::WorkspaceSymbolsResult { symbols } => { + ServerMessage::WorkspaceSymbolsResult { symbols, .. } => { // tree-sitter should have found at least the two struct declarations. assert!( !symbols.is_empty(), @@ -307,6 +310,9 @@ async fn daemon_restart_restores_journal() { send( &mut client, &ClientMessage::QueryWorkspaceSymbols { + kind_filter: None, + scope: None, + modifier_filter: None, query: "persisted".into(), limit: Some(10), }, @@ -316,7 +322,7 @@ async fn daemon_restart_restores_journal() { let resp = recv(&mut client).await.unwrap(); match resp { - ServerMessage::WorkspaceSymbolsResult { symbols } => { + ServerMessage::WorkspaceSymbolsResult { symbols, .. 
} => { assert!( !symbols.is_empty(), "expected persisted_fn to survive daemon restart, got no symbols" @@ -928,6 +934,7 @@ async fn scip_import_precomputed_symbols_searchable() { taint_labels: vec![], blast_radius: 0, is_exported: true, + ..Default::default() }], occurrences: vec![OwnedOccurrence { symbol_uri: symbol_uri.clone(), @@ -940,6 +947,8 @@ async fn scip_import_precomputed_symbols_searchable() { confidence_score: 100, role: Role::Definition, override_doc: None, + kind: lip_core::schema::ReferenceKind::Unknown, + is_test: false, }], merkle_path: uri.to_owned(), edges: vec![], @@ -971,6 +980,9 @@ async fn scip_import_precomputed_symbols_searchable() { send( &mut client, &ClientMessage::QueryWorkspaceSymbols { + kind_filter: None, + scope: None, + modifier_filter: None, query: "ScipWidget".to_owned(), limit: Some(10), }, @@ -980,7 +992,7 @@ async fn scip_import_precomputed_symbols_searchable() { let resp = recv(&mut client).await.expect("recv workspace symbols"); match resp { - ServerMessage::WorkspaceSymbolsResult { symbols } => { + ServerMessage::WorkspaceSymbolsResult { symbols, .. } => { assert!( !symbols.is_empty(), "expected ScipWidget in workspace symbols, got none" @@ -1031,3 +1043,832 @@ async fn scip_import_precomputed_symbols_searchable() { task.abort(); let _ = task.await; } + +// ─── v2.3 rich metadata end-to-end ─────────────────────────────────────────── + +/// Upsert a Rust file with source_text, then query WorkspaceSymbols and verify +/// the Tier-1 extractor's v2.3 structural fields (modifiers, visibility, +/// container_name, signature_normalized) survive the daemon's storage and +/// response serialization round-trip. Tier-1 produces `extraction_tier = Tier1` +/// and leaves `modifiers_source = None` (that field is reserved for SCIP). 
#[tokio::test]
async fn daemon_tier1_emits_v23_metadata() {
    // Each test gets its own socket inside a fresh temp dir.
    let dir = tempfile::tempdir().expect("tempdir");
    let socket_path = dir.path().join("lip_v23_tier1.sock");

    let daemon = LipDaemon::new(&socket_path);
    let task = tokio::spawn(async move { daemon.run().await.ok() });
    // NOTE(review): presumably gives the spawned daemon time to bind the
    // socket before the client connects — confirm; a fixed sleep can race.
    tokio::time::sleep(Duration::from_millis(20)).await;

    let mut client = UnixStream::connect(&socket_path).await.expect("connect");

    // A `pub async` method inside an `impl` block, so the test can assert on
    // modifiers, visibility and container_name below.
    let source = "\
pub struct Svc;
impl Svc {
    pub async fn handle(&self, x: i32) -> i32 { x }
}
";
    let uri = "lip://local/v23@0.1/svc.rs";
    send(
        &mut client,
        &ClientMessage::Delta {
            seq: 1,
            action: Action::Upsert,
            document: make_doc(uri, source),
        },
    )
    .await
    .expect("send upsert");
    // Consume the reply to the Delta so the next recv sees the query result.
    let _ = recv(&mut client).await.expect("recv ack");

    send(
        &mut client,
        &ClientMessage::QueryWorkspaceSymbols {
            kind_filter: None,
            scope: None,
            modifier_filter: None,
            query: "handle".to_owned(),
            limit: Some(10),
        },
    )
    .await
    .expect("send workspace query");

    let resp = recv(&mut client).await.expect("recv workspace symbols");
    // Pick the `handle` symbol out of the result; any other message is a failure.
    let handle = match resp {
        ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols
            .into_iter()
            .find(|s| s.display_name == "handle")
            .expect("expected 'handle' method in workspace symbols"),
        other => panic!("expected WorkspaceSymbolsResult, got {other:?}"),
    };

    assert_eq!(handle.extraction_tier, ExtractionTier::Tier1);
    assert_eq!(
        handle.modifiers_source, None,
        "Tier-1 must not set modifiers_source; that field is reserved for SCIP"
    );
    assert_eq!(handle.visibility, Some(Visibility::Public));
    assert_eq!(handle.container_name.as_deref(), Some("Svc"));
    assert!(
        handle.modifiers.iter().any(|m| m == "pub"),
        "expected `pub` modifier, got {:?}",
        handle.modifiers
    );
    assert!(
        handle.modifiers.iter().any(|m| m == "async"),
        "expected `async` modifier, got {:?}",
        handle.modifiers
    );
    // Only a substring check: the exact normalized form is not pinned here.
    assert!(
        handle
            .signature_normalized
            .as_deref()
            .map(|s| s.contains("fn handle"))
            .unwrap_or(false),
        "expected normalized signature containing `fn handle`, got {:?}",
        handle.signature_normalized
    );

    // Abort the daemon task and wait for the abort to complete.
    task.abort();
    let _ = task.await;
}

/// Upsert a Delta carrying pre-computed symbols with v2.3 fields populated (as
/// a SCIP-style importer would produce) and verify every structural and
/// telemetry field survives the daemon's storage and query serialization.
///
/// This is the tightest available check that the daemon's write path does not
/// drop `modifiers`, `visibility`, `container_name`, `signature_normalized`,
/// `extraction_tier`, or `modifiers_source`.
+#[tokio::test] +async fn daemon_precomputed_preserves_v23_metadata() { + let dir = tempfile::tempdir().expect("tempdir"); + let socket_path = dir.path().join("lip_v23_scip.sock"); + + let daemon = LipDaemon::new(&socket_path); + let task = tokio::spawn(async move { daemon.run().await.ok() }); + tokio::time::sleep(Duration::from_millis(20)).await; + + let mut client = UnixStream::connect(&socket_path).await.expect("connect"); + + let uri = "lip://local/scip@1.0/imported.rs"; + let sym_uri = format!("{uri}#RichSym"); + + let sym = OwnedSymbolInfo { + uri: sym_uri.clone(), + display_name: "RichSym".to_owned(), + kind: SymbolKind::Function, + documentation: Some("A symbol carrying full v2.3 metadata.".to_owned()), + signature: Some("pub async fn RichSym(x: i32) -> Bar".to_owned()), + signature_normalized: Some("pub async fn RichSym(_: i32) -> Bar".to_owned()), + modifiers: vec!["pub".to_owned(), "async".to_owned()], + visibility: Some(Visibility::Public), + visibility_confidence: Some(1.0), + container_name: Some("RichContainer".to_owned()), + extraction_tier: ExtractionTier::Tier3Scip, + modifiers_source: Some(ModifiersSource::PrefixParse), + confidence_score: 100, + is_exported: true, + ..Default::default() + }; + + let doc = OwnedDocument { + uri: uri.to_owned(), + content_hash: "deadbeef".to_owned(), + language: "rust".to_owned(), + symbols: vec![sym], + occurrences: vec![OwnedOccurrence { + symbol_uri: sym_uri.clone(), + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 7, + }, + confidence_score: 100, + role: Role::Definition, + override_doc: None, + kind: lip_core::schema::ReferenceKind::Unknown, + is_test: false, + }], + merkle_path: uri.to_owned(), + edges: vec![], + source_text: None, // SCIP-style: no source + }; + + send( + &mut client, + &ClientMessage::Delta { + seq: 7, + action: Action::Upsert, + document: doc, + }, + ) + .await + .expect("send delta"); + let _ = recv(&mut client).await.expect("recv ack"); + + send( + &mut 
client, + &ClientMessage::QueryWorkspaceSymbols { + kind_filter: None, + scope: None, + modifier_filter: None, + query: "RichSym".to_owned(), + limit: Some(10), + }, + ) + .await + .expect("send query"); + + let resp = recv(&mut client).await.expect("recv workspace symbols"); + let got = match resp { + ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols + .into_iter() + .find(|s| s.display_name == "RichSym") + .expect("expected RichSym in workspace symbols"), + other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), + }; + + assert_eq!(got.extraction_tier, ExtractionTier::Tier3Scip); + assert_eq!(got.modifiers_source, Some(ModifiersSource::PrefixParse)); + assert_eq!(got.visibility, Some(Visibility::Public)); + assert_eq!(got.visibility_confidence, Some(1.0)); + assert_eq!(got.container_name.as_deref(), Some("RichContainer")); + assert_eq!(got.modifiers, vec!["pub".to_owned(), "async".to_owned()]); + assert_eq!( + got.signature_normalized.as_deref(), + Some("pub async fn RichSym(_: i32) -> Bar") + ); + + task.abort(); + let _ = task.await; +} + +/// End-to-end v2.3 reference-kind test. Upserts a Rust file with a function +/// definition and a call site, then verifies the daemon's ReferencesResult +/// carries `kind = Call` on the reference occurrence produced by Tier-1. 
#[tokio::test]
async fn daemon_tier1_call_occurrence_has_ref_kind_call() {
    let dir = tempfile::tempdir().expect("tempdir");
    let socket_path = dir.path().join("lip_v23_refkind.sock");

    let daemon = LipDaemon::new(&socket_path);
    let task = tokio::spawn(async move { daemon.run().await.ok() });
    // NOTE(review): presumably gives the spawned daemon time to bind the
    // socket before the client connects — confirm; a fixed sleep can race.
    tokio::time::sleep(Duration::from_millis(20)).await;

    let mut client = UnixStream::connect(&socket_path).await.expect("connect");

    // One definition (`callee`) and one call site inside `caller`.
    let source = "\
pub fn callee() {}
pub fn caller() {
    callee();
}
";
    let uri = "lip://local/v23refkind@0.1/src.rs";
    send(
        &mut client,
        &ClientMessage::Delta {
            seq: 1,
            action: Action::Upsert,
            document: make_doc(uri, source),
        },
    )
    .await
    .expect("send upsert");
    // Consume the reply to the Delta so the next recv sees the query result.
    let _ = recv(&mut client).await.expect("recv ack");

    // Step 1: look up the symbol URI the daemon assigned to `callee`.
    send(
        &mut client,
        &ClientMessage::QueryWorkspaceSymbols {
            kind_filter: None,
            scope: None,
            modifier_filter: None,
            query: "callee".to_owned(),
            limit: Some(10),
        },
    )
    .await
    .expect("send workspace query");
    let callee_uri = match recv(&mut client).await.expect("recv workspace") {
        ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols
            .into_iter()
            .find(|s| s.display_name == "callee")
            .expect("expected 'callee' in workspace symbols")
            .uri,
        other => panic!("expected WorkspaceSymbolsResult, got {other:?}"),
    };

    // Step 2: ask for the occurrences of that symbol.
    send(
        &mut client,
        &ClientMessage::QueryReferences {
            symbol_uri: callee_uri.clone(),
            limit: Some(10),
        },
    )
    .await
    .expect("send refs query");
    let refs = match recv(&mut client).await.expect("recv refs") {
        ServerMessage::ReferencesResult { occurrences } => occurrences,
        other => panic!("expected ReferencesResult, got {other:?}"),
    };

    // Filter on `Role::Reference` so the definition occurrence is not picked.
    let call_ref = refs
        .iter()
        .find(|o| o.role == Role::Reference)
        .expect("expected at least one reference occurrence for `callee`");
    assert_eq!(
        call_ref.kind,
        ReferenceKind::Call,
        "Tier-1 must tag `callee()` with ReferenceKind::Call; got {:?}",
        call_ref.kind
    );
    assert!(
        !call_ref.is_test,
        "non-test file must not set is_test; got {:?} for uri {}",
        call_ref.is_test, uri
    );

    // Abort the daemon task and wait for the abort to complete.
    task.abort();
    let _ = task.await;
}

/// Upsert a file whose URI contains `/tests/` and verify Tier-1 stamps
/// `is_test = true` on every occurrence — the down-rank signal for CKB.
#[tokio::test]
async fn daemon_tier1_test_file_stamps_is_test() {
    let dir = tempfile::tempdir().expect("tempdir");
    let socket_path = dir.path().join("lip_v23_istest.sock");

    let daemon = LipDaemon::new(&socket_path);
    let task = tokio::spawn(async move { daemon.run().await.ok() });
    // NOTE(review): presumably gives the spawned daemon time to bind the
    // socket before the client connects — confirm; a fixed sleep can race.
    tokio::time::sleep(Duration::from_millis(20)).await;

    let mut client = UnixStream::connect(&socket_path).await.expect("connect");

    // The source carries no test attribute; only the `/tests/` segment of the
    // URI marks this file as a test.
    let source = "pub fn helper() { helper(); }";
    let uri = "lip://local/myproj@0.1/tests/integration.rs";
    send(
        &mut client,
        &ClientMessage::Delta {
            seq: 1,
            action: Action::Upsert,
            document: make_doc(uri, source),
        },
    )
    .await
    .expect("send upsert");
    // Consume the reply to the Delta so the next recv sees the query result.
    let _ = recv(&mut client).await.expect("recv ack");

    // Step 1: look up the symbol URI the daemon assigned to `helper`.
    send(
        &mut client,
        &ClientMessage::QueryWorkspaceSymbols {
            kind_filter: None,
            scope: None,
            modifier_filter: None,
            query: "helper".to_owned(),
            limit: Some(10),
        },
    )
    .await
    .expect("send workspace query");
    let helper_uri = match recv(&mut client).await.expect("recv workspace") {
        ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols
            .into_iter()
            .find(|s| s.display_name == "helper")
            .expect("expected 'helper' in workspace symbols")
            .uri,
        other => panic!("expected WorkspaceSymbolsResult, got {other:?}"),
    };

    // Step 2: ask for the occurrences of that symbol.
    send(
        &mut client,
        &ClientMessage::QueryReferences {
            symbol_uri: helper_uri,
            limit: Some(10),
        },
    )
    .await
    .expect("send refs query");
    let refs = match recv(&mut client).await.expect("recv refs") {
        ServerMessage::ReferencesResult { occurrences } => occurrences,
        other => panic!("expected ReferencesResult, got {other:?}"),
    };

    // Guard against a vacuous pass: the loop below asserts nothing on an
    // empty result.
    assert!(
        !refs.is_empty(),
        "expected at least one reference occurrence"
    );
    for o in &refs {
        assert!(
            o.is_test,
            "Tier-1 must stamp is_test on occurrences from /tests/ files; got false"
        );
    }

    // Abort the daemon task and wait for the abort to complete.
    task.abort();
    let _ = task.await;
}

/// QueryBlastRadiusSymbol (v2.3 Feature #3): single-symbol analogue of
/// QueryBlastRadiusBatch. Upsert two Rust files where file B calls a function
/// defined in file A, then ask the daemon for A's function's blast radius and
/// verify file B appears in `affected_files`. Also verify the `None` path for
/// unknown symbols.
#[tokio::test]
async fn daemon_query_blast_radius_symbol() {
    let dir = tempfile::tempdir().expect("tempdir");
    let socket_path = dir.path().join("lip_v23_br_symbol.sock");

    let daemon = LipDaemon::new(&socket_path);
    let task = tokio::spawn(async move { daemon.run().await.ok() });
    // NOTE(review): presumably gives the spawned daemon time to bind the
    // socket before the client connects — confirm; a fixed sleep can race.
    tokio::time::sleep(Duration::from_millis(20)).await;

    let mut client = UnixStream::connect(&socket_path).await.expect("connect");

    // File A defines `victim`.
    let a_uri = "lip://local/brsym@0.1/a.rs";
    let a_src = "pub fn victim() {}";
    send(
        &mut client,
        &ClientMessage::Delta {
            seq: 1,
            action: Action::Upsert,
            document: make_doc(a_uri, a_src),
        },
    )
    .await
    .expect("send a");
    let _ = recv(&mut client).await.expect("ack a");

    // File B calls `victim` through its crate path.
    let b_uri = "lip://local/brsym@0.1/b.rs";
    let b_src = "\
pub fn caller() {
    crate::a::victim();
}
";
    send(
        &mut client,
        &ClientMessage::Delta {
            seq: 2,
            action: Action::Upsert,
            document: make_doc(b_uri, b_src),
        },
    )
    .await
    .expect("send b");
    let _ = recv(&mut client).await.expect("ack b");

    // Look up the symbol URI the daemon assigned to `victim`.
    send(
        &mut client,
        &ClientMessage::QueryWorkspaceSymbols {
            kind_filter: None,
            scope: None,
            modifier_filter: None,
            query: "victim".to_owned(),
            limit: Some(10),
        },
    )
    .await
    .expect("send workspace");
    let victim_uri = match recv(&mut client).await.expect("recv workspace") {
        ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols
            .into_iter()
            .find(|s| s.display_name == "victim")
            .expect("expected `victim` in workspace")
            .uri,
        other => panic!("expected WorkspaceSymbolsResult, got {other:?}"),
    };

    // Known symbol: the result must be `Some`.
    send(
        &mut client,
        &ClientMessage::QueryBlastRadiusSymbol {
            symbol_uri: victim_uri.clone(),
            min_score: None,
        },
    )
    .await
    .expect("send br symbol");
    let enriched = match recv(&mut client).await.expect("recv br symbol") {
        ServerMessage::BlastRadiusSymbolResult { result } => {
            result.expect("expected Some(EnrichedBlastRadius) for indexed symbol")
        }
        other => panic!("expected BlastRadiusSymbolResult, got {other:?}"),
    };

    // The file that defines `victim` — the enrichment's anchor.
    assert_eq!(enriched.file_uri, a_uri);
    assert!(
        enriched.static_result.affected_files.iter().any(|f| f == b_uri),
        "expected caller file {b_uri} in affected_files, got {:?}",
        enriched.static_result.affected_files
    );
    // min_score was None — enrichment must be skipped.
    assert!(
        enriched.semantic_items.is_empty(),
        "min_score = None must skip semantic enrichment; got {:?}",
        enriched.semantic_items
    );

    // Unknown symbol URI → None (not an error).
    send(
        &mut client,
        &ClientMessage::QueryBlastRadiusSymbol {
            symbol_uri: "lip://local/does/not/exist#nope".to_owned(),
            min_score: None,
        },
    )
    .await
    .expect("send unknown br symbol");
    match recv(&mut client).await.expect("recv unknown br symbol") {
        ServerMessage::BlastRadiusSymbolResult { result } => {
            assert!(result.is_none(), "unknown symbol must return None result")
        }
        other => panic!("expected BlastRadiusSymbolResult, got {other:?}"),
    }

    // Abort the daemon task and wait for the abort to complete.
    task.abort();
    let _ = task.await;
}

/// QueryOutgoingCalls (v2.3 Feature #4): forward call-graph BFS. Upsert a
/// single file containing an A→B→C call chain and verify that depth=2
/// returns both edges, while depth=1 only returns A→B.
#[tokio::test]
async fn daemon_query_outgoing_calls_depth() {
    let dir = tempfile::tempdir().expect("tempdir");
    let socket_path = dir.path().join("lip_v23_outgoing.sock");

    let daemon = LipDaemon::new(&socket_path);
    let task = tokio::spawn(async move { daemon.run().await.ok() });
    // NOTE(review): presumably gives the spawned daemon time to bind the
    // socket before the client connects — confirm; a fixed sleep can race.
    tokio::time::sleep(Duration::from_millis(20)).await;

    let mut client = UnixStream::connect(&socket_path).await.expect("connect");

    // Linear call chain a → b → c inside a single file.
    let uri = "lip://local/og@0.1/chain.rs";
    let src = "\
fn a() { b(); }
fn b() { c(); }
fn c() {}
";
    send(
        &mut client,
        &ClientMessage::Delta {
            seq: 1,
            action: Action::Upsert,
            document: make_doc(uri, src),
        },
    )
    .await
    .expect("send upsert");
    // Consume the reply to the Delta so the next recv sees the query result.
    let _ = recv(&mut client).await.expect("recv ack");

    // Resolve `a`'s URI via workspace symbols.
    // The query "a" is short, so the result is narrowed by exact display name.
    send(
        &mut client,
        &ClientMessage::QueryWorkspaceSymbols {
            kind_filter: None,
            scope: None,
            modifier_filter: None,
            query: "a".to_owned(),
            limit: Some(20),
        },
    )
    .await
    .expect("send workspace");
    let a_uri = match recv(&mut client).await.expect("recv workspace") {
        ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols
            .into_iter()
            .find(|s| s.display_name == "a")
            .expect("expected `a` in workspace symbols")
            .uri,
        other => panic!("expected WorkspaceSymbolsResult, got {other:?}"),
    };

    // Depth 2: A→B and B→C must both be present.
    send(
        &mut client,
        &ClientMessage::QueryOutgoingCalls {
            symbol_uri: a_uri.clone(),
            depth: 2,
        },
    )
    .await
    .expect("send outgoing depth=2");
    let (edges, truncated) = match recv(&mut client).await.expect("recv depth=2") {
        ServerMessage::OutgoingCallsResult { edges, truncated } => (edges, truncated),
        other => panic!("expected OutgoingCallsResult, got {other:?}"),
    };
    assert!(!truncated, "chain is tiny; truncated must be false");
    // Edges are matched by the `#b` / `#c` URI suffix rather than a full URI.
    assert!(
        edges.iter().any(|e| e.from_uri == a_uri && e.to_uri.ends_with("#b")),
        "expected A→B edge; got {edges:?}",
    );
    assert!(
        edges.iter().any(|e| e.from_uri.ends_with("#b") && e.to_uri.ends_with("#c")),
        "expected B→C edge at depth=2; got {edges:?}",
    );

    // Depth 1: B→C must be absent.
    send(
        &mut client,
        &ClientMessage::QueryOutgoingCalls {
            symbol_uri: a_uri.clone(),
            depth: 1,
        },
    )
    .await
    .expect("send outgoing depth=1");
    let (edges1, _) = match recv(&mut client).await.expect("recv depth=1") {
        ServerMessage::OutgoingCallsResult { edges, truncated } => (edges, truncated),
        other => panic!("expected OutgoingCallsResult, got {other:?}"),
    };
    assert!(
        edges1.iter().any(|e| e.to_uri.ends_with("#b")),
        "depth=1 must still include A→B; got {edges1:?}",
    );
    assert!(
        !edges1.iter().any(|e| e.to_uri.ends_with("#c")),
        "depth=1 must exclude B→C; got {edges1:?}",
    );

    // Abort the daemon task and wait for the abort to complete.
    task.abort();
    let _ = task.await;
}

/// Feature #5c: ranked workspace symbols.
///
/// Verifies the following v2.3 behaviors on `QueryWorkspaceSymbols`:
/// 1. Ranking tiers: Exact (1.0) > Prefix (0.8), and `ranked` parallels `symbols`.
/// 2. `kind_filter` narrows the result set to the requested `SymbolKind`s.
/// 3. `scope` restricts to symbols whose def-file URI starts with the prefix.
/// 4. `modifier_filter` restricts to symbols carrying at least one listed modifier.
/// 5. An empty query returns `ranked = []` (preserves pre-v2.3 semantics).
+#[tokio::test] +async fn daemon_workspace_symbols_v23_filters_and_ranking() { + let dir = tempfile::tempdir().expect("tempdir"); + let socket_path = dir.path().join("lip_v23_ranked.sock"); + + let daemon = LipDaemon::new(&socket_path); + let task = tokio::spawn(async move { daemon.run().await.ok() }); + tokio::time::sleep(Duration::from_millis(20)).await; + + let mut client = UnixStream::connect(&socket_path).await.expect("connect"); + + // File A under scope `lip://local/srv`: `Handler` struct + async fn `handle`. + let srv_uri = "lip://local/srv@0.1/a.rs"; + let srv_src = "\ +pub struct Handler; +pub async fn handle() {} +"; + send( + &mut client, + &ClientMessage::Delta { + seq: 1, + action: Action::Upsert, + document: make_doc(srv_uri, srv_src), + }, + ) + .await + .expect("send srv upsert"); + let _ = recv(&mut client).await.expect("recv srv ack"); + + // File B under scope `lip://local/cli`: `HandlerFactory` struct. + let cli_uri = "lip://local/cli@0.1/b.rs"; + let cli_src = "pub struct HandlerFactory;\n"; + send( + &mut client, + &ClientMessage::Delta { + seq: 2, + action: Action::Upsert, + document: make_doc(cli_uri, cli_src), + }, + ) + .await + .expect("send cli upsert"); + let _ = recv(&mut client).await.expect("recv cli ack"); + + // (1) Ranking tiers — query "Handler" hits Exact on `Handler` and Prefix on `HandlerFactory`. + send( + &mut client, + &ClientMessage::QueryWorkspaceSymbols { + kind_filter: None, + scope: None, + modifier_filter: None, + query: "Handler".to_owned(), + limit: Some(50), + }, + ) + .await + .expect("send ranked query"); + let (symbols, ranked) = match recv(&mut client).await.expect("recv ranked") { + ServerMessage::WorkspaceSymbolsResult { + symbols, ranked, .. 
+ } => (symbols, ranked), + other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), + }; + assert_eq!( + symbols.len(), + ranked.len(), + "ranked must parallel symbols; got {} vs {}", + symbols.len(), + ranked.len(), + ); + let exact = ranked + .iter() + .zip(symbols.iter()) + .find(|(_, s)| s.display_name == "Handler") + .map(|(r, _)| r) + .expect("expected Handler in ranked list"); + assert!( + matches!(exact.match_type, lip_core::query_graph::types::MatchType::Exact), + "Handler should be Exact match, got {:?}", + exact.match_type, + ); + assert!((exact.score - 1.0).abs() < 1e-6, "exact score must be 1.0"); + let prefix = ranked + .iter() + .zip(symbols.iter()) + .find(|(_, s)| s.display_name == "HandlerFactory") + .map(|(r, _)| r) + .expect("expected HandlerFactory in ranked list"); + assert!( + matches!(prefix.match_type, lip_core::query_graph::types::MatchType::Prefix), + "HandlerFactory should be Prefix match, got {:?}", + prefix.match_type, + ); + assert!( + (prefix.score - 0.8).abs() < 1e-6, + "prefix score must be 0.8" + ); + // Sorted: exact before prefix. + let exact_idx = symbols + .iter() + .position(|s| s.display_name == "Handler") + .unwrap(); + let prefix_idx = symbols + .iter() + .position(|s| s.display_name == "HandlerFactory") + .unwrap(); + assert!( + exact_idx < prefix_idx, + "Exact must sort before Prefix; got exact={exact_idx}, prefix={prefix_idx}", + ); + + // (2) kind_filter — only Function symbols. + send( + &mut client, + &ClientMessage::QueryWorkspaceSymbols { + kind_filter: Some(vec![SymbolKind::Function]), + scope: None, + modifier_filter: None, + query: String::new(), + limit: Some(50), + }, + ) + .await + .expect("send kind query"); + let symbols = match recv(&mut client).await.expect("recv kind") { + ServerMessage::WorkspaceSymbolsResult { symbols, .. 
} => symbols, + other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), + }; + assert!(!symbols.is_empty(), "expected at least one Function"); + assert!( + symbols.iter().all(|s| s.kind == SymbolKind::Function), + "kind_filter violated; got kinds {:?}", + symbols.iter().map(|s| s.kind).collect::>(), + ); + assert!( + symbols.iter().any(|s| s.display_name == "handle"), + "expected `handle` fn, got {:?}", + symbols.iter().map(|s| &s.display_name).collect::>(), + ); + + // (3) scope — only symbols defined under `lip://local/cli`. + send( + &mut client, + &ClientMessage::QueryWorkspaceSymbols { + kind_filter: None, + scope: Some("lip://local/cli".to_owned()), + modifier_filter: None, + query: String::new(), + limit: Some(50), + }, + ) + .await + .expect("send scope query"); + let symbols = match recv(&mut client).await.expect("recv scope") { + ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols, + other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), + }; + assert!( + symbols.iter().any(|s| s.display_name == "HandlerFactory"), + "scope must include HandlerFactory; got {:?}", + symbols.iter().map(|s| &s.display_name).collect::>(), + ); + assert!( + !symbols.iter().any(|s| s.display_name == "Handler"), + "scope must exclude `Handler` (wrong scope); got {:?}", + symbols.iter().map(|s| &s.display_name).collect::>(), + ); + assert!( + !symbols.iter().any(|s| s.display_name == "handle"), + "scope must exclude `handle` (wrong scope); got {:?}", + symbols.iter().map(|s| &s.display_name).collect::>(), + ); + + // (4) modifier_filter — only symbols with `async` modifier. + send( + &mut client, + &ClientMessage::QueryWorkspaceSymbols { + kind_filter: None, + scope: None, + modifier_filter: Some(vec!["async".to_owned()]), + query: "handle".to_owned(), + limit: Some(20), + }, + ) + .await + .expect("send modifier query"); + let symbols = match recv(&mut client).await.expect("recv modifier") { + ServerMessage::WorkspaceSymbolsResult { symbols, .. 
} => symbols, + other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), + }; + assert!( + !symbols.is_empty(), + "expected at least one async symbol matching `handle`" + ); + assert!( + symbols + .iter() + .all(|s| s.modifiers.iter().any(|m| m == "async")), + "modifier_filter violated; modifiers: {:?}", + symbols.iter().map(|s| &s.modifiers).collect::>(), + ); + + // (5) empty query → `ranked` is empty (legacy behavior preserved). + send( + &mut client, + &ClientMessage::QueryWorkspaceSymbols { + kind_filter: None, + scope: None, + modifier_filter: None, + query: String::new(), + limit: Some(10), + }, + ) + .await + .expect("send empty query"); + let ranked = match recv(&mut client).await.expect("recv empty") { + ServerMessage::WorkspaceSymbolsResult { ranked, .. } => ranked, + other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), + }; + assert!( + ranked.is_empty(), + "empty query must produce empty ranked (legacy behavior); got {} entries", + ranked.len(), + ); + + task.abort(); + let _ = task.await; +} diff --git a/tools/lip-cli/src/cmd/export.rs b/tools/lip-cli/src/cmd/export.rs index dd1e453..992f35b 100644 --- a/tools/lip-cli/src/cmd/export.rs +++ b/tools/lip-cli/src/cmd/export.rs @@ -3,7 +3,7 @@ use std::path::PathBuf; use clap::Args; use prost::Message; -use lip::schema::{OwnedDocument, OwnedSymbolInfo, Role, SymbolKind}; +use lip::schema::{OwnedDocument, OwnedSymbolInfo, ReferenceKind, Role, SymbolKind}; // Generated from src/proto/scip.proto by prost-build. 
#[allow(clippy::all)] @@ -127,15 +127,31 @@ fn convert_symbol_info(sym: &OwnedSymbolInfo) -> scip::SymbolInformation { .unwrap_or_default(), kind: lip_kind_to_scip(sym.kind) as i32, relationships, + enclosing_symbol: sym.container_name.clone().unwrap_or_default(), } } fn convert_occurrence(occ: &lip::schema::OwnedOccurrence) -> scip::Occurrence { - let role_bits = if occ.role == Role::Definition { + let mut role_bits = if occ.role == Role::Definition { scip::SymbolRole::Definition as i32 } else { scip::SymbolRole::UnspecifiedSymbolRole as i32 }; + // Preserve LIP ReferenceKind on export via SCIP symbol_roles bits (spec §10.2). + match occ.kind { + ReferenceKind::Write => role_bits |= scip::SymbolRole::WriteAccess as i32, + ReferenceKind::Read => role_bits |= scip::SymbolRole::ReadAccess as i32, + // Call/Type/Implements/Extends have no SCIP Occurrence-role equivalent; they + // round-trip via other channels (call edges, Relationships, type info). + ReferenceKind::Call + | ReferenceKind::Type + | ReferenceKind::Implements + | ReferenceKind::Extends + | ReferenceKind::Unknown => {} + } + if occ.is_test { + role_bits |= scip::SymbolRole::Test as i32; + } let range = vec![ occ.range.start_line, @@ -258,4 +274,59 @@ mod tests { assert_eq!(lip_kind_to_scip(lip), scip); } } + + fn occ_with(kind: ReferenceKind, is_test: bool, role: Role) -> lip::schema::OwnedOccurrence { + lip::schema::OwnedOccurrence { + symbol_uri: "lip://local/x".to_owned(), + range: lip::schema::OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 1, + }, + confidence_score: 20, + role, + override_doc: None, + kind, + is_test, + } + } + + #[test] + fn ref_kind_read_exports_read_access_bit() { + let out = convert_occurrence(&occ_with(ReferenceKind::Read, false, Role::Reference)); + assert!(out.symbol_roles & scip::SymbolRole::ReadAccess as i32 != 0); + assert!(out.symbol_roles & scip::SymbolRole::WriteAccess as i32 == 0); + assert!(out.symbol_roles & scip::SymbolRole::Test as i32 == 
0); + } + + #[test] + fn ref_kind_write_exports_write_access_bit() { + let out = convert_occurrence(&occ_with(ReferenceKind::Write, false, Role::Reference)); + assert!(out.symbol_roles & scip::SymbolRole::WriteAccess as i32 != 0); + assert!(out.symbol_roles & scip::SymbolRole::ReadAccess as i32 == 0); + } + + #[test] + fn is_test_exports_test_bit() { + let out = convert_occurrence(&occ_with(ReferenceKind::Read, true, Role::Reference)); + assert!(out.symbol_roles & scip::SymbolRole::Test as i32 != 0); + } + + #[test] + fn ref_kind_call_does_not_set_access_bits() { + // Call has no SCIP Occurrence-role equivalent; bits stay cleared. + let out = convert_occurrence(&occ_with(ReferenceKind::Call, false, Role::Reference)); + assert!(out.symbol_roles & scip::SymbolRole::ReadAccess as i32 == 0); + assert!(out.symbol_roles & scip::SymbolRole::WriteAccess as i32 == 0); + } + + #[test] + fn definition_with_write_kind_still_sets_definition() { + // Defensive: if an upstream caller somehow marks a def with a Write kind, + // both bits end up set. We don't actively strip kind on defs during export; + // the pair remains semantically consistent because SCIP permits combined bits. 
+ let out = convert_occurrence(&occ_with(ReferenceKind::Write, false, Role::Definition)); + assert!(out.symbol_roles & scip::SymbolRole::Definition as i32 != 0); + } } diff --git a/tools/lip-cli/src/cmd/import.rs b/tools/lip-cli/src/cmd/import.rs index 7849574..0ba3ff6 100644 --- a/tools/lip-cli/src/cmd/import.rs +++ b/tools/lip-cli/src/cmd/import.rs @@ -5,10 +5,12 @@ use prost::Message; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::net::UnixStream; +use lip::indexer::language::Language; use lip::query_graph::{ClientMessage, ServerMessage, Tier3Source}; use lip::schema::{ - sha256_hex, Action, OwnedDelta, OwnedDocument, OwnedEventStream, OwnedOccurrence, OwnedRange, - OwnedSymbolInfo, Role, SymbolKind, + normalize_signature, sha256_hex, visibility, Action, ExtractionTier, ModifiersSource, + OwnedDelta, OwnedDocument, OwnedEventStream, OwnedOccurrence, OwnedRange, OwnedSymbolInfo, + ReferenceKind, Role, SymbolKind, }; use crate::output; @@ -98,7 +100,7 @@ pub async fn run(args: ImportArgs) -> anyhow::Result<()> { let symbols: Vec = index .external_symbols .into_iter() - .map(|sym| convert_symbol_info(&sym, confidence)) + .map(|sym| convert_symbol_info(&sym, confidence, Language::Unknown)) .collect(); let doc = OwnedDocument { @@ -258,11 +260,12 @@ fn build_tier3_source(index: &scip::Index, scip_path: &std::path::Path) -> Tier3 fn convert_document(doc: scip::Document, confidence: u8) -> OwnedDelta { let uri = format!("file:///{}", doc.relative_path.trim_start_matches('/')); let content_hash = sha256_hex(doc.relative_path.as_bytes()); + let lang = scip_language_to_lip(&doc.language); let symbols: Vec = doc .symbols .iter() - .map(|s| convert_symbol_info(s, confidence)) + .map(|s| convert_symbol_info(s, confidence, lang)) .collect(); let occurrences: Vec = doc @@ -293,7 +296,11 @@ fn convert_document(doc: scip::Document, confidence: u8) -> OwnedDelta { } } -fn convert_symbol_info(sym: &scip::SymbolInformation, confidence: u8) -> OwnedSymbolInfo { +fn 
convert_symbol_info( + sym: &scip::SymbolInformation, + confidence: u8, + lang: Language, +) -> OwnedSymbolInfo { let display = if sym.display_name.is_empty() { // Fall back to the last descriptor segment of the symbol string. sym.symbol @@ -314,6 +321,32 @@ fn convert_symbol_info(sym: &scip::SymbolInformation, confidence: u8) -> OwnedSy // SCIP private symbols begin with "local "; everything else is exported. let is_exported = !sym.symbol.starts_with("local "); + + // v2.3 structural metadata. Upstream SCIP carries `enclosing_symbol` but no + // canonical modifier field, so we always reconstruct modifiers by parsing + // the signature prefix and tag the source accordingly. CKB discounts + // confidence on `PrefixParse` when merging imports with Tier-1 results. + let modifiers = signature + .as_deref() + .map(|s| parse_modifiers_from_signature(s, lang)) + .unwrap_or_default(); + + let (vis, vis_confidence) = if matches!(lang, Language::Unknown) { + // Without a language we cannot apply name/keyword rules; skip inference. + (None, None) + } else { + let (v, c) = visibility::infer(&display, &modifiers, lang); + (Some(v), Some(c as f32 / 100.0)) + }; + + let signature_normalized = signature + .as_deref() + .map(|s| normalize_signature(s, lang)); + + let container_name = Some(sym.enclosing_symbol.clone()) + .filter(|s| !s.is_empty()) + .map(|enc| scip_enclosing_to_display(&enc)); + OwnedSymbolInfo { uri: scip_symbol_to_lip_uri(&sym.symbol), display_name: display, @@ -327,7 +360,116 @@ fn convert_symbol_info(sym: &scip::SymbolInformation, confidence: u8) -> OwnedSy taint_labels: vec![], blast_radius: 0, is_exported, + signature_normalized, + modifiers, + visibility: vis, + container_name, + visibility_confidence: vis_confidence, + extraction_tier: ExtractionTier::Tier3Scip, + modifiers_source: Some(ModifiersSource::PrefixParse), + ..Default::default() + } +} + +/// Map a SCIP `Document.language` string to a LIP [`Language`]. 
+/// +/// Returns [`Language::Unknown`] when the SCIP index omits the language field or +/// uses a value we do not handle. Matches the casing upstream producers emit +/// (e.g. scip-typescript emits `"TypeScript"`). +fn scip_language_to_lip(lang: &str) -> Language { + match lang.to_ascii_lowercase().as_str() { + "rust" => Language::Rust, + "typescript" | "tsx" => Language::TypeScript, + "javascript" | "javascriptreact" | "jsx" => Language::JavaScript, + "python" => Language::Python, + "dart" => Language::Dart, + "go" => Language::Go, + "kotlin" => Language::Kotlin, + "swift" => Language::Swift, + "c" => Language::C, + "cpp" | "c++" | "cxx" => Language::Cpp, + _ => Language::Unknown, + } +} + +/// Parse leading modifier keywords from a signature prefix (SCIP-native docs). +fn parse_modifiers_from_signature(signature: &str, lang: Language) -> Vec { + let keywords: &[&str] = match lang { + Language::Rust => &["pub", "const", "async", "unsafe", "extern", "static", "mut"], + Language::TypeScript + | Language::JavaScript + | Language::JavaScriptReact => &[ + "export", "default", "async", "static", "readonly", "public", "private", "protected", + "abstract", "declare", "override", + ], + Language::Dart => &[ + "static", "abstract", "final", "const", "external", "factory", "late", "covariant", + ], + Language::Kotlin => &[ + "private", "protected", "internal", "public", "abstract", "final", "open", "override", + "suspend", "inline", "external", "data", "sealed", "companion", "lateinit", "const", + "operator", "infix", "tailrec", + ], + Language::Swift => &[ + "private", "fileprivate", "internal", "public", "open", "static", "final", "override", + "mutating", "nonmutating", "required", "convenience", "lazy", "weak", "unowned", + "dynamic", + ], + Language::C | Language::Cpp => &[ + "static", "extern", "const", "virtual", "override", "explicit", "inline", "constexpr", + "private", "protected", "public", "friend", "mutable", "volatile", + ], + // Python / Go / Unknown: 
prefix-parse not meaningful; rely on name rules. + _ => &[], + }; + + let mut out = Vec::new(); + let mut rest = signature.trim_start(); + loop { + if matches!(lang, Language::Rust) && rest.starts_with("pub(") { + if let Some(close) = rest.find(')') { + out.push(rest[..=close].to_owned()); + rest = rest[close + 1..].trim_start(); + continue; + } + } + let end = rest + .find(|c: char| c.is_whitespace() || c == '(' || c == '<' || c == ':') + .unwrap_or(rest.len()); + if end == 0 { + break; + } + let tok = &rest[..end]; + if keywords.contains(&tok) { + out.push(tok.to_owned()); + rest = rest[end..].trim_start(); + } else { + break; + } + } + out +} + +/// Convert a SCIP `enclosing_symbol` string to a human-readable container name. +/// +/// SCIP encodes enclosing symbols the same way as regular `symbol` strings +/// (e.g. `scip-typescript npm react 18.2.0 React#Component.`); the trailing +/// descriptor is what a human calls the class/namespace. +fn scip_enclosing_to_display(enc: &str) -> String { + // splitn(5): keep the descriptor intact even when it contains spaces. + let parts: Vec<&str> = enc.splitn(5, ' ').collect(); + if parts.len() == 5 { + let desc = parts[4]; + let cleaned = desc.trim_end_matches(['#', '.', '/', '`', ' ']); + let last = cleaned + .rsplit(['#', '.', '/', '`']) + .next() + .unwrap_or(cleaned); + if !last.is_empty() { + return last.to_owned(); + } } + enc.to_owned() } /// Split SCIP documentation entries into a `(signature, doc_comment)` pair. @@ -369,6 +511,7 @@ fn looks_like_signature(s: &str) -> bool { // Java, Go, Dart, and Kotlin. 
const SIG_PREFIXES: &[&str] = &[ "pub ", + "pub(", "fn ", "async fn ", "pub fn ", @@ -401,11 +544,24 @@ fn looks_like_signature(s: &str) -> bool { fn convert_occurrence(occ: &scip::Occurrence) -> Option { let range = parse_scip_range(&occ.range)?; - let role = if occ.symbol_roles & (scip::SymbolRole::Definition as i32) != 0 { + let roles = occ.symbol_roles; + let role = if roles & (scip::SymbolRole::Definition as i32) != 0 { Role::Definition } else { Role::Reference }; + // SCIP symbol_roles bits map to LIP ReferenceKind (spec §10.2). + // Write takes precedence over Read if both bits are set. + let kind = if matches!(role, Role::Definition) { + ReferenceKind::Unknown + } else if roles & (scip::SymbolRole::WriteAccess as i32) != 0 { + ReferenceKind::Write + } else if roles & (scip::SymbolRole::ReadAccess as i32) != 0 { + ReferenceKind::Read + } else { + ReferenceKind::Unknown + }; + let is_test = roles & (scip::SymbolRole::Test as i32) != 0; Some(OwnedOccurrence { symbol_uri: scip_symbol_to_lip_uri(&occ.symbol), @@ -417,6 +573,8 @@ fn convert_occurrence(occ: &scip::Occurrence) -> Option { } else { Some(occ.override_documentation.join("\n")) }, + kind, + is_test, }) } @@ -635,4 +793,231 @@ mod tests { assert!(sig4.is_none()); assert!(doc4.is_none()); } + + // ── v2.3 SCIP importer enrichment ───────────────────────────────────────── + + use lip::schema::{ModifiersSource, Visibility}; + + fn sym_with(lang: &str, proto_sym: scip::SymbolInformation) -> OwnedSymbolInfo { + convert_symbol_info(&proto_sym, 90, scip_language_to_lip(lang)) + } + + #[test] + fn scip_language_known_values_round_trip() { + assert!(matches!(scip_language_to_lip("rust"), Language::Rust)); + assert!(matches!( + scip_language_to_lip("TypeScript"), + Language::TypeScript + )); + assert!(matches!(scip_language_to_lip("Python"), Language::Python)); + assert!(matches!(scip_language_to_lip(""), Language::Unknown)); + } + + #[test] + fn scip_rust_import_populates_v23_fields() { + let proto = 
scip::SymbolInformation { + symbol: "rust-analyzer cargo my_crate 1.0 my_mod/foo().".to_owned(), + display_name: "foo".to_owned(), + documentation: vec!["pub async fn foo(x: i32) -> Bar".to_owned()], + relationships: vec![], + kind: scip::Kind::KFunction as i32, + enclosing_symbol: String::new(), + }; + let out = sym_with("rust", proto); + assert_eq!(out.modifiers, vec!["pub".to_owned(), "async".to_owned()]); + assert_eq!(out.visibility, Some(Visibility::Public)); + assert_eq!(out.visibility_confidence, Some(1.0)); + assert_eq!(out.extraction_tier, ExtractionTier::Tier3Scip); + assert_eq!(out.modifiers_source, Some(ModifiersSource::PrefixParse)); + assert_eq!( + out.signature_normalized.as_deref(), + Some("pub async fn foo(_: i32) -> Bar") + ); + } + + #[test] + fn scip_enclosing_symbol_becomes_container() { + let proto = scip::SymbolInformation { + symbol: "scip-typescript npm react 18.2.0 src/App.ts`render().".to_owned(), + display_name: "render".to_owned(), + documentation: vec!["render(): void".to_owned()], + relationships: vec![], + kind: scip::Kind::KMethod as i32, + enclosing_symbol: + "scip-typescript npm react 18.2.0 src/App.ts`App#".to_owned(), + }; + let out = sym_with("typescript", proto); + // Last descriptor segment of the enclosing symbol becomes container name. + assert_eq!(out.container_name.as_deref(), Some("App")); + } + + #[test] + fn scip_unknown_language_skips_inference() { + let proto = scip::SymbolInformation { + symbol: "x y z 1.0 Foo.".to_owned(), + display_name: "Foo".to_owned(), + documentation: vec![], + relationships: vec![], + kind: scip::Kind::KClass as i32, + enclosing_symbol: String::new(), + }; + let out = sym_with("", proto); + // No signature and no language → no modifiers, no visibility inference. 
+ assert!(out.modifiers.is_empty()); + assert!(out.visibility.is_none()); + assert!(out.visibility_confidence.is_none()); + // Source is still marked as PrefixParse — absence of a proto modifier + // field is the whole reason we tag it that way. + assert_eq!(out.modifiers_source, Some(ModifiersSource::PrefixParse)); + } + + #[test] + fn scip_python_uses_name_convention() { + let proto = scip::SymbolInformation { + symbol: "scip-python pypi mylib 1.0 _helper().".to_owned(), + display_name: "_helper".to_owned(), + documentation: vec!["def _helper(x: int) -> None".to_owned()], + relationships: vec![], + kind: scip::Kind::KFunction as i32, + enclosing_symbol: String::new(), + }; + let out = sym_with("python", proto); + assert_eq!(out.visibility, Some(Visibility::Private)); + // Python modifier keywords are not meaningful in our prefix-parse set, + // so the list stays empty — visibility comes from the name rule. + assert!(out.modifiers.is_empty()); + } + + #[test] + fn scip_pub_crate_is_internal() { + let proto = scip::SymbolInformation { + symbol: "rust-analyzer cargo crate 1.0 foo().".to_owned(), + display_name: "foo".to_owned(), + documentation: vec!["pub(crate) fn foo()".to_owned()], + relationships: vec![], + kind: scip::Kind::KFunction as i32, + enclosing_symbol: String::new(), + }; + let out = sym_with("rust", proto); + assert_eq!(out.modifiers, vec!["pub(crate)".to_owned()]); + assert_eq!(out.visibility, Some(Visibility::Internal)); + } + + /// Wire-format round-trip: encode a `scip::Index` containing + /// `enclosing_symbol` (field 8, the only upstream-compatible v2.3 field we + /// added to the proto) and verify it decodes back and flows into + /// `container_name` on the LIP side. This guards against a future proto + /// edit that drops or renumbers the field. 
+ #[test] + fn scip_enclosing_symbol_survives_wire_round_trip() { + let proto = scip::Index { + metadata: Some(scip::Metadata { + version: scip::ProtocolVersion::UnspecifiedProtocolVersion as i32, + tool_info: Some(scip::ToolInfo { + name: "test".to_owned(), + version: "0.0".to_owned(), + arguments: vec![], + }), + project_root: String::new(), + text_document_encoding: scip::TextEncoding::Utf8 as i32, + }), + documents: vec![scip::Document { + language: "rust".to_owned(), + relative_path: "src/mod.rs".to_owned(), + occurrences: vec![], + symbols: vec![scip::SymbolInformation { + symbol: "rust-analyzer cargo mycrate 1.0 Mod/bar().".to_owned(), + display_name: "bar".to_owned(), + documentation: vec!["pub fn bar()".to_owned()], + relationships: vec![], + kind: scip::Kind::KMethod as i32, + enclosing_symbol: + "rust-analyzer cargo mycrate 1.0 Mod/MyStruct#".to_owned(), + }], + }], + external_symbols: vec![], + }; + + let mut buf = vec![]; + prost::Message::encode(&proto, &mut buf).expect("prost encode"); + let decoded = ::decode(&buf[..]).expect("prost decode"); + + let doc = &decoded.documents[0]; + let sym = &doc.symbols[0]; + assert_eq!( + sym.enclosing_symbol, + "rust-analyzer cargo mycrate 1.0 Mod/MyStruct#", + "field 8 enclosing_symbol lost across prost encode/decode" + ); + + let out = convert_symbol_info(sym, 90, scip_language_to_lip(&doc.language)); + assert_eq!( + out.container_name.as_deref(), + Some("MyStruct"), + "decoded enclosing_symbol did not populate container_name" + ); + assert_eq!(out.extraction_tier, ExtractionTier::Tier3Scip); + assert_eq!(out.modifiers_source, Some(ModifiersSource::PrefixParse)); + } + + fn occ_with_roles(roles: i32) -> scip::Occurrence { + scip::Occurrence { + range: vec![1, 0, 1, 5], + symbol: "rust-analyzer cargo c 1.0 m/x.".to_owned(), + symbol_roles: roles, + override_documentation: vec![], + ..Default::default() + } + } + + #[test] + fn scip_read_access_maps_to_ref_kind_read() { + let o = 
convert_occurrence(&occ_with_roles(scip::SymbolRole::ReadAccess as i32)) + .expect("occurrence converts"); + assert_eq!(o.role, Role::Reference); + assert_eq!(o.kind, ReferenceKind::Read); + assert!(!o.is_test); + } + + #[test] + fn scip_write_access_maps_to_ref_kind_write() { + let o = convert_occurrence(&occ_with_roles(scip::SymbolRole::WriteAccess as i32)) + .expect("occurrence converts"); + assert_eq!(o.kind, ReferenceKind::Write); + } + + #[test] + fn scip_write_wins_over_read_when_both_set() { + let bits = + scip::SymbolRole::WriteAccess as i32 | scip::SymbolRole::ReadAccess as i32; + let o = convert_occurrence(&occ_with_roles(bits)).expect("occurrence converts"); + assert_eq!(o.kind, ReferenceKind::Write); + } + + #[test] + fn scip_test_bit_sets_is_test() { + let bits = + scip::SymbolRole::Test as i32 | scip::SymbolRole::ReadAccess as i32; + let o = convert_occurrence(&occ_with_roles(bits)).expect("occurrence converts"); + assert!(o.is_test); + assert_eq!(o.kind, ReferenceKind::Read); + } + + #[test] + fn scip_definition_keeps_kind_unknown() { + // SCIP does not set read/write on definitions; kind stays Unknown. 
+ let bits = + scip::SymbolRole::Definition as i32 | scip::SymbolRole::WriteAccess as i32; + let o = convert_occurrence(&occ_with_roles(bits)).expect("occurrence converts"); + assert_eq!(o.role, Role::Definition); + assert_eq!(o.kind, ReferenceKind::Unknown); + } + + #[test] + fn scip_unspecified_role_leaves_kind_unknown() { + let o = convert_occurrence(&occ_with_roles(0)).expect("occurrence converts"); + assert_eq!(o.role, Role::Reference); + assert_eq!(o.kind, ReferenceKind::Unknown); + assert!(!o.is_test); + } } diff --git a/tools/lip-cli/src/cmd/mcp.rs b/tools/lip-cli/src/cmd/mcp.rs index 3f43472..20fba76 100644 --- a/tools/lip-cli/src/cmd/mcp.rs +++ b/tools/lip-cli/src/cmd/mcp.rs @@ -111,6 +111,9 @@ async fn daemon_call(name: &str, args: &Value, socket: &Path) -> anyhow::Result< "lip_workspace_symbols" => ClientMessage::QueryWorkspaceSymbols { query: req_str(args, "query")?, limit: args["limit"].as_u64().map(|n| n as usize).or(Some(50)), + kind_filter: None, + scope: None, + modifier_filter: None, }, "lip_definition" => ClientMessage::QueryDefinition { uri: req_str(args, "uri")?, @@ -383,7 +386,7 @@ fn format_response(tool: &str, msg: &ServerMessage) -> String { out } - ServerMessage::WorkspaceSymbolsResult { symbols } => { + ServerMessage::WorkspaceSymbolsResult { symbols, .. 
} => { if symbols.is_empty() { return "No symbols found.".into(); } diff --git a/tools/lip-cli/src/cmd/query.rs b/tools/lip-cli/src/cmd/query.rs index 5b1d540..a54caf8 100644 --- a/tools/lip-cli/src/cmd/query.rs +++ b/tools/lip-cli/src/cmd/query.rs @@ -129,6 +129,9 @@ pub async fn run(args: QueryArgs) -> anyhow::Result<()> { QueryKind::Symbols { query, limit } => ClientMessage::QueryWorkspaceSymbols { query, limit: Some(limit), + kind_filter: None, + scope: None, + modifier_filter: None, }, QueryKind::DeadSymbols { limit } => ClientMessage::QueryDeadSymbols { limit: Some(limit) }, QueryKind::Similar { query, limit } => ClientMessage::SimilarSymbols { query, limit }, diff --git a/tools/lip-cli/src/proto/scip.proto b/tools/lip-cli/src/proto/scip.proto index ef77107..04ec144 100644 --- a/tools/lip-cli/src/proto/scip.proto +++ b/tools/lip-cli/src/proto/scip.proto @@ -47,6 +47,9 @@ message SymbolInformation { repeated Relationship relationships = 4; Kind kind = 5; string display_name = 6; + // Upstream-compatible (SCIP 0.3+): the enclosing symbol string. Used as + // container_name in LIP. Empty when absent. + string enclosing_symbol = 8; } // Symbol kind — values prefixed with K_ to avoid conflict with other enums. From f7f538cf543e1e0d2e4b6eadbaf8d9130b1683fd Mon Sep 17 00:00:00 2001 From: Lisa Date: Tue, 21 Apr 2026 14:10:47 +0200 Subject: [PATCH 10/18] =?UTF-8?q?feat:=20v2.3.1=20=E2=80=94=20CKB=20import?= =?UTF-8?q?-landing=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Converge client and daemon URI conventions and back-fill call edges when SCIP imports omit them. Fixes the "lip import prints success but every file shows indexed: false" bug reported against v2.3.0. 
- RegisterProjectRoot message + daemon canonicalizes lip://local/ against registered roots (longest-first); capability advertised in HandshakeResult.supported_messages - EdgesSource provenance on EnrichedBlastRadius (Tier1 | ScipWithTier1Edges | ScipOnly | Empty) so CKB can route around files LIP has no structural edges for - upsert_file_precomputed reads the file from disk and runs tier-1 when the incoming SCIP document has empty edges - lip import emits canonical lip://local/// (or lip://local/ when Metadata.project_root is absent), replacing the old file:/// form that silently mismatched CKB queries - lip import --verify round-trips up to 10 sampled documents after push and exits non-zero on any mismatch Bumps workspace version 2.2.0 → 2.3.1; v2.3.0 features and v2.3.1 fixes ship in the same release. Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 50 ++++ Cargo.lock | 6 +- Cargo.toml | 2 +- README.md | 2 +- bindings/rust/src/daemon/session.rs | 19 ++ bindings/rust/src/query_graph/db.rs | 282 ++++++++++++++++++++--- bindings/rust/src/query_graph/types.rs | 71 ++++++ bindings/rust/tests/integration.rs | 306 +++++++++++++++++++++++++ docs/LIP_SPEC.mdx | 159 ++++++++++++- tools/lip-cli/src/cmd/import.rs | 238 ++++++++++++++++++- 10 files changed, 1098 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1379e53..18a5c16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,56 @@ All notable changes to this project are documented here. ## [Unreleased] +## [2.3.1] — 2026-04-21 + +**CKB import landing fix.** Addresses the "`lip import --push-to-daemon` prints success but every file shows `indexed: false`" class of bug by making client- and daemon-side URI conventions converge, and by back-filling call edges when SCIP imports carry none. `protocol_version` stays at `2`; every change is either additive or limited to CLI behaviour. 
+ +### Added + +- **`RegisterProjectRoot { root: String }`** — idempotent client message that registers a filesystem root with the daemon. The daemon uses its registered roots to resolve relative `lip://local/<rel-path>` URIs against the absolute-form records emitted by the tier-1 indexer and by `lip import` (`lip://local//<abs-root>/<rel-path>`). Longest matching root wins when multiple are registered. Advertised as `register_project_root` in `HandshakeResult.supported_messages`; pre-v2.3.1 daemons reply `UnknownMessage`, in which case clients must send absolute URIs. `RegisterTier3Source` now auto-registers its `source.project_root` when non-empty, so SCIP imports that carry a project root need no second round-trip. + +- **`EdgesSource` provenance on blast radius results** — `EnrichedBlastRadius` gains `edges_source: Option<EdgesSource>` with four variants (`Tier1 | ScipWithTier1Edges | ScipOnly | Empty`). Consumers that maintain their own fallback path (e.g. CKB's native SCIP backend) can now detect when LIP has no structural edges for a file and route around us. `#[serde(default, skip_serializing_if = "Option::is_none")]`, so the field is invisible to pre-v2.3.1 clients. + +- **Tier-1 edge back-fill on SCIP imports** — `upsert_file_precomputed` now falls back to running the tree-sitter tier-1 extractor over the file on disk when the incoming SCIP document has no call edges. Produces `edges_source = ScipWithTier1Edges` when the fallback succeeds, `Empty` when the file is unreadable or yields no calls. Fills the gap where `scip-go` inconsistently emits call edges and `scip-clang` omits them entirely. + +- **`lip import --verify`** — after pushing deltas, samples up to 10 documents and round-trips `QueryFileStatus` (expecting `indexed = true`) plus a `QueryWorkspaceSymbols` probe scoped to the file when the document carries an exported `Function` / `Class` / `Interface` definition. Exits non-zero on any mismatch so CI catches silent import drops instead of printing success and returning 0.
Requires `--push-to-daemon`. + +### Changed + +- **`lip import` URI scheme** — imported documents now use `lip://local/` (when `Metadata.project_root` is absent) or `lip://local//` with the canonical doubled slash (when it is present), replacing the previous `file:///` form that silently failed to match any CKB query. Requires CKB to call `RegisterProjectRoot` for its workspace root before querying by relative path. + +- **`LipDatabase::canonicalize_uri`** — every public query- and mutation-surface method now canonicalises its URI argument through `registered_roots` before hitting the input/embedding/def/sym/occ maps, so relative and absolute lip-local forms of the same file resolve to the same record. Non-lip-local URIs (`file://…`, `scip://external`, bare paths) are returned unchanged. + +### Fixed + +- **CKB "printed success but nothing landed"** — the combined effect of the import URI change + `RegisterProjectRoot` + daemon-side canonicalisation means `lip import` records are now discoverable by a CKB client that queries `lip://local/`, which was the class of bug reported after v2.3.0. + +--- + +## [2.3.0] — 2026-04-21 + +**CKB structural-parity bundle.** Five additive features so CKB (and any other consumer) can retire its duplicate SCIP parser and query LIP for everything structural. `protocol_version` stays at `2`; every new field is `#[serde(default, skip_serializing_if = …)]` and every new message is advertised via `HandshakeResult.supported_messages`, so older clients see no change. + +### Added + +- **Rich symbol metadata** — `OwnedSymbolInfo` now carries `signature_normalized`, `modifiers: Vec`, `visibility: Option` + `visibility_confidence: Option`, `container_name: Option`, `extraction_tier: ExtractionTier`, and `modifiers_source: Option`. Tier-1 extractors populate the structural fields for Rust / TypeScript / Python / Swift / Kotlin (`extraction_tier = Tier1`, `modifiers_source = None`). 
The SCIP importer (`lip import --from-scip`) parses upstream-compatible `enclosing_symbol = 8` and derives modifiers via prefix-parse (`extraction_tier = Tier3Scip`, `modifiers_source = PrefixParse`). Wire round-trip covered end-to-end by `daemon_tier1_emits_v23_metadata` and `daemon_precomputed_preserves_v23_metadata`. + +- **Reference classification** — `OwnedOccurrence` gains `kind: ReferenceKind` (`Unknown` / `Call` / `Read` / `Write` / `Type` / `Implements` / `Extends`) and `is_test: bool`. Tier-1 classifier in `symbol_extractor::classify_ref_kind` uses tree-sitter parent-node and field-name lookup — call / method targets → `Call`, assignment-LHS → `Write`, otherwise `Read`; `is_test` stamps occurrences from paths under `/tests/`, `_test.rs`, `_test.py`, `.spec.ts`, etc. SCIP import/export round-trips via `SymbolRole::ReadAccess | WriteAccess | Test` (Call has no SCIP equivalent and maps to `Unknown` on re-export). + +- **`QueryBlastRadiusSymbol { symbol_uri, min_score?: f32 } → BlastRadiusSymbolResult { result: Option<EnrichedBlastRadius> }`** — single-symbol wrapper around `blast_radius_for_symbol`. Resolves the symbol's `def_index` entry, runs the same structural BFS + semantic-enrichment loop as `QueryBlastRadiusBatch`, and returns `None` for unknown or unindexed symbols so the caller can distinguish "zero impact" from "no data." Safe inside `BatchQuery`. + +- **`QueryOutgoingCalls { symbol_uri, depth: u32 } → OutgoingCallsResult { edges: Vec<OutgoingCallEdge>, truncated: bool }`** — forward call-graph traversal. New `caller_to_callees: HashMap<String, Vec<String>>` index mirrors the existing reverse map, populated in both `upsert_file` and `upsert_file_precomputed` and cleaned in `remove_file_call_edges`. BFS is depth-clamped to `[1, 8]` with `NODE_LIMIT = 200`; the `truncated` flag reports when the cap fired. Safe inside `BatchQuery`.
+ +- **Ranked & filtered workspace symbols** — `QueryWorkspaceSymbols` adds three optional filters (`kind_filter: Option<Vec<SymbolKind>>`, `scope: Option<String>`, `modifier_filter: Option<Vec<String>>`); `WorkspaceSymbolsResult` adds `ranked: Vec<RankedSymbol>` (parallel to `symbols`). Tiered scoring: `Exact = 1.0`, case-insensitive `Prefix = 0.8`, case-insensitive substring `Fuzzy = 0.5` — not BM25. `MatchType` is a discriminator only, not a ranking signal; callers sort by `score`. `ranked` is `skip_if_empty`, and an empty query preserves pre-v2.3 behaviour (empty `ranked`). Pre-v2.3 clients that pattern-match `{ symbols }` keep working unchanged. + +- **`OutgoingCallEdge`, `RankedSymbol`, `MatchType`** — new public types in `lip_core::query_graph::types`. + +- **Drift guard** — the two new client messages (`query_blast_radius_symbol`, `query_outgoing_calls`) are registered in both `supported_messages()` and `variant_tag()`; the `supported_messages_covers_all_variants` test now covers them. + +### Changed + +- **`LipDatabase::workspace_symbols`** now delegates to `workspace_symbols_ranked`; the old signature is preserved for callers that do not need the ranked tier (LSP bridge, MCP adapter, legacy CLI). No behaviour change for existing callers.
+ --- ## [2.2.0] — 2026-04-21 diff --git a/Cargo.lock b/Cargo.lock index 31fbf39..032c4cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1074,7 +1074,7 @@ checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "lip-cli" -version = "2.2.0" +version = "2.3.1" dependencies = [ "anyhow", "clap", @@ -1093,7 +1093,7 @@ dependencies = [ [[package]] name = "lip-core" -version = "2.2.0" +version = "2.3.1" dependencies = [ "anyhow", "criterion", @@ -1130,7 +1130,7 @@ dependencies = [ [[package]] name = "lip-registry" -version = "2.2.0" +version = "2.3.1" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 1e5eeb9..e2055c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ ] [workspace.package] -version = "2.2.0" +version = "2.3.1" edition = "2021" rust-version = "1.78" authors = ["Lisa Welsch "] diff --git a/README.md b/README.md index c1e2cdf..838e49b 100644 --- a/README.md +++ b/README.md @@ -379,7 +379,7 @@ Requires Rust 1.78+. No system `protoc` required. ## Status -v2.1 — `StreamContext` (token-budgeted RAG context streaming): callers stream symbols ranked by relevance to a cursor and stop reading when the prompt budget is full instead of fetching top-k and locally truncating; `protocol_version` bumped to `2`. v2.0 — `ExplainMatch` (chunk-level explanation: which lines in a result file drove the match), model provenance (`FileStatus` exposes the embedding model per file; `IndexStatus` warns when the index contains mixed-model vectors). v1.9: `filter` glob + `min_score` on all NN calls, `GetCentroid`, `QueryStaleEmbeddings`. v1.8: `FindBoundaries`, `SemanticDiff`, `QueryNearestInStore` (cross-repo federation), `QueryNoveltyScore`, `ExtractTerminology`, `PruneDeleted`. v1.7: 6 semantic retrieval primitives. v1.6: `ReindexFiles`, `Similarity`, `QueryExpansion`, `Cluster`, `ExportEmbeddings`. Wire format is JSON. 
+v2.3.1 — CKB import-landing fix: `RegisterProjectRoot` + daemon-side canonical URI resolution, `EdgesSource` provenance on blast radius results, tier-1 edge back-fill when SCIP imports carry none, `lip import` emits canonical `lip://local///` URIs, and `lip import --verify` round-trips a sample after push. v2.3 — CKB structural-parity bundle: rich symbol metadata (`signature_normalized`, `modifiers`, `visibility`, `container_name`, `extraction_tier`, `modifiers_source`), reference classification (`ReferenceKind` + `is_test` on every occurrence), `QueryBlastRadiusSymbol` (single-symbol blast radius), `QueryOutgoingCalls` (forward call-graph BFS), and ranked / filtered `QueryWorkspaceSymbols` (`kind_filter`, `scope`, `modifier_filter`, tiered `ranked` output). All additive; `protocol_version` stays at `2`. v2.2 — function-level blast radius, `ReindexStale`, `BatchFileStatus`, `QueryAbiHash`, Tier 1.5 Datalog inference, Tier 2 exponential backoff. v2.1 — `StreamContext` (token-budgeted RAG context streaming); `protocol_version` bumped to `2`. v2.0 — `ExplainMatch`, model provenance on embeddings. v1.9: `filter` glob + `min_score` on all NN calls, `GetCentroid`, `QueryStaleEmbeddings`. v1.8: `FindBoundaries`, `SemanticDiff`, `QueryNearestInStore` (cross-repo federation), `QueryNoveltyScore`, `ExtractTerminology`, `PruneDeleted`. v1.7: 6 semantic retrieval primitives. v1.6: `ReindexFiles`, `Similarity`, `QueryExpansion`, `Cluster`, `ExportEmbeddings`. Wire format is JSON. --- diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index 096e6ce..34f44db 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -1640,6 +1640,10 @@ impl Session { // ── v2.1: Tier 3 provenance registration ────────────────────── ClientMessage::RegisterTier3Source { source } => { let mut db = self.db.lock().await; + // Auto-register the source's project root for URI resolution (v2.3.1). 
+ if !source.project_root.is_empty() { + db.register_project_root(&source.project_root); + } db.register_tier3_source(source); ServerMessage::DeltaAck { seq: 0, @@ -1648,6 +1652,17 @@ impl Session { } } + // ── v2.3.1: standalone project-root registration ────────────── + ClientMessage::RegisterProjectRoot { root } => { + let mut db = self.db.lock().await; + db.register_project_root(&root); + ServerMessage::DeltaAck { + seq: 0, + accepted: true, + error: None, + } + } + // ── v2.2 features ───────────────────────────────────────────── ClientMessage::ReindexStale { uris, @@ -2533,6 +2548,10 @@ fn process_query_sync( err("RegisterTier3Source is a mutation; not permitted in BatchQuery") } + ClientMessage::RegisterProjectRoot { .. } => { + err("RegisterProjectRoot is a mutation; not permitted in BatchQuery") + } + // ── v2.2: new variants ─────────────────────────────────────────────── ClientMessage::ReindexStale { .. } => { err("ReindexStale requires filesystem I/O; not permitted in BatchQuery") diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index 34d13f2..0fd77e1 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -13,8 +13,8 @@ use std::sync::Arc; use crate::indexer::{language::Language, Tier1Indexer}; use crate::query_graph::types::{ - ApiSurface, BlastRadiusResult, EnrichedBlastRadius, ImpactItem, ImpactSource, RiskLevel, - SemanticImpactItem, SimilarSymbol, + ApiSurface, BlastRadiusResult, EdgesSource, EnrichedBlastRadius, ImpactItem, ImpactSource, + RiskLevel, SemanticImpactItem, SimilarSymbol, }; use crate::schema::EdgeKind; use crate::schema::{ @@ -87,6 +87,37 @@ fn trigram_similarity(a: &str, b: &str) -> f32 { intersection as f32 / union as f32 } +/// Normalise a project-root string to an absolute filesystem path (v2.3.1). +/// +/// Accepts `file:///abs`, `lip://local/abs`, and bare `/abs` forms. Trailing +/// slashes are stripped. 
Returns `""` for unrecognised inputs or empty roots. +fn normalise_root(raw: &str) -> String { + let stripped = if let Some(rest) = raw.strip_prefix("file://") { + // file:///abs — the first '/' is part of the path + let trimmed = rest.trim_start_matches('/'); + format!("/{trimmed}") + } else if let Some(rest) = raw.strip_prefix("lip://local") { + // lip://local/abs → /abs; handle lip://local (no slash) as "" + let trimmed = rest.trim_start_matches('/'); + if trimmed.is_empty() { + String::new() + } else { + format!("/{trimmed}") + } + } else if raw.starts_with('/') { + raw.to_owned() + } else { + String::new() + }; + let trimmed = stripped.trim_end_matches('/'); + // Never leave the bare string "" ambiguous with root "/". Empty input → empty root. + if trimmed.is_empty() && !stripped.starts_with('/') { + String::new() + } else { + trimmed.to_owned() + } +} + // ─── Internal types ─────────────────────────────────────────────────────────── #[derive(Debug)] @@ -187,6 +218,17 @@ pub struct LipDatabase { /// `QueryIndexStatus` so clients can implement their own staleness /// policy; the daemon never reasons about freshness itself. tier3_sources: HashMap, + /// Absolute filesystem paths registered as project roots (v2.3.1). + /// Used by [`LipDatabase::canonicalize_uri`] to resolve relative + /// `lip://local/` URIs from clients against absolute keys + /// produced by SCIP import. Populated by `RegisterProjectRoot` and + /// implicitly by `RegisterTier3Source` when `project_root` is set. + /// Resolution prefers the longest prefix when multiple roots match. + registered_roots: HashSet, + /// Per-file provenance of call edges (v2.3.1). Surfaced back to + /// clients through [`crate::query_graph::types::EnrichedBlastRadius::edges_source`] + /// so CKB can decide whether to fall back to its own SCIP backend. 
+ file_edges_source: HashMap, } impl LipDatabase { @@ -215,7 +257,94 @@ impl LipDatabase { symbol_embedding_models: HashMap::new(), file_indexed_at: HashMap::new(), tier3_sources: HashMap::new(), + registered_roots: HashSet::new(), + file_edges_source: HashMap::new(), + } + } + + // ── v2.3.1 project-root registration + URI canonicalisation ───────── + + /// Register an absolute project root for URI canonicalisation. + /// + /// Accepts either a bare filesystem path (`/repo`), a `file:///` URI + /// (`file:///repo`), or a `lip://local/` URI (`lip://local/repo`). + /// Trailing slashes are trimmed. Duplicates are no-ops. + /// + /// Returns `true` when a new root was inserted, `false` when the root + /// was already registered (or the input normalised to an empty path). + pub fn register_project_root(&mut self, raw: &str) -> bool { + let path = normalise_root(raw); + if path.is_empty() { + return false; + } + self.registered_roots.insert(path) + } + + /// All currently-registered project roots, sorted for deterministic + /// output. Primarily exposed for diagnostics and tests. + pub fn registered_roots(&self) -> Vec { + let mut out: Vec = self.registered_roots.iter().cloned().collect(); + out.sort(); + out + } + + /// Canonicalise a URI for lookup inside the query graph (v2.3.1). + /// + /// Only `lip://local/` (a *relative* URI — no leading slash after + /// the scheme) is rewritten: the daemon prepends a registered project + /// root so `` lands on the absolute form that the tier-1 importer + /// and the v2.3.1 SCIP importer both emit (`lip://local//abs/path`). + /// All other URIs — including `file:///abs/path`, `lip://local//abs/...`, + /// `scip://external`, etc. — are returned unchanged. + /// + /// Fragments (`#symbol`) are preserved across the rewrite. Multiple + /// matching roots are tried longest-first; the longest root also wins + /// when no file-match exists so write paths still produce a stable key + /// before the file is first upserted. 
+ /// + /// This method never mutates state — safe to call on the read path. + pub fn canonicalize_uri(&self, uri: &str) -> String { + let Some(body_and_frag) = uri.strip_prefix("lip://local/") else { + return uri.to_owned(); + }; + let (body, frag) = match body_and_frag.find('#') { + Some(i) => (&body_and_frag[..i], &body_and_frag[i..]), + None => (body_and_frag, ""), + }; + if body.starts_with('/') { + // Already absolute — canonical. + return uri.to_owned(); + } + + // Relative path — try each registered root, longest first. + // Root starts with `/`, so the extra slash in the format string + // produces the double-slash convention used by tier-1 extractors + // (`lip://local//abs/path`). + let mut roots: Vec<&String> = self.registered_roots.iter().collect(); + roots.sort_by(|a, b| b.len().cmp(&a.len())); + for root in &roots { + let candidate = format!("lip://local/{}/{}", root, body); + if self.file_inputs.contains_key(&candidate) { + return if frag.is_empty() { + candidate + } else { + format!("{}{}", candidate, frag) + }; + } + } + + // No file-match: fall back to the longest root anyway, so write + // paths still produce a stable canonical key even before the file + // is first upserted. + if let Some(longest) = roots.first() { + let candidate = format!("lip://local/{}/{}", longest, body); + return if frag.is_empty() { + candidate + } else { + format!("{}{}", candidate, frag) + }; } + uri.to_owned() } /// Record (or refresh) provenance for a Tier 3 ingestion batch. @@ -245,10 +374,11 @@ impl LipDatabase { /// A change in hash means the public interface changed — safe as a /// downstream recompilation / re-verification trigger (Kotlin IC model). 
pub fn abi_hash(&mut self, uri: &str) -> Option { - if !self.file_inputs.contains_key(uri) { + let uri = self.canonicalize_uri(uri); + if !self.file_inputs.contains_key(&uri) { return None; } - let syms = self.file_symbols(uri); + let syms = self.file_symbols(&uri); let mut surface: Vec = syms .iter() .filter(|s| s.is_exported) @@ -385,6 +515,7 @@ impl LipDatabase { /// Register or update a file. Bumps the global revision and invalidates /// cached derived data for `uri`. pub fn upsert_file(&mut self, uri: String, text: String, language: String) { + let uri = self.canonicalize_uri(&uri); self.revision += 1; let rev = self.revision; let content_hash = sha256_hex(text.as_bytes()); @@ -467,6 +598,12 @@ impl LipDatabase { } pairs.push((edge.from_uri.clone(), edge.to_uri.clone())); } + let src = if pairs.is_empty() { + EdgesSource::Empty + } else { + EdgesSource::Tier1 + }; + self.file_edges_source.insert(uri.clone(), src); self.file_call_edges.insert(uri.clone(), pairs); } @@ -518,13 +655,14 @@ impl LipDatabase { occurrences: Vec, edges: Vec, ) { + let uri = self.canonicalize_uri(&uri); self.revision += 1; let rev = self.revision; self.file_inputs.insert( uri.clone(), FileInput { text: String::new(), - language, + language: language.clone(), revision: rev, precomputed: true, content_hash, @@ -614,6 +752,51 @@ impl LipDatabase { } pairs.push((edge.from_uri.clone(), edge.to_uri.clone())); } + + // v2.3.1 Feature #5 — SCIP-imported files often have no call edges + // (scip-clang omits `SymbolRole::Call`; scip-go is inconsistent). + // When the disk source is reachable, re-run the Tier-1 tree-sitter + // edge extractor to populate the forward + reverse call graph so + // `QueryBlastRadiusSymbol` returns non-empty `direct_items`. 
+ let edges_src = if !pairs.is_empty() { + EdgesSource::ScipOnly + } else if let Some(path) = crate::daemon::watcher::uri_to_path(&uri) { + match std::fs::read_to_string(&path) { + Ok(text) => { + let lang = Language::detect(&uri, &language); + let tier1_edges = Tier1Indexer::new().edges_for_source(&uri, &text, lang); + let mut filled = false; + for edge in tier1_edges.iter().filter(|e| e.kind == EdgeKind::Calls) { + self.callee_to_callers + .entry(edge.to_uri.clone()) + .or_default() + .push(edge.from_uri.clone()); + self.caller_to_callees + .entry(edge.from_uri.clone()) + .or_default() + .push(edge.to_uri.clone()); + let callee_name = extract_name(&edge.to_uri).to_owned(); + if !callee_name.is_empty() { + self.callee_name_to_callers + .entry(callee_name) + .or_default() + .push(edge.from_uri.clone()); + } + pairs.push((edge.from_uri.clone(), edge.to_uri.clone())); + filled = true; + } + if filled { + EdgesSource::ScipWithTier1Edges + } else { + EdgesSource::Empty + } + } + Err(_) => EdgesSource::Empty, + } + } else { + EdgesSource::Empty + }; + self.file_edges_source.insert(uri.clone(), edges_src); self.file_call_edges.insert(uri.clone(), pairs); let now_ms = std::time::SystemTime::now() @@ -625,6 +808,8 @@ impl LipDatabase { } pub fn remove_file(&mut self, uri: &str) { + let uri = self.canonicalize_uri(uri); + let uri = uri.as_str(); self.revision += 1; self.file_inputs.remove(uri); self.sym_cache.remove(uri); @@ -693,10 +878,11 @@ impl LipDatabase { /// Returns the source text stored for `uri`, or `None` if not indexed. 
pub fn file_source_text(&self, uri: &str) -> Option { - let fi = self.file_inputs.get(uri)?; + let canon = self.canonicalize_uri(uri); + let fi = self.file_inputs.get(&canon)?; if fi.precomputed && fi.text.is_empty() { - if let Some(path) = uri.strip_prefix("file://") { - if let Ok(text) = std::fs::read_to_string(path) { + if let Some(path) = crate::daemon::watcher::uri_to_path(&canon) { + if let Ok(text) = std::fs::read_to_string(&path) { return Some(text); } } @@ -724,6 +910,8 @@ impl LipDatabase { if upgrades.is_empty() { return; } + let canon = self.canonicalize_uri(uri); + let uri = canon.as_str(); let Some(cached) = self.sym_cache.get(uri) else { return; }; @@ -764,11 +952,13 @@ impl LipDatabase { // ── Raw accessors ───────────────────────────────────────────────────── pub fn file_text(&self, uri: &str) -> Option<&str> { - self.file_inputs.get(uri).map(|f| f.text.as_str()) + let canon = self.canonicalize_uri(uri); + self.file_inputs.get(&canon).map(|f| f.text.as_str()) } pub fn file_language(&self, uri: &str) -> Option<&str> { - self.file_inputs.get(uri).map(|f| f.language.as_str()) + let canon = self.canonicalize_uri(uri); + self.file_inputs.get(&canon).map(|f| f.language.as_str()) } pub fn tracked_uris(&self) -> Vec { @@ -776,11 +966,15 @@ impl LipDatabase { } pub fn is_precomputed(&self, uri: &str) -> bool { - self.file_inputs.get(uri).is_some_and(|f| f.precomputed) + let canon = self.canonicalize_uri(uri); + self.file_inputs.get(&canon).is_some_and(|f| f.precomputed) } pub fn file_content_hash(&self, uri: &str) -> Option<&str> { - self.file_inputs.get(uri).map(|f| f.content_hash.as_str()) + let canon = self.canonicalize_uri(uri); + self.file_inputs + .get(&canon) + .map(|f| f.content_hash.as_str()) } /// Read-only access to cached symbols (for journal compaction). @@ -851,6 +1045,8 @@ impl LipDatabase { /// Tier 1 symbols for a file, lazily computed and cached. 
pub fn file_symbols(&mut self, uri: &str) -> Arc> { + let canon = self.canonicalize_uri(uri); + let uri = canon.as_str(); let file_rev = match self.file_inputs.get(uri) { Some(f) => f.revision, None => return Arc::new(vec![]), @@ -877,6 +1073,8 @@ impl LipDatabase { /// Tier 1 occurrences for a file, lazily computed and cached. pub fn file_occurrences(&mut self, uri: &str) -> Arc> { + let canon = self.canonicalize_uri(uri); + let uri = canon.as_str(); let file_rev = match self.file_inputs.get(uri) { Some(f) => f.revision, None => return Arc::new(vec![]), @@ -899,6 +1097,8 @@ impl LipDatabase { /// If `content_hash` is identical to the last-cached value, downstream /// callers can skip their own recomputation (see spec §3.1 "early cutoff"). pub fn file_api_surface(&mut self, uri: &str) -> Arc { + let canon = self.canonicalize_uri(uri); + let uri = canon.as_str(); let file_rev = match self.file_inputs.get(uri) { Some(f) => f.revision, None => { @@ -956,6 +1156,8 @@ impl LipDatabase { /// Files that directly reference any exported symbol from `uri`. 
pub fn reverse_deps(&mut self, uri: &str) -> Vec { + let canon = self.canonicalize_uri(uri); + let uri = canon.as_str(); let uris: Vec = self.file_inputs.keys().cloned().collect(); let target = self.file_api_surface(uri); let target_uris: Vec = target.symbols.iter().map(|s| s.uri.clone()).collect(); @@ -975,6 +1177,9 @@ impl LipDatabase { const DEPTH_LIMIT: u32 = 4; const NODE_LIMIT: usize = 200; + let canon_symbol = self.canonicalize_uri(symbol_uri); + let symbol_uri = canon_symbol.as_str(); + let all_uris: Vec = self.file_inputs.keys().cloned().collect(); let def_uri = all_uris .iter() @@ -1229,11 +1434,12 @@ impl LipDatabase { }; for file_uri in changed_file_uris { - if !self.file_inputs.contains_key(file_uri.as_str()) { + let canon_file = self.canonicalize_uri(file_uri); + if !self.file_inputs.contains_key(canon_file.as_str()) { not_indexed_uris.push(file_uri.clone()); continue; } - let syms = self.file_symbols(file_uri); + let syms = self.file_symbols(&canon_file); for sym in syms.iter() { if !interesting(sym.kind) { continue; @@ -1277,12 +1483,13 @@ impl LipDatabase { source, }); } - } else if let Some(file_embedding) = self.file_embeddings.get(file_uri).cloned() + } else if let Some(file_embedding) = + self.file_embeddings.get(canon_file.as_str()).cloned() { let neighbours = self.nearest_by_vector( &file_embedding, 20, - Some(file_uri), + Some(&canon_file), None, Some(threshold), ); @@ -1302,10 +1509,12 @@ impl LipDatabase { } } + let edges_source = self.file_edges_source.get(&canon_file).copied(); results.push(EnrichedBlastRadius { - file_uri: file_uri.clone(), + file_uri: canon_file.clone(), static_result, semantic_items, + edges_source, }); } } @@ -1323,6 +1532,8 @@ impl LipDatabase { symbol_uri: &str, min_score: Option, ) -> Option { + let canon_symbol = self.canonicalize_uri(symbol_uri); + let symbol_uri = canon_symbol.as_str(); let file_uri = self.def_index.get(symbol_uri).map(|(f, _)| f.clone())?; if !self.file_inputs.contains_key(file_uri.as_str()) { 
return None; @@ -1383,10 +1594,12 @@ impl LipDatabase { } } + let edges_source = self.file_edges_source.get(&file_uri).copied(); Some(EnrichedBlastRadius { file_uri, static_result, semantic_items, + edges_source, }) } @@ -1403,6 +1616,9 @@ impl LipDatabase { const NODE_LIMIT: usize = 200; let depth = depth.clamp(1, 8); + let canon = self.canonicalize_uri(symbol_uri); + let symbol_uri = canon.as_str(); + let mut edges: Vec<(String, String)> = Vec::new(); let mut seen_edges: HashSet<(String, String)> = HashSet::new(); let mut visited: HashSet = HashSet::new(); @@ -1444,7 +1660,8 @@ impl LipDatabase { /// /// Returns `None` if no occurrence covers the given position. pub fn symbol_at_position(&mut self, uri: &str, line: i32, col: i32) -> Option { - let occs = self.file_occurrences(uri); + let canon = self.canonicalize_uri(uri); + let occs = self.file_occurrences(&canon); occs.iter() .find(|occ| range_contains(&occ.range, line, col)) .map(|occ| occ.symbol_uri.clone()) @@ -1454,7 +1671,8 @@ impl LipDatabase { /// /// O(1) via the definition reverse index maintained in `upsert_file`. pub fn symbol_definition_location(&self, symbol_uri: &str) -> Option<(String, OwnedRange)> { - self.def_index.get(symbol_uri).cloned() + let canon = self.canonicalize_uri(symbol_uri); + self.def_index.get(canon.as_str()).cloned() } /// Files that reference any of the given display-name strings (Kotlin IC model). @@ -1494,32 +1712,38 @@ impl LipDatabase { /// Store a pre-computed embedding vector for a file, recording which model produced it. pub fn set_file_embedding(&mut self, uri: &str, vector: Vec, model: &str) { - self.file_embeddings.insert(uri.to_owned(), vector); + let uri = self.canonicalize_uri(uri); + self.file_embeddings.insert(uri.clone(), vector); self.file_embedding_models - .insert(uri.to_owned(), model.to_owned()); + .insert(uri, model.to_owned()); } /// Retrieve the stored embedding vector for a file, if any. 
pub fn get_file_embedding(&self, uri: &str) -> Option<&Vec> { - self.file_embeddings.get(uri) + let canon = self.canonicalize_uri(uri); + self.file_embeddings.get(canon.as_str()) } /// Retrieve the model that produced the stored embedding for a file, if any. pub fn file_embedding_model(&self, uri: &str) -> Option<&str> { - self.file_embedding_models.get(uri).map(String::as_str) + let canon = self.canonicalize_uri(uri); + self.file_embedding_models + .get(canon.as_str()) + .map(String::as_str) } /// Store a pre-computed embedding vector for a symbol URI (`lip://` scheme), /// recording which model produced it. pub fn set_symbol_embedding(&mut self, uri: &str, vector: Vec, model: &str) { - self.symbol_embeddings.insert(uri.to_owned(), vector); - self.symbol_embedding_models - .insert(uri.to_owned(), model.to_owned()); + let uri = self.canonicalize_uri(uri); + self.symbol_embeddings.insert(uri.clone(), vector); + self.symbol_embedding_models.insert(uri, model.to_owned()); } /// Retrieve the stored embedding vector for a symbol URI, if any. pub fn get_symbol_embedding(&self, uri: &str) -> Option<&Vec> { - self.symbol_embeddings.get(uri) + let canon = self.canonicalize_uri(uri); + self.symbol_embeddings.get(canon.as_str()) } /// Return the distinct model names present across all stored file embeddings. @@ -1761,6 +1985,8 @@ impl LipDatabase { /// /// Returns `(indexed, has_embedding, age_seconds)`. pub fn file_status(&self, uri: &str) -> (bool, bool, Option) { + let canon = self.canonicalize_uri(uri); + let uri = canon.as_str(); let indexed = self.file_inputs.contains_key(uri); let has_embedding = self.file_embeddings.contains_key(uri); let age_seconds = self.file_indexed_at.get(uri).and_then(|&ts_ms| { @@ -2116,6 +2342,8 @@ impl LipDatabase { /// Find `OwnedSymbolInfo` for a given symbol URI across all tracked files and mounted slices. 
pub fn symbol_by_uri(&mut self, symbol_uri: &str) -> Option { + let canon = self.canonicalize_uri(symbol_uri); + let symbol_uri = canon.as_str(); // Fast path: check mounted slice symbols first (O(1)). if let Some(sym) = self.mounted_symbols.get(symbol_uri) { return Some(sym.clone()); diff --git a/bindings/rust/src/query_graph/types.rs b/bindings/rust/src/query_graph/types.rs index 4b9b18b..3389727 100644 --- a/bindings/rust/src/query_graph/types.rs +++ b/bindings/rust/src/query_graph/types.rs @@ -75,6 +75,28 @@ pub enum ImpactSource { Both, } +/// Provenance for the call edges backing a blast-radius result (v2.3.1). +/// +/// Reported to clients so they can decide how much to trust the static +/// graph: Tier-1 tree-sitter edges are reliable, SCIP-only edges depend +/// on the emitter's accuracy (scip-clang omits calls, scip-go is +/// inconsistent), and `Empty` means no edges were available at all. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum EdgesSource { + /// Edges produced by the Tier-1 tree-sitter pass on the file's source. + Tier1, + /// SCIP import provided symbols/occurrences but no edges, so the + /// daemon re-ran the Tier-1 tree-sitter pass against the file on disk + /// to fill the static call graph (v2.3.1 Feature #5). + ScipWithTier1Edges, + /// SCIP import provided edges via `SymbolRole::Call`, used as-is. + ScipOnly, + /// No call edges are available for this symbol/file. Clients should + /// treat the static blast-radius as best-effort only. + Empty, +} + /// A single entry in a batch blast-radius result. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EnrichedBlastRadius { @@ -86,6 +108,11 @@ pub struct EnrichedBlastRadius { /// Semantically coupled files/symbols not in the static call graph. /// Empty when `include_semantic` was false or embeddings are unavailable. 
pub semantic_items: Vec, + /// Provenance for the call edges used to compute `static_result` + /// (v2.3.1). `None` when older daemons serialise this field without + /// any signal — deserialisers default to `None`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub edges_source: Option, } /// A single forward call edge returned by `QueryOutgoingCalls` (v2.3). @@ -1309,6 +1336,24 @@ pub enum ClientMessage { QueryAbiHash { uri: String, }, + + // ── v2.3.1 — URI canonicalisation ───────────────────────────────────── + /// Register a project root so the daemon can resolve relative + /// `lip://local/` URIs (from clients like CKB) against absolute + /// `lip://local/` keys (how SCIP imports are stored). + /// + /// Idempotent: re-registering the same root is a no-op. Callers may + /// issue this unconditionally at startup regardless of whether a prior + /// `lip import` already registered the same root. + /// + /// With multiple registered roots, URI resolution matches the + /// longest-prefix first. Acknowledged with `DeltaAck`. + RegisterProjectRoot { + /// Absolute filesystem path or `file:///…` / `lip://local/…` URI + /// of the project root. Normalised to an absolute path by the + /// daemon before insertion. + root: String, + }, } impl ClientMessage { @@ -1377,6 +1422,7 @@ impl ClientMessage { "reindex_stale", "batch_file_status", "query_abi_hash", + "register_project_root", ] .iter() .map(|s| (*s).to_owned()) @@ -1453,6 +1499,7 @@ impl ClientMessage { ClientMessage::ReindexStale { .. } => "reindex_stale", ClientMessage::BatchFileStatus { .. } => "batch_file_status", ClientMessage::QueryAbiHash { .. } => "query_abi_hash", + ClientMessage::RegisterProjectRoot { .. 
} => "register_project_root", } } @@ -1678,6 +1725,27 @@ mod tests { assert_eq!(uri, "file:///src/lib.rs"); } + // ── v2.3.1 round-trip tests ─────────────────────────────────────── + #[test] + fn register_project_root_round_trips() { + let msg = ClientMessage::RegisterProjectRoot { + root: "file:///repo".into(), + }; + let rt = round_trip_client(&msg); + let ClientMessage::RegisterProjectRoot { root } = rt else { + panic!("wrong variant"); + }; + assert_eq!(root, "file:///repo"); + } + + #[test] + fn register_project_root_not_batchable() { + assert!(ClientMessage::RegisterProjectRoot { + root: String::new() + } + .is_batchable()); + } + #[test] fn query_abi_hash_is_batchable() { assert!(ClientMessage::QueryAbiHash { uri: String::new() }.is_batchable()); @@ -2008,6 +2076,9 @@ mod tests { }, ClientMessage::BatchFileStatus { uris: vec![] }, ClientMessage::QueryAbiHash { uri: String::new() }, + ClientMessage::RegisterProjectRoot { + root: String::new(), + }, ]; let supported = ClientMessage::supported_messages(); diff --git a/bindings/rust/tests/integration.rs b/bindings/rust/tests/integration.rs index 7a40d20..8bfc24a 100644 --- a/bindings/rust/tests/integration.rs +++ b/bindings/rust/tests/integration.rs @@ -1872,3 +1872,309 @@ pub async fn handle() {} task.abort(); let _ = task.await; } + +// ─── v2.3.1: URI canonicalization + edges_source provenance ────────────────── + +/// Register a project root, upsert a file under its absolute-form URI, then +/// query it back via the relative-form URI. The daemon must resolve the +/// relative URI against the registered root and return `indexed = true`. +/// +/// This is the core of the CKB "printed success but nothing landed" fix: +/// import writes `lip://local//abs/path`, CKB queries `lip://local/rel/path`, +/// both must land on the same stored record. 
+#[tokio::test] +async fn daemon_register_project_root_canonicalizes_relative_uri() { + let dir = tempfile::tempdir().expect("tempdir"); + let socket = dir.path().join("lip_canon.sock"); + let daemon = LipDaemon::new(&socket); + let task = tokio::spawn(async move { daemon.run().await.ok() }); + tokio::time::sleep(Duration::from_millis(20)).await; + + let mut client = UnixStream::connect(&socket).await.expect("connect"); + + let root = "/workspaces/demo"; + send( + &mut client, + &ClientMessage::RegisterProjectRoot { + root: root.to_owned(), + }, + ) + .await + .expect("send register"); + match recv(&mut client).await.expect("recv register ack") { + ServerMessage::DeltaAck { accepted: true, .. } => {} + other => panic!("expected accepting DeltaAck, got {other:?}"), + } + + let abs_uri = "lip://local//workspaces/demo/src/lib.rs"; + send( + &mut client, + &ClientMessage::Delta { + seq: 1, + action: Action::Upsert, + document: make_doc(abs_uri, "pub fn hello() {}"), + }, + ) + .await + .expect("send upsert"); + let _ = recv(&mut client).await.expect("recv upsert ack"); + + let rel_uri = "lip://local/src/lib.rs"; + send( + &mut client, + &ClientMessage::QueryFileStatus { + uri: rel_uri.to_owned(), + }, + ) + .await + .expect("send file status"); + match recv(&mut client).await.expect("recv file status") { + ServerMessage::FileStatusResult { indexed, .. } => assert!( + indexed, + "relative URI {rel_uri} must canonicalize to the absolute-form record" + ), + other => panic!("expected FileStatusResult, got {other:?}"), + } + + task.abort(); + let _ = task.await; +} + +/// Sending `RegisterProjectRoot` twice for the same root is idempotent. +/// Handshake advertises the capability. 
+#[tokio::test] +async fn daemon_register_project_root_is_idempotent() { + let dir = tempfile::tempdir().expect("tempdir"); + let socket = dir.path().join("lip_reg_idem.sock"); + let daemon = LipDaemon::new(&socket); + let task = tokio::spawn(async move { daemon.run().await.ok() }); + tokio::time::sleep(Duration::from_millis(20)).await; + + let mut client = UnixStream::connect(&socket).await.expect("connect"); + + send( + &mut client, + &ClientMessage::Handshake { + client_version: Some("test".into()), + }, + ) + .await + .unwrap(); + match recv(&mut client).await.unwrap() { + ServerMessage::HandshakeResult { + supported_messages, .. + } => assert!( + supported_messages.contains(&"register_project_root".to_string()), + "register_project_root must be advertised in handshake capabilities" + ), + other => panic!("expected HandshakeResult, got {other:?}"), + } + + for _ in 0..2 { + send( + &mut client, + &ClientMessage::RegisterProjectRoot { + root: "/tmp/demo".to_owned(), + }, + ) + .await + .unwrap(); + match recv(&mut client).await.unwrap() { + ServerMessage::DeltaAck { accepted: true, .. } => {} + other => panic!("expected DeltaAck(true), got {other:?}"), + } + } + + task.abort(); + let _ = task.await; +} + +/// Tier-1 upsert (real source_text) must advertise `edges_source = Tier1` in +/// both `BlastRadiusSymbolResult` and `BlastRadiusBatchResult`. +#[tokio::test] +async fn daemon_blast_radius_edges_source_tier1() { + let dir = tempfile::tempdir().expect("tempdir"); + let socket = dir.path().join("lip_es_tier1.sock"); + let daemon = LipDaemon::new(&socket); + let task = tokio::spawn(async move { daemon.run().await.ok() }); + tokio::time::sleep(Duration::from_millis(20)).await; + + let mut client = UnixStream::connect(&socket).await.expect("connect"); + + let a_uri = "lip://local/es@0.1/a.rs"; + // `target` must call something so the tier-1 extractor records ≥1 edge; + // zero edges would produce `EdgesSource::Empty`. 
+ let a_src = "pub fn helper() {}\npub fn target() { helper(); }"; + send( + &mut client, + &ClientMessage::Delta { + seq: 1, + action: Action::Upsert, + document: make_doc(a_uri, a_src), + }, + ) + .await + .unwrap(); + let _ = recv(&mut client).await.unwrap(); + + send( + &mut client, + &ClientMessage::QueryWorkspaceSymbols { + query: "target".into(), + limit: Some(5), + kind_filter: None, + scope: None, + modifier_filter: None, + }, + ) + .await + .unwrap(); + let target_uri = match recv(&mut client).await.unwrap() { + ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols + .into_iter() + .find(|s| s.display_name == "target") + .expect("target not in workspace") + .uri, + other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), + }; + + use lip_core::query_graph::types::EdgesSource; + + send( + &mut client, + &ClientMessage::QueryBlastRadiusSymbol { + symbol_uri: target_uri, + min_score: None, + }, + ) + .await + .unwrap(); + match recv(&mut client).await.unwrap() { + ServerMessage::BlastRadiusSymbolResult { result } => { + let e = result.expect("expected Some result"); + assert_eq!( + e.edges_source, + Some(EdgesSource::Tier1), + "Tier-1 upsert must set edges_source = Tier1" + ); + } + other => panic!("expected BlastRadiusSymbolResult, got {other:?}"), + } + + send( + &mut client, + &ClientMessage::QueryBlastRadiusBatch { + changed_file_uris: vec![a_uri.to_owned()], + min_score: None, + }, + ) + .await + .unwrap(); + match recv(&mut client).await.unwrap() { + ServerMessage::BlastRadiusBatchResult { results, .. 
} => { + assert!(!results.is_empty(), "batch should yield ≥1 entry"); + assert!( + results + .iter() + .all(|e| e.edges_source == Some(EdgesSource::Tier1)), + "every batch entry must carry edges_source = Tier1; got {:?}", + results.iter().map(|e| e.edges_source).collect::>(), + ); + } + other => panic!("expected BlastRadiusBatchResult, got {other:?}"), + } + + task.abort(); + let _ = task.await; +} + +/// Pre-computed (SCIP-style) delta with no edges and `source_text = None`: +/// the daemon must read the file from disk and back-fill Tier-1 call edges, +/// tagging the result `ScipWithTier1Edges`. +#[tokio::test] +async fn daemon_precomputed_tier1_edge_fill_on_disk() { + let dir = tempfile::tempdir().expect("tempdir"); + let socket = dir.path().join("lip_es_fill.sock"); + let daemon = LipDaemon::new(&socket); + let task = tokio::spawn(async move { daemon.run().await.ok() }); + tokio::time::sleep(Duration::from_millis(20)).await; + + let mut client = UnixStream::connect(&socket).await.expect("connect"); + + let src_path = dir.path().join("chain.rs"); + std::fs::write(&src_path, "fn a() { b(); }\nfn b() {}\n").unwrap(); + let abs = src_path.to_string_lossy(); + let file_uri = format!("lip://local//{}", abs.trim_start_matches('/')); + let sym_a = format!("{file_uri}#a"); + + let doc = OwnedDocument { + uri: file_uri.clone(), + content_hash: "feedbeef".to_owned(), + language: "rust".to_owned(), + symbols: vec![OwnedSymbolInfo { + uri: sym_a.clone(), + display_name: "a".to_owned(), + kind: SymbolKind::Function, + is_exported: true, + confidence_score: 90, + extraction_tier: ExtractionTier::Tier3Scip, + modifiers_source: Some(ModifiersSource::PrefixParse), + ..Default::default() + }], + occurrences: vec![OwnedOccurrence { + symbol_uri: sym_a.clone(), + range: OwnedRange { + start_line: 0, + start_char: 3, + end_line: 0, + end_char: 4, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }], + 
merkle_path: file_uri.clone(), + edges: vec![], + source_text: None, + }; + + send( + &mut client, + &ClientMessage::Delta { + seq: 1, + action: Action::Upsert, + document: doc, + }, + ) + .await + .unwrap(); + let _ = recv(&mut client).await.unwrap(); + + use lip_core::query_graph::types::EdgesSource; + + send( + &mut client, + &ClientMessage::QueryBlastRadiusSymbol { + symbol_uri: sym_a, + min_score: None, + }, + ) + .await + .unwrap(); + match recv(&mut client).await.unwrap() { + ServerMessage::BlastRadiusSymbolResult { result } => { + let e = result.expect("expected Some result"); + assert_eq!( + e.edges_source, + Some(EdgesSource::ScipWithTier1Edges), + "precomputed + on-disk source should back-fill tier-1 edges" + ); + } + other => panic!("expected BlastRadiusSymbolResult, got {other:?}"), + } + + task.abort(); + let _ = task.await; +} diff --git a/docs/LIP_SPEC.mdx b/docs/LIP_SPEC.mdx index 639564f..e4873b3 100644 --- a/docs/LIP_SPEC.mdx +++ b/docs/LIP_SPEC.mdx @@ -1,6 +1,6 @@ --- title: "LIP — Linked Incremental Protocol" -subtitle: "Design Document & Specification v2.2" +subtitle: "Design Document & Specification v2.3" status: "Stable" license: "MIT" authors: ["Lisa Welsch"] @@ -564,6 +564,34 @@ to SCIP's escaping rules: lip://npm/lodash@4.17.21/lodash#`_.chunk` ``` +### 5.4 Local URIs and project roots (v2.3.1) + +For the `local` scope, the daemon stores each file under a single *canonical* URI. +Two forms are accepted on the wire; both map to the same record: + +``` +absolute lip://local//Users/lisa/repo/src/foo.rs +relative lip://local/src/foo.rs +``` + +The absolute form carries a doubled slash after `lip://local/` because the path +component itself begins with `/`. This is the form the tier-1 indexer emits and +the form the v2.3.1 SCIP importer produces (see §10.2). 
The relative form is +resolved against whichever *project roots* the client has registered: + +``` +Client → Daemon: RegisterProjectRoot { root: "/Users/lisa/repo" } +Daemon → Client: DeltaAck { accepted: true } +``` + +`RegisterProjectRoot` is idempotent — re-registering the same root is a no-op. +Clients may register multiple roots; the daemon resolves relative URIs against +the longest matching root first. The capability is advertised as +`register_project_root` in `HandshakeResult.supported_messages`; pre-v2.3.1 +daemons reply with `UnknownMessage` and clients must fall back to sending +absolute URIs. CKB relies on this contract to send relative `lip://local/<relative-path>` URIs and have +the daemon re-join them with the absolute records written by `lip import`. + --- ## 6. Protocol Lifecycle @@ -773,8 +801,17 @@ lip.query.blast_radius_batch(changed_file_uris, min_score?) → { risk_level: "low" | "medium" | "high", truncated: bool, semantic_items: [SemanticItem], // only when min_score is set + edges_source: EdgesSource, // v2.3.1, see below }] } + +enum EdgesSource { + "tier1", // edges came from the tree-sitter extractor + "scip_with_tier1_edges", // SCIP import that had no edges; daemon back-filled + // by running tier-1 over the file on disk + "scip_only", // SCIP import that already carried call edges + "empty", // no edges in either source +} ``` Accepts changed **files** and resolves exported symbols server-side — one round-trip @@ -801,6 +838,51 @@ empty) will carry the matched symbol, with no wire format change required. stays structural-only. Semantic callers are additive — they inform the human reviewer ("8 callers (+3 semantically coupled)") but do not inflate risk scores. +#### 8.1.2 Symbol-level blast radius + +``` +lip.query.blast_radius_symbol(symbol_uri, min_score?) → { + result: null | EnrichedBlastRadius +} +``` + +Single-symbol variant of `blast_radius_batch`. 
Returns the same `EnrichedBlastRadius` +shape — including the v2.3.1 `edges_source` field — or `null` when the symbol is +unknown or not indexed, so callers can distinguish "zero impact" from "no data." +Used by CKB's per-symbol change-impact summaries and by refactor planners that ask +about one candidate at a time. + +**`edges_source` semantics (v2.3.1).** SCIP indexers disagree on whether call +edges belong in the index: `scip-go` emits them inconsistently, `scip-clang` +omits them entirely. The daemon therefore advertises the provenance of each +result's structural edges so consumers can fall back when needed: + +- `tier1` — structural edges came from the tree-sitter extractor over the + stored source text. Authoritative for syntactic calls. +- `scip_only` — the importer provided call edges and the daemon used them + verbatim. Trust but verify: scip-go is known to drop edges. +- `scip_with_tier1_edges` — the importer's `edges` were empty, so the daemon + read the file from disk and ran tier-1 to back-fill. Structural edges are + present even though the upstream producer omitted them. +- `empty` — neither source produced call edges. CKB should fall back to its + own SCIP backend for that file. + +#### 8.1.3 Outgoing calls (forward call graph) + +``` +lip.query.outgoing_calls(symbol_uri, depth) → { + edges: [{ from_uri, to_uri }], + truncated: bool, +} +``` + +Inverse of the blast-radius direction. BFS over the forward call graph from +`symbol_uri` up to `depth` hops (clamped to `[1, 8]`). Implemented by a +`caller_to_callees` index that mirrors the reverse edge map used for blast radius. +Traversal is capped at 200 nodes; the `truncated` flag reports when the cap fires. +Useful for "what does this function reach?" queries — tracing a public API's +downstream behaviour, auditing service calls, generating call-graph visualisations. + ### 8.2 Taint tracking Symbols can be annotated with `taint_labels` (e.g. `["PII", "UNSAFE_IO"]`). 
LIP @@ -880,6 +962,70 @@ Taint tracking (§8.2) is implemented as a forward reachability query over blast-radius analysis (§8.1) and taint analysis — they differ only in which edge kinds are traversed. +### 8.6 Ranked workspace symbols + +``` +lip.query.workspace_symbols( + query: string, + limit?: int, + kind_filter?: [SymbolKind], + scope?: string, // def-file URI prefix + modifier_filter?: [string], +) → { + symbols: [SymbolInfo], // length n + ranked: [{ symbol_uri, score, match_type }], // length n, parallel (v2.3) +} +``` + +`kind_filter`, `scope`, and `modifier_filter` are optional server-side narrowings +that replace the client-side post-filter patterns CKB was running. `scope` is a +def-file URI prefix (e.g. `lip://local/cli`); `modifier_filter` matches against the +symbol's `modifiers` vector (e.g. `["async"]`, `["pub", "async"]`). + +`ranked[i]` describes `symbols[i]`. Scoring is tiered, not BM25: + +- **`exact`** (score `1.0`) — case-sensitive equality with `display_name`. +- **`prefix`** (score `0.8`) — case-insensitive prefix of `display_name`. +- **`fuzzy`** (score `0.5`) — case-insensitive substring of `display_name`. + +`match_type` is a discriminator for display only; callers should sort by `score`. +Symbols are returned sorted by descending score, then ascending `display_name` for +stability. When `query` is empty, `ranked` is empty (matches pre-v2.3 semantics). + +### 8.7 Structural metadata on symbols + +Every `SymbolInfo` carries the fields that previously required a parallel SCIP +parse: + +- **`signature_normalized`** — parameter-name-stripped, whitespace-collapsed form + of the signature, stable across rename refactors. +- **`modifiers: [string]`** — language-specific modifier keywords (`pub`, `async`, + `static`, `override`, `final`, …). +- **`visibility` + `visibility_confidence`** — `Public` / `Internal` / `Private` / + `Protected`, plus 0–100 confidence. 
Conventions like Python's underscore prefix + yield lower confidence than an explicit `pub` keyword. +- **`container_name`** — the enclosing class / struct / trait / namespace, or + `None` for top-level symbols. +- **`extraction_tier`** — `Tier1` (tree-sitter), `Tier2Lsp` (language server), + `Tier3Scip` (SCIP import), or `Tier15Inferred` (Datalog inference, §3.3). +- **`modifiers_source`** — `None` when produced by a tree-sitter extractor that + emits modifier tokens directly; `PrefixParse` when derived by the SCIP importer + from `signature_documentation[0]`. Clients that need compiler-grade modifier + provenance should branch on this. + +### 8.8 Reference classification + +Every `Occurrence` carries a `kind: ReferenceKind` discriminator: + +`Unknown` · `Call` · `Read` · `Write` · `Type` · `Implements` · `Extends` + +Plus an `is_test: bool` flag stamped by a file-path heuristic +(`/tests/`, `_test.rs`, `_test.py`, `.spec.ts`, …). Tier-1 extractors classify +via tree-sitter parent-node + field-name lookup; SCIP import maps +`SymbolRole::ReadAccess | WriteAccess | Test` bits to the corresponding fields +(Call has no SCIP equivalent). Both fields are `#[serde(default)]`; pre-v2.3 +occurrences decode with `Unknown` / `false`. + --- ## 9. AI & Agent Integration @@ -1207,6 +1353,15 @@ lip-protocol/ - [x] **`ErrorCode`** enum — stable machine-readable error categories: `unknown_message_type`, `unknown_model`, `embedding_not_configured`, `no_embedding`, `cursor_out_of_range`, `index_locked`, `invalid_request`, `internal` (default). - [x] **`ClientMessage::variant_tag` + `supported_messages_covers_all_variants` test** — compile-time exhaustive match plus paired test that fails when a new variant is added without being advertised in `supported_messages()`. 
+### v2.3 — CKB structural-parity bundle ✓ + +- [x] **Rich symbol metadata** — `OwnedSymbolInfo` gains `signature_normalized`, `modifiers`, `visibility` + `visibility_confidence`, `container_name`, `extraction_tier`, and `modifiers_source`. Populated by all Tier-1 extractors (Rust / TypeScript / Python / Swift / Kotlin) and by the SCIP importer (via upstream-compatible `enclosing_symbol = 8` + prefix-parsed modifiers). See §8.7. +- [x] **Reference classification** — `OwnedOccurrence` gains `kind: ReferenceKind` (`Unknown` / `Call` / `Read` / `Write` / `Type` / `Implements` / `Extends`) + `is_test: bool`. Tier-1 classifier uses tree-sitter parent/field lookup; SCIP import/export round-trips via `SymbolRole::ReadAccess | WriteAccess | Test`. See §8.8. +- [x] **`QueryBlastRadiusSymbol { symbol_uri, min_score? } → BlastRadiusSymbolResult { result: Option<EnrichedBlastRadius> }`** — single-symbol wrapper around `blast_radius_for_symbol`. Returns `None` for unknown or unindexed symbols so callers can distinguish "zero impact" from "no data." Spec §8.1.2. +- [x] **`QueryOutgoingCalls { symbol_uri, depth } → OutgoingCallsResult { edges, truncated }`** — forward call-graph BFS via a new `caller_to_callees` index. Depth clamped `[1, 8]`; NODE_LIMIT = 200. Spec §8.1.3. +- [x] **Ranked & filtered workspace symbols** — `QueryWorkspaceSymbols` adds `kind_filter`, `scope`, `modifier_filter`; `WorkspaceSymbolsResult` adds `ranked: Vec` (parallel to `symbols`) with tiered scoring (`Exact = 1.0` / `Prefix = 0.8` / `Fuzzy = 0.5`). `ranked` is `skip_if_empty`; empty query preserves pre-v2.3 semantics. Spec §8.6. +- [x] **All additive.** `protocol_version` stays at `2`; every new field is `#[serde(default, skip_serializing_if = …)]`; every new message is advertised via `HandshakeResult.supported_messages`. Drift-guard test covers both new variants. 
+ ### v2.2 — Function-level blast radius + intelligence layer ✓ - [x] **`NearestItem.embedding_model`** — every nearest-neighbour hit now carries the model name that produced its stored embedding. Optional / `skip_serializing_if = None`; older clients see no change. Populated by `nearest_by_vector`, `nearest_symbol_by_vector`, and `outliers`. @@ -1329,5 +1484,5 @@ via the `flatbuffers` crate, aligning with LIP's reference implementation langua --- -*LIP Specification v2.2.0 · April 2026 · MIT License* +*LIP Specification v2.3.0 · April 2026 · MIT License* *Lisa Welsch* diff --git a/tools/lip-cli/src/cmd/import.rs b/tools/lip-cli/src/cmd/import.rs index 0ba3ff6..3670617 100644 --- a/tools/lip-cli/src/cmd/import.rs +++ b/tools/lip-cli/src/cmd/import.rs @@ -26,6 +26,13 @@ mod scip { /// With `--push-to-daemon`, each document delta is streamed directly to a running /// LIP daemon — enabling nightly CI to push compiler-accurate symbols into the /// live graph without a daemon restart. +/// +/// As of v2.3.1 imports use canonical `lip://local/<project-root>/<relative-path>` URIs when the +/// SCIP `Metadata.project_root` is present, or `lip://local/<relative-path>` when it is +/// absent (the daemon resolves these against roots supplied via +/// `RegisterProjectRoot`). This replaces the pre-2.3.1 `file:///<relative-path>` form, +/// which silently mismatched CKB queries. Use `--verify` to round-trip a +/// sample of the imported files after pushing. #[derive(Args)] pub struct ImportArgs { /// Path to the `.scip` file to import. @@ -57,6 +64,17 @@ pub struct ImportArgs { /// EventStream-JSON output path. #[arg(long)] pub no_provenance: bool, + + /// After pushing deltas, round-trip a sample of the imported files + /// against the daemon to catch URI/canonicalization regressions. + /// + /// Samples up to 10 files, issues `QueryFileStatus` (expects + /// `indexed=true`) and — when the file carries an exported + /// definition — `QueryWorkspaceSymbols` scoped to that file. 
Exits + /// non-zero on any mismatch so CI catches silent drops. Only + /// valid with `--push-to-daemon`. + #[arg(long, requires = "push_to_daemon")] + pub verify: bool, } pub async fn run(args: ImportArgs) -> anyhow::Result<()> { @@ -89,10 +107,20 @@ pub async fn run(args: ImportArgs) -> anyhow::Result<()> { }; let confidence = args.confidence; + // SCIP metadata.project_root is a file:// URL identifying the source tree + // the producer indexed. We use it to promote per-document URIs from the + // old `file:///<relative-path>` form to canonical `lip://local/<project-root>/<relative-path>` + // (spec §5.4). When absent, the daemon resolves relative URIs against any + // RegisterProjectRoot roots supplied by the client. + let project_root_abs = index + .metadata + .as_ref() + .map(|m| strip_file_scheme(&m.project_root)) + .unwrap_or_default(); let mut deltas: Vec = index .documents .into_iter() - .map(|d| convert_document(d, confidence)) + .map(|d| convert_document(d, confidence, &project_root_abs)) .collect(); // Also import external symbols as a synthetic document. @@ -123,6 +151,15 @@ pub async fn run(args: ImportArgs) -> anyhow::Result<()> { } // ── CI batch push: stream deltas directly to a running daemon ────────────── + // Snapshot up to 10 (uri, probe) pairs before the push loop consumes the + // Vec. We round-trip these against the daemon after pushing so CI catches + // the "printed success but nothing landed" class of bug. 
+ let verify_samples: Vec = if args.verify { + collect_verify_samples(&deltas, 10) + } else { + Vec::new() + }; + if let Some(socket_path) = args.push_to_daemon { let mut stream = UnixStream::connect(&socket_path).await.map_err(|e| { anyhow::anyhow!("cannot connect to daemon at {}: {e}", socket_path.display()) @@ -196,6 +233,10 @@ pub async fn run(args: ImportArgs) -> anyhow::Result<()> { "pushed {total} deltas to daemon at {}", socket_path.display() ); + + if args.verify { + run_verification(&mut stream, &verify_samples).await?; + } return Ok(()); } @@ -255,10 +296,170 @@ fn build_tier3_source(index: &scip::Index, scip_path: &std::path::Path) -> Tier3 } } +// ─── --verify plumbing ──────────────────────────────────────────────────────── + +/// A single file we will round-trip against the daemon after a push. +struct VerifySample { + file_uri: String, + /// Display name + kind of an exported definition inside the file, when + /// one is available. Used to drive a `QueryWorkspaceSymbols` probe. + probe: Option<(String, SymbolKind)>, +} + +/// Take at most `max` real document URIs (skipping the synthetic +/// `scip://external` bundle) and pair each with the first suitable exported +/// definition. Simple head-sample rather than random — deterministic and +/// debuggable; CI catches drop-everything regressions before any sampling +/// strategy matters. 
+fn collect_verify_samples(deltas: &[OwnedDelta], max: usize) -> Vec { + deltas + .iter() + .filter_map(|d| d.document.as_ref()) + .filter(|doc| !doc.uri.starts_with("scip://")) + .take(max) + .map(|doc| { + let probe = doc + .symbols + .iter() + .find(|s| { + s.is_exported + && matches!( + s.kind, + SymbolKind::Function | SymbolKind::Class | SymbolKind::Interface + ) + }) + .map(|s| (s.display_name.clone(), s.kind)); + VerifySample { + file_uri: doc.uri.clone(), + probe, + } + }) + .collect() +} + +async fn run_verification( + stream: &mut UnixStream, + samples: &[VerifySample], +) -> anyhow::Result<()> { + if samples.is_empty() { + eprintln!("verify: no documents to sample"); + return Ok(()); + } + + let mut failures: usize = 0; + for sample in samples { + let fs = round_trip( + stream, + &ClientMessage::QueryFileStatus { + uri: sample.file_uri.clone(), + }, + ) + .await?; + match fs { + ServerMessage::FileStatusResult { indexed: true, .. } => {} + ServerMessage::FileStatusResult { indexed: false, .. } => { + eprintln!("verify: not indexed: {}", sample.file_uri); + failures += 1; + continue; + } + other => { + eprintln!( + "verify: unexpected reply to QueryFileStatus({}): {other:?}", + sample.file_uri + ); + failures += 1; + continue; + } + } + + let Some((name, kind)) = &sample.probe else { + continue; + }; + let ws = round_trip( + stream, + &ClientMessage::QueryWorkspaceSymbols { + query: name.clone(), + limit: Some(10), + kind_filter: Some(vec![*kind]), + scope: Some(sample.file_uri.clone()), + modifier_filter: None, + }, + ) + .await?; + match ws { + ServerMessage::WorkspaceSymbolsResult { symbols, .. } if !symbols.is_empty() => {} + ServerMessage::WorkspaceSymbolsResult { .. 
} => { + eprintln!( + "verify: probe '{name}' missing in {}", + sample.file_uri + ); + failures += 1; + } + other => { + eprintln!( + "verify: unexpected reply to QueryWorkspaceSymbols({name}): {other:?}" + ); + failures += 1; + } + } + } + + if failures > 0 { + anyhow::bail!( + "verify: {failures}/{} sample(s) failed round-trip", + samples.len() + ); + } + eprintln!("verify: {} sample(s) OK", samples.len()); + Ok(()) +} + +async fn round_trip( + stream: &mut UnixStream, + msg: &ClientMessage, +) -> anyhow::Result { + let body = serde_json::to_vec(msg)?; + stream.write_all(&(body.len() as u32).to_be_bytes()).await?; + stream.write_all(&body).await?; + let mut len_buf = [0u8; 4]; + stream.read_exact(&mut len_buf).await?; + let n = u32::from_be_bytes(len_buf) as usize; + let mut buf = vec![0u8; n]; + stream.read_exact(&mut buf).await?; + Ok(serde_json::from_slice(&buf)?) +} + // ─── Conversion helpers ─────────────────────────────────────────────────────── -fn convert_document(doc: scip::Document, confidence: u8) -> OwnedDelta { - let uri = format!("file:///{}", doc.relative_path.trim_start_matches('/')); +/// Strip the `file://` scheme from a SCIP-style URL, returning the absolute +/// filesystem path (including leading `/`, without trailing `/`), or the input minus any +/// trailing `/` when no scheme is present. An empty input returns an empty string. +fn strip_file_scheme(url: &str) -> String { + if let Some(rest) = url.strip_prefix("file://") { + rest.trim_end_matches('/').to_owned() + } else { + url.trim_end_matches('/').to_owned() + } +} + +/// Build a canonical LIP document URI from a SCIP project root + relative path. +/// +/// When `project_root_abs` is non-empty, emits the absolute form +/// `lip://local/<root>/<rel>` — and because `<root>` already starts with `/`, +/// the result has the canonical double-slash that tier-1 produces +/// (`lip://local//abs/path`). 
When it is empty, emits the relative form +/// `lip://local/<rel>`, which the daemon resolves via registered roots. 
+fn build_document_uri(project_root_abs: &str, relative_path: &str) -> String { + let rel = relative_path.trim_start_matches('/'); + if project_root_abs.is_empty() { + format!("lip://local/{rel}") + } else { + format!("lip://local/{project_root_abs}/{rel}") + } +} + +fn convert_document(doc: scip::Document, confidence: u8, project_root_abs: &str) -> OwnedDelta { + let uri = build_document_uri(project_root_abs, &doc.relative_path); let content_hash = sha256_hex(doc.relative_path.as_bytes()); let lang = scip_language_to_lip(&doc.language); @@ -1020,4 +1221,35 @@ mod tests { assert_eq!(o.kind, ReferenceKind::Unknown); assert!(!o.is_test); } + + // ── v2.3.1 URI construction ────────────────────────────────────────────── + + #[test] + fn strip_file_scheme_trims_scheme_and_trailing_slash() { + assert_eq!(strip_file_scheme("file:///Users/lisa/repo"), "/Users/lisa/repo"); + assert_eq!(strip_file_scheme("file:///Users/lisa/repo/"), "/Users/lisa/repo"); + // Non-file URLs are returned verbatim (minus trailing slash). + assert_eq!(strip_file_scheme("/already/absolute"), "/already/absolute"); + assert_eq!(strip_file_scheme(""), ""); + } + + #[test] + fn build_document_uri_absolute_preserves_double_slash() { + // Canonical tier-1 form: `lip://local//abs/path` (two slashes). + let uri = build_document_uri("/Users/lisa/repo", "src/foo.rs"); + assert_eq!(uri, "lip://local//Users/lisa/repo/src/foo.rs"); + } + + #[test] + fn build_document_uri_empty_root_emits_relative_form() { + // Without a project root, the daemon resolves against registered roots. 
+ let uri = build_document_uri("", "src/foo.rs"); + assert_eq!(uri, "lip://local/src/foo.rs"); + } + + #[test] + fn build_document_uri_strips_leading_slash_from_relative_path() { + let uri = build_document_uri("/repo", "/src/foo.rs"); + assert_eq!(uri, "lip://local//repo/src/foo.rs"); + } } From 228bcc0f7e1d363bed9ed539de266bbfd4630990 Mon Sep 17 00:00:00 2001 From: Lisa Date: Tue, 21 Apr 2026 18:38:15 +0200 Subject: [PATCH 11/18] fix: self-echo deadlock on bulk precomputed imports (v2.3.1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every session subscribes to the daemon's push-notification broadcast and writes pending notifications back to its client after each response. Every Delta{Upsert} also emitted an IndexChanged onto that same broadcast — so the session wrote TWO frames per delta (DeltaAck + self-emitted IndexChanged) while `lip import` read ONE frame per iteration. Frame production ran one frame ahead of consumption; after ~65 deltas the 8 KB macOS AF_UNIX send buffer filled, write_message parked mid-frame, read_message never ran, both processes idle at 0% CPU. Fix: tag every broadcast message with the emitting session's id (Notification { source_session: Option, message: ServerMessage }) and have the drain loop skip envelopes whose source_session matches its own. Tier 2 upgrades emit with source_session=None so they still reach every session. LipDaemon holds an AtomicU64 and assigns a fresh id per accept. Regression test daemon_bulk_precomputed_import_does_not_deadlock pushes 200 precomputed deltas through a single session and fails fast if any IndexChanged echo reaches the client. Verified: test fails at delta 1 without the filter, passes in 60ms with it. Latent since v2.2.0 when IndexChanged-on-every-upsert landed; surfaced only now because the v2.3.1 URI fix let CKB imports run long enough to hit the 8 KB buffer wall (836-doc SCIP bundle froze at ~130). 
Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 2 + bindings/rust/src/daemon/mod.rs | 2 +- bindings/rust/src/daemon/server.rs | 13 ++- bindings/rust/src/daemon/session.rs | 61 +++++++++++--- bindings/rust/src/daemon/tier2_manager.rs | 19 +++-- bindings/rust/tests/integration.rs | 98 +++++++++++++++++++++++ 6 files changed, 174 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 18a5c16..51e57ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,8 @@ All notable changes to this project are documented here. - **CKB "printed success but nothing landed"** — the combined effect of the import URI change + `RegisterProjectRoot` + daemon-side canonicalisation means `lip import` records are now discoverable by a CKB client that queries `lip://local/`, which was the class of bug reported after v2.3.0. +- **Self-echo deadlock on bulk precomputed imports.** Each session subscribes to the daemon's push-notification broadcast and, in the drain loop that runs after every response, writes every pending notification back to its client. Every `Delta { Upsert }` also emitted an `IndexChanged` onto that same broadcast — so the session received an echo of its own emission and wrote it as a *second* frame on top of the `DeltaAck`. The import loop read one frame per iteration, so frame production ran one frame ahead of consumption; after ~65 deltas the 8 KB macOS `AF_UNIX` send buffer filled, `write_message` parked mid-frame, `read_message` never ran, and every worker sat idle with both processes at 0 % CPU. Fixed by tagging every broadcast message with the emitting session's id (`Notification { source_session: Option, message: ServerMessage }`) and having the drain loop skip envelopes whose `source_session` matches its own. Tier 2 upgrades emit with `source_session: None` so they still reach every session. 
Regression test `daemon_bulk_precomputed_import_does_not_deadlock` pushes 200 precomputed deltas through a single session and fails fast if any `IndexChanged` echo reaches the client. + --- ## [2.3.0] — 2026-04-21 diff --git a/bindings/rust/src/daemon/mod.rs b/bindings/rust/src/daemon/mod.rs index 8492921..ceec325 100644 --- a/bindings/rust/src/daemon/mod.rs +++ b/bindings/rust/src/daemon/mod.rs @@ -44,5 +44,5 @@ pub mod watcher; pub use journal::{Journal, JournalEntry}; pub use manifest::{ManifestRequest, ManifestResponse}; pub use server::LipDaemon; -pub use session::{read_message, write_client_message, write_message, Session}; +pub use session::{read_message, write_client_message, write_message, Notification, Session}; pub use tier2_manager::VerificationJob; diff --git a/bindings/rust/src/daemon/server.rs b/bindings/rust/src/daemon/server.rs index f2b8e98..a186a1a 100644 --- a/bindings/rust/src/daemon/server.rs +++ b/bindings/rust/src/daemon/server.rs @@ -1,4 +1,5 @@ use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex as StdMutex}; #[cfg(unix)] @@ -8,11 +9,11 @@ use tokio::net::UnixListener; use tokio::sync::{broadcast, mpsc, Mutex}; use tracing::{error, info, warn}; -use crate::query_graph::{LipDatabase, ServerMessage}; +use crate::query_graph::LipDatabase; use super::embedding::EmbeddingClient; use super::journal::{self, Journal, COMPACT_THRESHOLD as COMPACT_THR}; -use super::session::Session; +use super::session::{Notification, Session}; use super::tier2_manager::{Tier2Manager, VerificationJob, CHANNEL_CAPACITY}; use super::watcher::{self, FileWatcherHandle}; @@ -28,11 +29,14 @@ pub struct LipDaemon { /// Whether to spawn the per-file filesystem watcher on startup. watch_files: bool, /// Broadcast sender for push notifications to all active sessions. - notify_tx: broadcast::Sender, + notify_tx: broadcast::Sender, /// Shared embedding client. `None` when `LIP_EMBEDDING_URL` is not set. 
embedding_client: Arc>, /// When `true`, spawn a watchdog that exits the process when the parent dies. managed: bool, + /// Monotonic session id counter. Assigned to each accepted connection so + /// the session can filter broadcast echoes of its own emissions. + next_session_id: Arc, } impl LipDaemon { @@ -48,6 +52,7 @@ impl LipDaemon { notify_tx, embedding_client: Arc::new(EmbeddingClient::from_env()), managed: false, + next_session_id: Arc::new(AtomicU64::new(1)), } } @@ -169,7 +174,9 @@ impl LipDaemon { loop { let (stream, _) = listener.accept().await?; + let session_id = self.next_session_id.fetch_add(1, Ordering::Relaxed); let session = Arc::new(Session::new( + session_id, self.db.clone(), Some(self.tier2_tx.clone()), Some(Arc::clone(&shared_journal)), diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index 34f44db..170d00e 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -19,6 +19,20 @@ use super::watcher::{uri_to_path, FileWatcherHandle}; /// Clients can detect drift by comparing against this value in `HandshakeResult`. const PROTOCOL_VERSION: u32 = 2; +/// Broadcast envelope that tags every push notification with the session +/// that produced it. A session skips messages whose `source_session` +/// matches its own id so an importer never receives echoes of the +/// `IndexChanged` it just emitted — that self-echo doubled the daemon's +/// outbound frame rate and filled the 8 KB macOS AF_UNIX send buffer +/// around delta ~65, freezing bulk SCIP imports. System-originated +/// messages (Tier 2 upgrades) use `source_session: None` so every session +/// still receives them. +#[derive(Clone, Debug)] +pub struct Notification { + pub source_session: Option, + pub message: ServerMessage, +} + /// Convert a classified [`EmbedError`] into the appropriate wire-level /// error response. 
Centralises the mapping so every embedding call site /// reports the same [`ErrorCode`] category for the same failure mode. @@ -37,6 +51,11 @@ fn embed_error_response(e: EmbedError) -> ServerMessage { /// Per-connection session state. pub struct Session { + /// Unique id for this connection, assigned by the daemon at accept time. + /// Used to filter broadcast echoes: when a session emits a notification + /// tagged with its own id, the drain loop skips it rather than writing + /// it back to the client. + pub session_id: u64, pub db: Arc>, /// Channel to the background Tier 2 manager. `None` when Tier 2 is disabled. pub tier2_tx: Option>, @@ -46,21 +65,23 @@ pub struct Session { pub watcher: Option, /// Broadcast sender for push notifications (e.g. `SymbolUpgraded`). /// Kept so we can subscribe receivers for newly forked sessions. - pub notify_tx: Option>, + pub notify_tx: Option>, /// HTTP embedding client. `None` when `LIP_EMBEDDING_URL` is not configured. pub embedding_client: Arc>, } impl Session { pub fn new( + session_id: u64, db: Arc>, tier2_tx: Option>, journal: Option>>, watcher: Option, - notify_tx: Option>, + notify_tx: Option>, embedding_client: Arc>, ) -> Self { Self { + session_id, db, tier2_tx, journal, @@ -82,9 +103,9 @@ impl Session { /// Drive the session loop for a single connected client. pub async fn run(self: Arc, mut stream: UnixStream) -> anyhow::Result<()> { - info!("new client session"); + info!("new client session id={}", self.session_id); // Subscribe to push notifications for this session's lifetime. - let mut notify_rx: Option> = + let mut notify_rx: Option> = self.notify_tx.as_ref().map(|tx| tx.subscribe()); loop { @@ -157,11 +178,21 @@ impl Session { } // Drain any pending push notifications before blocking on the next read. 
+ // Skip envelopes whose `source_session` matches our own id — those + // are echoes of notifications this same session just emitted, and + // writing them back to the client doubles the daemon's outbound + // frame rate. Filling the 8 KB macOS AF_UNIX send buffer with that + // excess hung bulk SCIP imports around delta ~65 before this fix. if let Some(ref mut rx) = notify_rx { loop { match rx.try_recv() { Ok(notification) => { - if let Err(e) = write_message(&mut stream, &notification).await { + if notification.source_session == Some(self.session_id) { + continue; + } + if let Err(e) = + write_message(&mut stream, &notification.message).await + { error!("write error (notification): {e}"); break; } @@ -326,10 +357,15 @@ impl Session { if matches!(action, Action::Upsert) { if let Some(tx) = &self.notify_tx { let indexed_files = self.db.lock().await.file_count(); + // Tag with our session id so the drain loop in THIS session + // skips the echo — only other sessions forward it to their clients. // SendError only occurs when there are zero active receivers — benign.
- let _ = tx.send(ServerMessage::IndexChanged { - indexed_files, - affected_uris: vec![uri.clone()], + let _ = tx.send(Notification { + source_session: Some(self.session_id), + message: ServerMessage::IndexChanged { + indexed_files, + affected_uris: vec![uri.clone()], + }, }); } } @@ -1450,9 +1486,12 @@ impl Session { } if let Some(tx) = &self.notify_tx { let indexed_files = db.file_count(); - let _ = tx.send(ServerMessage::IndexChanged { - indexed_files, - affected_uris: removed.clone(), + let _ = tx.send(Notification { + source_session: Some(self.session_id), + message: ServerMessage::IndexChanged { + indexed_files, + affected_uris: removed.clone(), + }, }); } } diff --git a/bindings/rust/src/daemon/tier2_manager.rs b/bindings/rust/src/daemon/tier2_manager.rs index 03744dc..4b7d7aa 100644 --- a/bindings/rust/src/daemon/tier2_manager.rs +++ b/bindings/rust/src/daemon/tier2_manager.rs @@ -26,6 +26,8 @@ use crate::indexer::tier2::swift_ls::SwiftBackend; use crate::indexer::tier2::ts_server::TypeScriptBackend; use crate::query_graph::{LipDatabase, ServerMessage}; +use super::session::Notification; + pub const CHANNEL_CAPACITY: usize = 64; // ─── Job ────────────────────────────────────────────────────────────────────── @@ -160,14 +162,14 @@ pub struct Tier2Manager { rx: mpsc::Receiver, backends: Tier2Backends, /// Broadcast sender for push notifications. `None` when notifications are disabled. - notify_tx: Option>, + notify_tx: Option>, } impl Tier2Manager { pub fn new( db: Arc>, rx: mpsc::Receiver, - notify_tx: broadcast::Sender, + notify_tx: broadcast::Sender, ) -> Self { Self { db, @@ -745,8 +747,12 @@ impl Tier2Manager { old_confidence, new_confidence: up.confidence_score, }; + // System-originated: no source_session, so every session forwards it. // `send` fails only when there are no receivers; that's fine. 
- let _ = tx.send(msg); + let _ = tx.send(Notification { + source_session: None, + message: msg, + }); } } } @@ -992,8 +998,9 @@ mod tests { mgr.broadcast_upgrades(file_uri, &upgrades, &mut db); } - let msg = notify_rx.try_recv().expect("should receive a broadcast"); - match msg { + let envelope = notify_rx.try_recv().expect("should receive a broadcast"); + assert_eq!(envelope.source_session, None, "Tier 2 upgrades are system-originated"); + match envelope.message { ServerMessage::SymbolUpgraded { uri, old_confidence, @@ -1055,7 +1062,7 @@ mod tests { /// short-circuit without reading from the db (the receiver_count check). #[tokio::test] async fn broadcast_upgrades_noop_without_receivers() { - let (notify_tx, _) = broadcast::channel::(16); + let (notify_tx, _) = broadcast::channel::(16); let db = Arc::new(Mutex::new(LipDatabase::new())); // Drop the only receiver so receiver_count == 0. diff --git a/bindings/rust/tests/integration.rs b/bindings/rust/tests/integration.rs index 8bfc24a..34131ed 100644 --- a/bindings/rust/tests/integration.rs +++ b/bindings/rust/tests/integration.rs @@ -2178,3 +2178,101 @@ async fn daemon_precomputed_tier1_edge_fill_on_disk() { task.abort(); let _ = task.await; } + +// ─── Regression: bulk precomputed import does not deadlock on self-echo ────── +// +// Before the v2.3.1 hang fix the session emitted `IndexChanged` back to its +// own subscriber for every upsert, writing two frames per delta while the +// import loop read only one. After ~65 iterations the 8 KB macOS AF_UNIX +// send buffer filled and the daemon blocked mid-write, freezing the push. +// +// This test pushes 200 precomputed deltas through a single session — more +// than the buffer-fill threshold — and asserts the loop completes within a +// generous timeout. Without the `Notification.source_session` filter the +// assertion fires; with it every delta round-trips in microseconds. 
+#[tokio::test] +async fn daemon_bulk_precomputed_import_does_not_deadlock() { + let dir = tempfile::tempdir().expect("tempdir"); + let socket_path = dir.path().join("bulk.sock"); + + let daemon = LipDaemon::new(&socket_path).without_file_watcher(); + let task = tokio::spawn(async move { + let _ = daemon.run().await; + }); + + tokio::time::sleep(Duration::from_millis(20)).await; + let mut client = UnixStream::connect(&socket_path).await.unwrap(); + + const N: u64 = 200; + let push = async { + for seq in 0..N { + let uri = format!("lip://local//tmp/bulk/file_{seq:04}.rs"); + // A precomputed delta: no source_text, one occurrence, one symbol. + // Triggers the IndexChanged broadcast on every upsert — the exact + // path that used to deadlock. + let sym_uri = format!("{uri}#sym_{seq}"); + let doc = OwnedDocument { + uri: uri.clone(), + content_hash: format!("hash_{seq}"), + language: "rust".to_owned(), + occurrences: vec![OwnedOccurrence { + symbol_uri: sym_uri.clone(), + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 3, + }, + confidence_score: 95, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }], + symbols: vec![OwnedSymbolInfo { + uri: sym_uri, + display_name: format!("sym_{seq}"), + kind: SymbolKind::Function, + confidence_score: 95, + ..Default::default() + }], + merkle_path: uri.clone(), + edges: vec![], + source_text: None, + }; + send( + &mut client, + &ClientMessage::Delta { + seq, + action: Action::Upsert, + document: doc, + }, + ) + .await + .unwrap(); + // Drain exactly one frame per iteration — same pattern as lip import. + // `recv_raw` so IndexChanged notifications (if any leak through) are + // surfaced as failures rather than silently skipped. + match recv_raw(&mut client).await.unwrap() { + ServerMessage::DeltaAck { accepted, .. } => { + assert!(accepted, "delta {seq} was not accepted"); + } + ServerMessage::IndexChanged { .. 
} => { + panic!( + "session {seq}: received IndexChanged echo of own emission \ + — Notification.source_session filter is not engaged" + ); + } + other => panic!("unexpected response to delta {seq}: {other:?}"), + } + } + }; + + // 30 s gives plenty of headroom on CI; a deadlocked run never completes. + tokio::time::timeout(Duration::from_secs(30), push) + .await + .expect("bulk push must not hang (self-echo deadlock regression)"); + + task.abort(); + let _ = task.await; +} From b611bbc52ee3e6334ec641f395b4f0ea3deb8d2b Mon Sep 17 00:00:00 2001 From: Lisa Date: Fri, 24 Apr 2026 10:15:56 +0200 Subject: [PATCH 12/18] =?UTF-8?q?fix:=20v2.3.2=20=E2=80=94=20CKB=20testdri?= =?UTF-8?q?ve=20follow-up=20(edges=5Fsource=20+=20URI=20translation=20+=20?= =?UTF-8?q?path-traversal=20guard)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five correctness fixes discovered after v2.3.1 shipped and CKB began consuming EnrichedBlastRadius end-to-end. Wire-compatible via #[serde(flatten)] on BlastRadiusResult; protocol_version stays at 2. Changed: - edges_source moved from EnrichedBlastRadius onto BlastRadiusResult so non-enriched QueryBlastRadius carries call-edge provenance too; JSON shape unchanged. Fixed: - Tier-1 back-fill URIs now translate to SCIP descriptor form, both same-file (via display_name map) and cross-file (via name_to_symbols index with single-match guard). Symbol URIs no longer blank on wire responses for CKB dedup. - Path-traversal guard in convert_document rejects SCIP documents whose relative_path escapes the project root under string-level normalization — stops Go build-cache artefacts leaking into the graph. - Double lip://local/ prefix in callee_to_callers keys: lip_uri now detects an existing lip://local/ prefix when the back-fill replays tree-sitter against a canonical-URI file. - SCIP-descriptor vs tier-1-identifier mismatch in callee_name_to_callers: new normalize_callee_name(fragment) strips trailing () / . 
/ : / # at all four insert sites plus the BFS lookup, so SCIP and tier-1 callees share keys. Added: - LIP_DEBUG_EDGES=1 diagnostic gating for upsert_file_precomputed, Phase-2 BFS, and the wire serializer. Wire log reports has_edges_source / body_bytes / 500-char head — truncation-free. Tests: +normalize_callee_name_strips_scip_descriptor_suffixes, +edges_source_survives_all_response_envelopes, +tier1_backfill_translates_caller_uri_to_scip_fragment, +tier1_backfill_resolves_cross_file_callee_via_name_index. 409 unit + 26 integration + 44 lip-cli all green. Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 24 + Cargo.lock | 6 +- Cargo.toml | 2 +- bindings/rust/src/daemon/session.rs | 34 ++ bindings/rust/src/indexer/symbol_extractor.rs | 10 +- bindings/rust/src/query_graph/db.rs | 523 +++++++++++++++++- bindings/rust/src/query_graph/types.rs | 90 ++- bindings/rust/tests/integration.rs | 11 +- docs/LIP_SPEC.mdx | 6 +- tools/lip-cli/src/cmd/import.rs | 80 ++- website/src/pages/docs/spec.mdx | 2 +- 11 files changed, 752 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51e57ed..48431e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,30 @@ All notable changes to this project are documented here. ## [Unreleased] +## [2.3.2] — 2026-04-24 + +**CKB testdrive follow-up.** Five correctness fixes discovered after v2.3.1 shipped and CKB began consuming `EnrichedBlastRadius` end-to-end. `protocol_version` stays at `2`; the only schema change moves an existing field between structurally-nested records and is wire-compatible via `#[serde(flatten)]`. + +### Changed + +- **`edges_source` moved from `EnrichedBlastRadius` onto `BlastRadiusResult`** — so non-enriched `QueryBlastRadius` (not just `QueryBlastRadiusBatch` / `QueryBlastRadiusSymbol`) carries call-edge provenance. `EnrichedBlastRadius` still surfaces it through `#[serde(flatten)] static_result: BlastRadiusResult`, so the JSON wire shape is unchanged. 
Rust callers that field-access `enriched.edges_source` must switch to `enriched.static_result.edges_source`. Round-trip test `edges_source_survives_all_response_envelopes` asserts the field is emitted in every response variant. + +### Fixed + +- **Tier-1 back-fill URIs now translate to SCIP descriptor form — same-file and cross-file.** v2.3.1 Feature #5 re-ran the tree-sitter tier-1 extractor on disk when a SCIP import carried no call edges, but tier-1 emits fragment URIs in plain-identifier form (`#NewExporter`) while scip-go / scip-typescript emit descriptor form (`#NewExporter()`, `#Component.`). Every tier-1-emitted caller and callee missed `def_index.get(caller_sym)` in `blast_radius_for` Phase 3, so Phase 4 fell through to the file-level fallback and produced `ImpactItem`s with blank `symbol_uri` — breaking dedup on the CKB side. The back-fill now builds a same-file `display_name → SCIP-uri` map (plus URI fragment as secondary key) and, for cross-file callees whose definition lives in another SCIP document, falls back to the global `name_to_symbols` index (populated at SCIP-import time from each symbol's `display_name`, with symmetric cleanup on re-upsert and file removal). Both sides of every tier-1 edge are translated in-place when an unambiguous match exists. Regression tests `tier1_backfill_translates_caller_uri_to_scip_fragment` (same-file) and `tier1_backfill_resolves_cross_file_callee_via_name_index` (cross-file) seed SCIP-descriptor defs against real on-disk sources and assert every `ImpactItem` carries a non-empty, translated `symbol_uri`. + +- **Path-traversal guard on SCIP document ingestion.** scip-go ships documents whose `relative_path` points outside the project tree (e.g. `../../../../Library/Caches/go-build/…`). `build_document_uri` previously joined them literally, producing URIs like `lip://local//Users/lisa/Work/Projects/CKB/src/../../../../Library/Caches/go-build/…` that the daemon's path-based indexer happily ingested. 
`convert_document` now rejects any document whose net depth falls below the project root under pure string-level normalization (no filesystem access, so the check works on machines other than the one that produced the index). Emits `warning: skipped N SCIP document(s) whose relative_path escapes project_root` when any documents are dropped. + +- **Double `lip://local/` prefix in `callee_to_callers` keys.** `SymbolExtractor::lip_uri` stripped the `file://` scheme but not an existing `lip://local/` prefix. When `upsert_file_precomputed`'s tier-1 back-fill replayed the tree-sitter extractor against a file imported with its canonical key (`lip://local///`), every emitted edge URI was double-prefixed (`lip://local/lip://local///#name`), making URI-exact BFS lookups impossible in `blast_radius_for` Phase 2. Fixed by detecting the `lip://local/` prefix and appending `#` directly; the `file://` / bare-path branch is unchanged for tier-1 callers that pass raw paths. Confirmed via `LIP_DEBUG_EDGES=1` diagnostic trace from a CKB testdrive. + +- **SCIP-descriptor / tier-1-identifier name-fragment mismatch in `callee_name_to_callers`.** Tier-1 extractor indexes plain identifiers (`SearchSymbols`); SCIP descriptors carry suffix sigils (`SearchSymbols().`, `MyField.`, `Foo:`). Phase-2 BFS in `blast_radius_for` did `extract_name(callee)` without stripping the sigils, so cross-provider lookups always missed even when both providers had indexed the same function. Added `normalize_callee_name(fragment)` — truncates at the first `(`, then trims trailing non-identifier chars — and applied it at all four `callee_name_to_callers` insert sites plus the BFS lookup site, so SCIP and tier-1 callees now share keys. Unit test `normalize_callee_name_strips_scip_descriptor_suffixes` covers the six canonical SCIP descriptor shapes. 
+ +### Added + +- **`LIP_DEBUG_EDGES=1` diagnostic gating.** `upsert_file_precomputed`, `blast_radius_for` Phase-2 BFS, and `write_message` (wire output) emit focused `[lip-debug-edges]` traces to stderr when the env var is set. Zero-cost and silent when unset. The wire log now reports `has_edges_source` / `edges_source_count` / `body_bytes` + 500-char head instead of a truncated 2 KB tail, so edges_source presence on the wire can be confirmed without scrolling through multi-kilobyte bodies. + +--- + ## [2.3.1] — 2026-04-21 **CKB import landing fix.** Addresses the "`lip import --push-to-daemon` prints success but every file shows `indexed: false`" class of bug by making client- and daemon-side URI conventions converge, and by back-filling call edges when SCIP imports carry none. `protocol_version` stays at `2`; every change is either additive or limited to CLI behaviour. diff --git a/Cargo.lock b/Cargo.lock index 032c4cf..25fb35e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1074,7 +1074,7 @@ checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "lip-cli" -version = "2.3.1" +version = "2.3.2" dependencies = [ "anyhow", "clap", @@ -1093,7 +1093,7 @@ dependencies = [ [[package]] name = "lip-core" -version = "2.3.1" +version = "2.3.2" dependencies = [ "anyhow", "criterion", @@ -1130,7 +1130,7 @@ dependencies = [ [[package]] name = "lip-registry" -version = "2.3.1" +version = "2.3.2" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index e2055c8..1cb3599 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ ] [workspace.package] -version = "2.3.1" +version = "2.3.2" edition = "2021" rust-version = "1.78" authors = ["Lisa Welsch "] diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index 170d00e..6058e9b 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -2649,6 +2649,40 @@ pub async fn read_message(stream: &mut 
UnixStream) -> std::io::Result> { /// Serialize `msg` as JSON and write with a 4-byte big-endian length prefix. pub async fn write_message(stream: &mut UnixStream, msg: &ServerMessage) -> anyhow::Result<()> { let body = serde_json::to_vec(msg)?; + // v2.3.2 diagnostic — gated on LIP_DEBUG_EDGES=1. Replaces the old + // 2 KB tail preview (which truncated real payloads) with a + // field-presence signal: does the wire body mention `edges_source` + // at all, how many of each variant were serialised, and a 500-byte + // head for orientation. Truncation-free answer to "is the field + // reaching the wire?". + if std::env::var("LIP_DEBUG_EDGES") + .map(|v| v == "1") + .unwrap_or(false) + { + let marker = matches!( + msg, + ServerMessage::BlastRadiusResult(_) + | ServerMessage::BlastRadiusBatchResult { .. } + | ServerMessage::BlastRadiusSymbolResult { .. } + ); + if marker { + let body_str = std::str::from_utf8(&body).unwrap_or(""); + let has_field = body_str.contains("\"edges_source\""); + let field_count = body_str.matches("\"edges_source\"").count(); + let head = if body_str.len() > 500 { + &body_str[..500] + } else { + body_str + }; + eprintln!( + "[lip-debug-edges] wire response: has_edges_source={} edges_source_count={} body_bytes={} head={}", + has_field, + field_count, + body.len(), + head + ); + } + } stream.write_all(&(body.len() as u32).to_be_bytes()).await?; stream.write_all(&body).await?; Ok(()) diff --git a/bindings/rust/src/indexer/symbol_extractor.rs b/bindings/rust/src/indexer/symbol_extractor.rs index fd26b52..c3aea44 100644 --- a/bindings/rust/src/indexer/symbol_extractor.rs +++ b/bindings/rust/src/indexer/symbol_extractor.rs @@ -78,7 +78,15 @@ impl<'a> SymbolExtractor<'a> { } fn lip_uri(&self, name: &str) -> String { - // Strip the file:// scheme so we don't produce lip://local/file:///abs/path#Name. 
+ // `file_uri` may already be a canonical `lip://local/` URI — this + // happens in the v2.3.1 back-fill path where `upsert_file_precomputed` + // replays the tree-sitter extractor against a file that was imported + // with its canonical key. Appending without re-prefixing avoids the + // `lip://local/lip://local/...` double-prefix seen in `callee_to_callers`. + if self.file_uri.starts_with("lip://local/") { + return format!("{}#{}", self.file_uri, name); + } + // Strip `file://` so we don't produce `lip://local/file:///abs/path#Name`. let path = self .file_uri .strip_prefix("file://") diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index 0fd77e1..f20ac89 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -46,6 +46,23 @@ fn extract_name(uri: &str) -> &str { uri.rfind('#').map(|i| &uri[i + 1..]).unwrap_or("") } +/// Strip SCIP descriptor suffix characters so a fragment like +/// `SearchSymbols().` reduces to `SearchSymbols` — matching the plain +/// identifier form the Tier-1 extractor emits. SCIP descriptors end in +/// `()` for methods/functions, `.` for terms, `#` for types, `:` for +/// macros, or `[T]` for type parameters; tier-1 emits the bare name. +/// Indexing and lookup must go through this normaliser or the two +/// providers will store disjoint keys in `callee_name_to_callers`. +fn normalize_callee_name(fragment: &str) -> &str { + // Truncate at the first `(` — SCIP's `name().` form. + let head = match fragment.find('(') { + Some(i) => &fragment[..i], + None => fragment, + }; + // Strip trailing SCIP sigils / whitespace. + head.trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_') +} + /// Returns `true` if the annotation entry has expired (past its `expires_ms` timestamp). /// An `expires_ms` of 0 means the entry is permanent and never expires. 
fn is_expired(entry: &crate::schema::OwnedAnnotationEntry) -> bool { @@ -589,7 +606,9 @@ impl LipDatabase { .or_default() .push(edge.to_uri.clone()); // Name-based index: enables cross-file resolution in blast_radius_for. - let callee_name = extract_name(&edge.to_uri).to_owned(); + // Normalise to the plain-identifier form so SCIP-descriptor + // callees (`Foo().`) share a key with tier-1 (`Foo`). + let callee_name = normalize_callee_name(extract_name(&edge.to_uri)).to_owned(); if !callee_name.is_empty() { self.callee_name_to_callers .entry(callee_name) @@ -669,6 +688,21 @@ impl LipDatabase { }, ); + // Snapshot old display_names before clearing sym_cache so we can + // remove both the fragment-keyed and display_name-keyed entries from + // `name_to_symbols` (v2.3.2 Issue #1 adds display_name indexing). + let stale_display_names: Vec<(String, String)> = self + .sym_cache + .get(&uri) + .map(|c| { + c.value + .iter() + .filter(|s| !s.display_name.is_empty()) + .map(|s| (s.uri.clone(), s.display_name.clone())) + .collect() + }) + .unwrap_or_default(); + // Clear stale caches + def_index entries for this file. self.sym_cache.remove(&uri); self.occ_cache.remove(&uri); @@ -687,6 +721,14 @@ impl LipDatabase { } } } + for (sym_uri, display_name) in &stale_display_names { + if let Some(uris) = self.name_to_symbols.get_mut(display_name) { + uris.retain(|u| u != sym_uri); + if uris.is_empty() { + self.name_to_symbols.remove(display_name); + } + } + } self.def_index.retain(|_, (furi, _)| furi != &uri); // Build def_index + name_to_symbols from pre-computed occurrences. @@ -709,7 +751,30 @@ impl LipDatabase { // Seed sym_cache so file_symbols() returns the pre-computed symbols. 
let syms = Arc::new(symbols); - self.sym_cache.insert(uri.clone(), Cached::new(syms, rev)); + self.sym_cache.insert(uri.clone(), Cached::new(syms.clone(), rev)); + + // v2.3.2 Issue #1 — also index SCIP defs by their `display_name` + // (not just URI fragment) so tier-1 back-fill's cross-file callee + // translation can resolve plain-identifier names to SCIP URIs. The + // descriptor suffix scip-go emits (`NewExporter()` in the fragment) + // otherwise hides cross-file matches from the tier-1 extractor's + // plain-identifier view. + for sym in syms.iter() { + if sym.display_name.is_empty() { + continue; + } + let frag = extract_name(&sym.uri); + if sym.display_name == frag { + continue; // already indexed by upstream occurrence loop + } + let entry = self + .name_to_symbols + .entry(sym.display_name.clone()) + .or_default(); + if !entry.contains(&sym.uri) { + entry.push(sym.uri.clone()); + } + } // Consumed-names index (same as upsert_file). { @@ -743,7 +808,7 @@ impl LipDatabase { .entry(edge.from_uri.clone()) .or_default() .push(edge.to_uri.clone()); - let callee_name = extract_name(&edge.to_uri).to_owned(); + let callee_name = normalize_callee_name(extract_name(&edge.to_uri)).to_owned(); if !callee_name.is_empty() { self.callee_name_to_callers .entry(callee_name) @@ -758,6 +823,15 @@ impl LipDatabase { // When the disk source is reachable, re-run the Tier-1 tree-sitter // edge extractor to populate the forward + reverse call graph so // `QueryBlastRadiusSymbol` returns non-empty `direct_items`. + // + // v2.3.2 Issue #1 — tier-1 emits file-local URIs (`lip://local/…#Name`) + // that won't match the SCIP-style def_index keys (`lip://scip-go/…#Name`). + // Without translation, Phase 3 of `blast_radius_for` can't resolve the + // caller symbol and every ImpactItem degrades to blank `symbol_uri`. 
+ // We translate the caller side only: the current file's SCIP defs are + // already in `def_index`, so we can look them up by name and rewrite + // the tier-1 caller URI in-place. Callees stay tier-1 because the BFS + // walks `callee_name_to_callers` via name fragment regardless. let edges_src = if !pairs.is_empty() { EdgesSource::ScipOnly } else if let Some(path) = crate::daemon::watcher::uri_to_path(&uri) { @@ -765,24 +839,79 @@ impl LipDatabase { Ok(text) => { let lang = Language::detect(&uri, &language); let tier1_edges = Tier1Indexer::new().edges_for_source(&uri, &text, lang); + + // Build identifier → SCIP-uri map for defs in this file. + // Used to translate tier-1-emitted caller/callee URIs into + // SCIP keys so that def_index and name-based BFS lookups in + // blast_radius_for succeed. Keyed by both `display_name` + // and the URI fragment — SCIP descriptors (`NewExporter()` + // in scip-go, `Component.` in scip-typescript) differ from + // tier-1's plain-identifier fragment extraction. + let mut translate: HashMap = HashMap::new(); + let this_file_syms = self.file_symbols(&uri); + for sym in this_file_syms.iter() { + if !sym.display_name.is_empty() { + translate + .entry(sym.display_name.clone()) + .or_insert_with(|| sym.uri.clone()); + } + let frag = extract_name(&sym.uri); + if !frag.is_empty() { + translate + .entry(frag.to_owned()) + .or_insert_with(|| sym.uri.clone()); + } + } + + // Cross-file fallback: when the same-file `translate` map + // misses (callee/caller defined in another SCIP document), + // fall back to the global `name_to_symbols` index. Only + // accept unambiguous hits (single URI) so we don't alias + // unrelated homonyms across packages. v2.3.2 Issue #1. 
+ let name_to_symbols = &self.name_to_symbols; + let resolve = |name: &str, fallback: &str| -> String { + if let Some(u) = translate.get(name) { + return u.clone(); + } + if let Some(uris) = name_to_symbols.get(name) { + if uris.len() == 1 { + return uris[0].clone(); + } + } + fallback.to_owned() + }; + let mut filled = false; - for edge in tier1_edges.iter().filter(|e| e.kind == EdgeKind::Calls) { + let calls: Vec<(String, String)> = tier1_edges + .iter() + .filter(|e| e.kind == EdgeKind::Calls) + .map(|edge| { + let caller_name = extract_name(&edge.from_uri); + let callee_name_raw = extract_name(&edge.to_uri); + ( + resolve(caller_name, &edge.from_uri), + resolve(callee_name_raw, &edge.to_uri), + ) + }) + .collect(); + for (from_uri, to_uri) in calls { self.callee_to_callers - .entry(edge.to_uri.clone()) + .entry(to_uri.clone()) .or_default() - .push(edge.from_uri.clone()); + .push(from_uri.clone()); self.caller_to_callees - .entry(edge.from_uri.clone()) + .entry(from_uri.clone()) .or_default() - .push(edge.to_uri.clone()); - let callee_name = extract_name(&edge.to_uri).to_owned(); + .push(to_uri.clone()); + let callee_name = + normalize_callee_name(extract_name(&to_uri)).to_owned(); if !callee_name.is_empty() { self.callee_name_to_callers .entry(callee_name) .or_default() - .push(edge.from_uri.clone()); + .push(from_uri.clone()); } - pairs.push((edge.from_uri.clone(), edge.to_uri.clone())); + pairs.push((from_uri, to_uri)); filled = true; } if filled { @@ -796,6 +925,21 @@ impl LipDatabase { } else { EdgesSource::Empty }; + // v2.3.2 diagnostic — gated on LIP_DEBUG_EDGES=1. Confirms which + // back-fill branch fired per file and the exact key written into + // `file_edges_source` (for pairing against the lookup-side log in + // `blast_radius_for`). 
+ if std::env::var("LIP_DEBUG_EDGES") + .map(|v| v == "1") + .unwrap_or(false) + { + eprintln!( + "[lip-debug-edges] upsert_precomputed uri={} edges_src={:?} pairs={}", + uri, + edges_src, + pairs.len() + ); + } self.file_edges_source.insert(uri.clone(), edges_src); self.file_call_edges.insert(uri.clone(), pairs); @@ -812,6 +956,19 @@ impl LipDatabase { let uri = uri.as_str(); self.revision += 1; self.file_inputs.remove(uri); + // Snapshot display_names before removing sym_cache (v2.3.2 Issue #1 + // display_name indexing needs symmetric cleanup on file removal). + let stale_display_names: Vec<(String, String)> = self + .sym_cache + .get(uri) + .map(|c| { + c.value + .iter() + .filter(|s| !s.display_name.is_empty()) + .map(|s| (s.uri.clone(), s.display_name.clone())) + .collect() + }) + .unwrap_or_default(); self.sym_cache.remove(uri); self.occ_cache.remove(uri); self.api_cache.remove(uri); @@ -831,6 +988,14 @@ impl LipDatabase { } } } + for (sym_uri, display_name) in &stale_display_names { + if let Some(uris) = self.name_to_symbols.get_mut(display_name) { + uris.retain(|u| u != sym_uri); + if uris.is_empty() { + self.name_to_symbols.remove(display_name); + } + } + } self.def_index.retain(|_, (furi, _)| furi.as_str() != uri); self.remove_file_call_edges(uri); self.file_consumed_names.remove(uri); @@ -851,7 +1016,7 @@ impl LipDatabase { self.caller_to_callees.remove(&from); } } - let callee_name = extract_name(&to); + let callee_name = normalize_callee_name(extract_name(&to)); if let Some(callers) = self.callee_name_to_callers.get_mut(callee_name) { callers.retain(|c| *c != from); if callers.is_empty() { @@ -1244,6 +1409,14 @@ impl LipDatabase { // // caller_sym → minimum distance from symbol_uri let mut cpg_distance: HashMap = HashMap::new(); + let debug_edges = std::env::var("LIP_DEBUG_EDGES") + .map(|v| v == "1") + .unwrap_or(false); + // Phase-2 hit/miss counters (v2.3.2 diagnostic). 
`uri_*` covers the + // exact-URI `callee_to_callers` index; `name_*` covers the + // name-fragment `callee_name_to_callers` bridge. + let (mut uri_hits, mut uri_misses, mut name_hits, mut name_misses) = + (0u32, 0u32, 0u32, 0u32); { let mut queue: VecDeque<(String, u32)> = VecDeque::new(); cpg_distance.insert(symbol_uri.to_owned(), 0); @@ -1260,28 +1433,64 @@ impl LipDatabase { } // URI-exact callers (same-file or pre-resolved edges). if let Some(callers) = self.callee_to_callers.get(&callee).cloned() { + uri_hits += 1; for caller in callers { if !cpg_distance.contains_key(&caller) { cpg_distance.insert(caller.clone(), depth + 1); queue.push_back((caller, depth + 1)); } } + } else { + uri_misses += 1; } // Name-based callers: catches file-local URIs from other files. - let name = extract_name(&callee); + // Normalise the SCIP descriptor fragment so `SearchSymbols().` + // collides with the tier-1-indexed `SearchSymbols`. + let name = normalize_callee_name(extract_name(&callee)); if !name.is_empty() { if let Some(callers) = self.callee_name_to_callers.get(name).cloned() { + name_hits += 1; for caller in callers { if !cpg_distance.contains_key(&caller) { cpg_distance.insert(caller.clone(), depth + 1); queue.push_back((caller, depth + 1)); } } + } else { + name_misses += 1; } } } } + if debug_edges { + eprintln!( + "[lip-debug-edges] blast_radius_for Phase-2 symbol={} uri_hits={} uri_misses={} name_hits={} name_misses={} cpg_nodes={}", + symbol_uri, uri_hits, uri_misses, name_hits, name_misses, cpg_distance.len() + ); + // If nothing hit, dump a few representative keys from each index + // so we can eyeball the URI-form mismatch. 
+ if uri_hits == 0 && name_hits == 0 { + let uri_keys: Vec<&String> = self.callee_to_callers.keys().take(3).collect(); + let name_keys: Vec<&String> = + self.callee_name_to_callers.keys().take(10).collect(); + let raw = extract_name(symbol_uri); + let normalized = normalize_callee_name(raw); + eprintln!( + "[lip-debug-edges] query_name_raw={:?} normalized={:?} callee_to_callers_total={} sample={:?}", + raw, + normalized, + self.callee_to_callers.len(), + uri_keys + ); + eprintln!( + "[lip-debug-edges] callee_name_to_callers_total={} sample={:?}", + self.callee_name_to_callers.len(), + name_keys + ); + } + } + // ── Phase 3: merge ──────────────────────────────────────────────── // // For each caller symbol from the CPG pass, resolve its defining file, @@ -1389,6 +1598,35 @@ impl LipDatabase { RiskLevel::Low }; + let edges_source = self.file_edges_source.get(&def_uri).copied(); + // v2.3.2 diagnostic — gated on LIP_DEBUG_EDGES=1. Prints the + // symbol_uri / def_uri / hit-or-miss triple so we can spot + // canonicalisation asymmetry between upsert-time and query-time. + if std::env::var("LIP_DEBUG_EDGES") + .map(|v| v == "1") + .unwrap_or(false) + { + let has = edges_source.is_some(); + eprintln!( + "[lip-debug-edges] blast_radius_for symbol={} def_uri={} edges_source_hit={} value={:?} sym_items={} file_items={}", + symbol_uri, + def_uri, + has, + edges_source, + sym_items.len(), + file_distance.len(), + ); + if !has { + // Dump up to 5 keys so we can compare against the insert log. 
+ let keys: Vec<&String> = + self.file_edges_source.keys().take(5).collect(); + eprintln!( + "[lip-debug-edges] file_edges_source sample keys (total={}): {:?}", + self.file_edges_source.len(), + keys + ); + } + } BlastRadiusResult { symbol_uri: symbol_uri.to_owned(), direct_dependents: direct_count, @@ -1398,6 +1636,7 @@ impl LipDatabase { transitive_items, truncated, risk_level, + edges_source, } } @@ -1509,12 +1748,10 @@ impl LipDatabase { } } - let edges_source = self.file_edges_source.get(&canon_file).copied(); results.push(EnrichedBlastRadius { file_uri: canon_file.clone(), static_result, semantic_items, - edges_source, }); } } @@ -1594,12 +1831,10 @@ impl LipDatabase { } } - let edges_source = self.file_edges_source.get(&file_uri).copied(); Some(EnrichedBlastRadius { file_uri, static_result, semantic_items, - edges_source, }) } @@ -2765,6 +3000,26 @@ mod tests { ) } + // ── Helpers ─────────────────────────────────────────────────────────── + + #[test] + fn normalize_callee_name_strips_scip_descriptor_suffixes() { + // SCIP method / function descriptor form. + assert_eq!(normalize_callee_name("SearchSymbols()."), "SearchSymbols"); + assert_eq!(normalize_callee_name("foo()"), "foo"); + // SCIP term form. + assert_eq!(normalize_callee_name("MyField."), "MyField"); + // SCIP type form (trailing `#` already consumed by extract_name, but + // defensively handle residual non-identifier trailers). + assert_eq!(normalize_callee_name("Foo:"), "Foo"); + // Plain tier-1 identifier — unchanged. + assert_eq!(normalize_callee_name("plain_name"), "plain_name"); + // Snake-case / digits preserved. + assert_eq!(normalize_callee_name("do_thing_2()."), "do_thing_2"); + // Empty. + assert_eq!(normalize_callee_name(""), ""); + } + // ── Revision ────────────────────────────────────────────────────────── #[test] @@ -3842,6 +4097,240 @@ impl Greeter { assert_eq!(not_indexed, vec![unknown]); } + // v2.3.2 Issue #1 — tier-1 back-fill URI translation. 
+ // + // SCIP-imported files carry SCIP descriptor fragments (`#NewExporter()`), + // but tier-1 tree-sitter emits plain identifier fragments (`#NewExporter`). + // Without translation, `blast_radius_for` Phase 3's `def_index.get(caller_sym)` + // misses every tier-1-emitted caller URI → Phase 4 falls through to file- + // level items with blank `symbol_uri`. + // + // This test upserts a precomputed file (SCIP-descriptor URIs, no edges) + // pointing at a real on-disk source. The back-fill must translate the + // tier-1 caller URI to the SCIP URI via the file's defs so the caller + // `ImpactItem` carries a non-empty `symbol_uri`. + #[test] + fn tier1_backfill_translates_caller_uri_to_scip_fragment() { + use crate::schema::{OwnedRange, OwnedSymbolInfo, SymbolKind}; + + let dir = tempfile::tempdir().expect("tempdir"); + let src_path = dir.path().join("chain.rs"); + std::fs::write(&src_path, "fn caller() { target(); }\nfn target() {}\n").unwrap(); + let abs = src_path.to_string_lossy(); + let file_uri = format!("lip://local//{}", abs.trim_start_matches('/')); + // SCIP descriptor fragments (distinct from tier-1's plain-identifier form). 
+ let sym_caller = format!("{file_uri}#caller()"); + let sym_target = format!("{file_uri}#target()"); + + let syms = vec![ + OwnedSymbolInfo { + uri: sym_caller.clone(), + kind: SymbolKind::Function, + display_name: "caller".to_owned(), + confidence_score: 90, + is_exported: true, + ..Default::default() + }, + OwnedSymbolInfo { + uri: sym_target.clone(), + kind: SymbolKind::Function, + display_name: "target".to_owned(), + confidence_score: 90, + is_exported: true, + ..Default::default() + }, + ]; + let occs = vec![ + OwnedOccurrence { + symbol_uri: sym_caller.clone(), + range: OwnedRange { + start_line: 0, + start_char: 3, + end_line: 0, + end_char: 9, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }, + OwnedOccurrence { + symbol_uri: sym_target.clone(), + range: OwnedRange { + start_line: 1, + start_char: 3, + end_line: 1, + end_char: 9, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }, + ]; + + let mut db = LipDatabase::new(); + db.upsert_file_precomputed( + file_uri.clone(), + "rust".to_owned(), + "abc".to_owned(), + syms, + occs, + vec![], // empty edges → triggers tier-1 back-fill + ); + + let result = db.blast_radius_for(&sym_target); + let all_items: Vec<_> = result + .direct_items + .iter() + .chain(result.transitive_items.iter()) + .collect(); + + // At least one item should carry the SCIP caller URI — not a blank + // symbol_uri from the file-level fallback path. 
+ assert!( + all_items.iter().any(|i| i.symbol_uri == sym_caller), + "tier-1 back-fill must translate the caller URI to the SCIP descriptor form; \ + got items: {:?}", + all_items + ); + assert!( + all_items.iter().all(|i| !i.symbol_uri.is_empty()), + "no ImpactItem should carry a blank symbol_uri when the caller is defined \ + in a SCIP-imported file; got items: {:?}", + all_items + ); + + // edges_source (v2.3.2) must now surface on BlastRadiusResult (not just EnrichedBlastRadius). + assert_eq!( + result.edges_source, + Some(EdgesSource::ScipWithTier1Edges), + "BlastRadiusResult.edges_source must be populated when back-fill ran" + ); + } + + // v2.3.2 Issue #1 cross-file — tier-1 back-fill callee translation across + // SCIP documents. Tier-1 emits edges with `to_uri = lip://local/#` + // even when the callee is defined in a different file. The same-file + // `translate` map misses, so we must fall back to the global + // `name_to_symbols` index (populated with SCIP display_name entries) to + // resolve cross-file callees. Without this, CKB's merge step sees + // `symbol_uri: ""` on every transitive item. + #[test] + fn tier1_backfill_resolves_cross_file_callee_via_name_index() { + use crate::schema::{OwnedRange, OwnedSymbolInfo, SymbolKind}; + + let dir = tempfile::tempdir().expect("tempdir"); + let caller_path = dir.path().join("caller.rs"); + let target_path = dir.path().join("target.rs"); + std::fs::write(&caller_path, "fn caller() { target(); }\n").unwrap(); + std::fs::write(&target_path, "pub fn target() {}\n").unwrap(); + let caller_abs = caller_path.to_string_lossy(); + let target_abs = target_path.to_string_lossy(); + let caller_uri = format!("lip://local//{}", caller_abs.trim_start_matches('/')); + let target_uri = format!("lip://local//{}", target_abs.trim_start_matches('/')); + // SCIP descriptor fragments — different from tier-1 plain identifiers. 
+ let sym_caller = format!("{caller_uri}#caller()"); + let sym_target = format!("{target_uri}#target()"); + + let caller_syms = vec![OwnedSymbolInfo { + uri: sym_caller.clone(), + kind: SymbolKind::Function, + display_name: "caller".to_owned(), + confidence_score: 90, + is_exported: true, + ..Default::default() + }]; + let caller_occs = vec![OwnedOccurrence { + symbol_uri: sym_caller.clone(), + range: OwnedRange { + start_line: 0, + start_char: 3, + end_line: 0, + end_char: 9, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }]; + let target_syms = vec![OwnedSymbolInfo { + uri: sym_target.clone(), + kind: SymbolKind::Function, + display_name: "target".to_owned(), + confidence_score: 90, + is_exported: true, + ..Default::default() + }]; + let target_occs = vec![OwnedOccurrence { + symbol_uri: sym_target.clone(), + range: OwnedRange { + start_line: 0, + start_char: 7, + end_line: 0, + end_char: 13, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }]; + + let mut db = LipDatabase::new(); + // Import target first so `name_to_symbols["target"]` is populated + // before the caller's tier-1 back-fill runs. + db.upsert_file_precomputed( + target_uri.clone(), + "rust".to_owned(), + "t1".to_owned(), + target_syms, + target_occs, + vec![], + ); + db.upsert_file_precomputed( + caller_uri.clone(), + "rust".to_owned(), + "c1".to_owned(), + caller_syms, + caller_occs, + vec![], + ); + + let result = db.blast_radius_for(&sym_target); + let all_items: Vec<_> = result + .direct_items + .iter() + .chain(result.transitive_items.iter()) + .collect(); + + // Caller (cross-file) must be resolved to its SCIP URI, not emitted + // as a file-level fallback item with empty symbol_uri. 
+ assert!( + all_items.iter().any(|i| i.symbol_uri == sym_caller), + "cross-file caller must resolve to SCIP URI via name_to_symbols fallback; \ + got items: {:?}", + all_items + ); + assert!( + all_items.iter().all(|i| !i.symbol_uri.is_empty()), + "no cross-file ImpactItem should carry a blank symbol_uri; got items: {:?}", + all_items + ); + // `result.edges_source` reflects the *target* file's edges (no outgoing + // calls in `target.rs` → Empty). Verify the caller file recorded the + // back-filled edge separately. + let caller_edges_src = db.file_edges_source.get(&caller_uri).copied(); + assert_eq!( + caller_edges_src, + Some(EdgesSource::ScipWithTier1Edges), + "caller file's back-fill must register ScipWithTier1Edges" + ); + } + #[test] fn blast_radius_batch_file_uri_populated() { let mut db = LipDatabase::new(); diff --git a/bindings/rust/src/query_graph/types.rs b/bindings/rust/src/query_graph/types.rs index 3389727..f0f85b0 100644 --- a/bindings/rust/src/query_graph/types.rs +++ b/bindings/rust/src/query_graph/types.rs @@ -102,17 +102,14 @@ pub enum EdgesSource { pub struct EnrichedBlastRadius { /// The input file URI this result was computed for. pub file_uri: String, - /// The static blast-radius result. + /// The static blast-radius result. `static_result.edges_source` carries + /// the call-edge provenance (moved off `EnrichedBlastRadius` in v2.3.2 + /// so non-enriched `QueryBlastRadius` responses carry it too). #[serde(flatten)] pub static_result: BlastRadiusResult, /// Semantically coupled files/symbols not in the static call graph. /// Empty when `include_semantic` was false or embeddings are unavailable. pub semantic_items: Vec, - /// Provenance for the call edges used to compute `static_result` - /// (v2.3.1). `None` when older daemons serialise this field without - /// any signal — deserialisers default to `None`. 
- #[serde(default, skip_serializing_if = "Option::is_none")] - pub edges_source: Option, } /// A single forward call edge returned by `QueryOutgoingCalls` (v2.3). @@ -254,6 +251,12 @@ pub struct BlastRadiusResult { pub truncated: bool, /// Composite risk level derived from caller count and spread. pub risk_level: RiskLevel, + /// Provenance for the call edges used to compute this result (v2.3.2). + /// Moved from `EnrichedBlastRadius` so `QueryBlastRadius` — not just + /// `QueryBlastRadiusBatch` / `QueryBlastRadiusSymbol` — carries it. + /// `None` when the daemon has no edges recorded for the target's file. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub edges_source: Option, } /// Result for a single sub-query inside a [`ClientMessage::BatchQuery`]. @@ -1541,6 +1544,81 @@ mod tests { serde_json::from_str(&json).expect("deserialize") } + fn blast_radius_fixture() -> BlastRadiusResult { + BlastRadiusResult { + symbol_uri: "lip://scip-go/gomod/foo@v1.0.0/Engine#SearchSymbols().".into(), + direct_dependents: 2, + transitive_dependents: 3, + affected_files: vec!["lip://local//x/y.go".into()], + direct_items: vec![], + transitive_items: vec![], + truncated: false, + risk_level: RiskLevel::Low, + edges_source: Some(EdgesSource::ScipWithTier1Edges), + } + } + + // v2.3.2 Issue — user-observed wire drop of `edges_source` despite + // internal state carrying `Some(ScipWithTier1Edges)`. Verifies that the + // field survives direct `BlastRadiusResult`, the `BlastRadiusResult` + // tuple-variant envelope (internally-tagged), and both + // `EnrichedBlastRadius` flatten sites (Batch / Symbol responses). 
+ #[test] + fn edges_source_survives_all_response_envelopes() { + let br = blast_radius_fixture(); + + let direct = serde_json::to_string(&br).unwrap(); + assert!( + direct.contains("\"edges_source\":\"scip_with_tier1_edges\""), + "direct BlastRadiusResult must emit edges_source; got {direct}" + ); + + let envelope = ServerMessage::BlastRadiusResult(br.clone()); + let envelope_json = serde_json::to_string(&envelope).unwrap(); + assert!( + envelope_json.contains("\"edges_source\":\"scip_with_tier1_edges\""), + "ServerMessage::BlastRadiusResult envelope must carry edges_source; got {envelope_json}" + ); + + let enriched = EnrichedBlastRadius { + file_uri: "lip://local//x/y.go".into(), + static_result: br.clone(), + semantic_items: vec![], + }; + let enriched_json = serde_json::to_string(&enriched).unwrap(); + assert!( + enriched_json.contains("\"edges_source\":\"scip_with_tier1_edges\""), + "flattened EnrichedBlastRadius must carry edges_source; got {enriched_json}" + ); + + let batch = ServerMessage::BlastRadiusBatchResult { + results: vec![enriched.clone()], + not_indexed_uris: vec![], + }; + let batch_json = serde_json::to_string(&batch).unwrap(); + assert!( + batch_json.contains("\"edges_source\":\"scip_with_tier1_edges\""), + "BatchResult's flattened enriched items must carry edges_source; got {batch_json}" + ); + + let sym = ServerMessage::BlastRadiusSymbolResult { + result: Some(enriched), + }; + let sym_json = serde_json::to_string(&sym).unwrap(); + assert!( + sym_json.contains("\"edges_source\":\"scip_with_tier1_edges\""), + "SymbolResult's Some(enriched) must carry edges_source; got {sym_json}" + ); + + // Round-trip: deserialised form must preserve Some(...) too. 
+ let rt = round_trip_server(&envelope); + if let ServerMessage::BlastRadiusResult(rt_br) = rt { + assert_eq!(rt_br.edges_source, Some(EdgesSource::ScipWithTier1Edges)); + } else { + panic!("envelope round-trip variant mismatch"); + } + } + #[test] fn batch_query_nearest_by_text_round_trips() { let msg = ClientMessage::BatchQueryNearestByText { diff --git a/bindings/rust/tests/integration.rs b/bindings/rust/tests/integration.rs index 34131ed..9c495db 100644 --- a/bindings/rust/tests/integration.rs +++ b/bindings/rust/tests/integration.rs @@ -2053,7 +2053,7 @@ async fn daemon_blast_radius_edges_source_tier1() { ServerMessage::BlastRadiusSymbolResult { result } => { let e = result.expect("expected Some result"); assert_eq!( - e.edges_source, + e.static_result.edges_source, Some(EdgesSource::Tier1), "Tier-1 upsert must set edges_source = Tier1" ); @@ -2076,9 +2076,12 @@ async fn daemon_blast_radius_edges_source_tier1() { assert!( results .iter() - .all(|e| e.edges_source == Some(EdgesSource::Tier1)), + .all(|e| e.static_result.edges_source == Some(EdgesSource::Tier1)), "every batch entry must carry edges_source = Tier1; got {:?}", - results.iter().map(|e| e.edges_source).collect::>(), + results + .iter() + .map(|e| e.static_result.edges_source) + .collect::>(), ); } other => panic!("expected BlastRadiusBatchResult, got {other:?}"), @@ -2167,7 +2170,7 @@ async fn daemon_precomputed_tier1_edge_fill_on_disk() { ServerMessage::BlastRadiusSymbolResult { result } => { let e = result.expect("expected Some result"); assert_eq!( - e.edges_source, + e.static_result.edges_source, Some(EdgesSource::ScipWithTier1Edges), "precomputed + on-disk source should back-fill tier-1 edges" ); diff --git a/docs/LIP_SPEC.mdx b/docs/LIP_SPEC.mdx index e4873b3..91a92f6 100644 --- a/docs/LIP_SPEC.mdx +++ b/docs/LIP_SPEC.mdx @@ -801,7 +801,9 @@ lip.query.blast_radius_batch(changed_file_uris, min_score?) 
→ { risk_level: "low" | "medium" | "high", truncated: bool, semantic_items: [SemanticItem], // only when min_score is set - edges_source: EdgesSource, // v2.3.1, see below + edges_source: EdgesSource, // v2.3.1; v2.3.2 moved onto the + // underlying BlastRadiusResult so + // `query.blast_radius` carries it too }] } @@ -1294,7 +1296,7 @@ lip-protocol/ - [x] **`BatchQueryNearestByText`** — embed N query strings in a single round-trip and return one nearest-neighbour list per query. Replaces N sequential `QueryNearestByText` calls in multi-query workflows. - [x] **`QueryNearestBySymbol`** — find symbols similar to a given symbol URI. The daemon embeds the symbol's text (display_name + signature + doc) on demand and searches the symbol embedding store. `EmbeddingBatch` now routes `lip://` URIs to `symbol_embeddings` and `file://` URIs to `file_embeddings`. - [x] **`BatchAnnotationGet`** — retrieve an annotation key for multiple symbol URIs under a single db lock. Replaces N sequential `AnnotationGet` calls; safe inside `BatchQuery`. -- [x] **`IndexChanged` push notification** — emitted to all active sessions after every `Delta::Upsert` via the broadcast channel. Carries `indexed_files` count and `affected_uris`. Enables precise cache invalidation without polling `QueryIndexStatus`. +- [x] **`IndexChanged` push notification** — emitted to every active session *other than the one that produced the `Delta::Upsert`* via the broadcast channel. Internally, every broadcast message is wrapped in a `Notification { source_session: Option, message: ServerMessage }` envelope tagged with the emitter's session id; each session's drain loop skips envelopes whose `source_session` matches its own id so an importer never receives echoes of its own emissions. Tier 2 upgrades emit with `source_session: None` and reach every session. Carries `indexed_files` count and `affected_uris`. Enables precise cache invalidation without polling `QueryIndexStatus`. 
- [x] **`Handshake` / `HandshakeResult`** — clients send `Handshake { client_version }` on connect; daemon replies with `daemon_version` (semver) and `protocol_version` (monotonic integer, currently `1`). Version drift between daemon and client is now detectable at connect time. - [x] **`--managed` flag** (`lip daemon start --managed`) — spawns a background watchdog that polls the parent process every 2 s and exits when the parent has exited. Designed for IDE integrations that manage the daemon as a subprocess. diff --git a/tools/lip-cli/src/cmd/import.rs b/tools/lip-cli/src/cmd/import.rs index 3670617..231960e 100644 --- a/tools/lip-cli/src/cmd/import.rs +++ b/tools/lip-cli/src/cmd/import.rs @@ -117,11 +117,24 @@ pub async fn run(args: ImportArgs) -> anyhow::Result<()> { .as_ref() .map(|m| strip_file_scheme(&m.project_root)) .unwrap_or_default(); + let mut skipped_traversal: usize = 0; let mut deltas: Vec = index .documents .into_iter() - .map(|d| convert_document(d, confidence, &project_root_abs)) + .filter_map(|d| { + if !relative_path_is_within_root(&project_root_abs, &d.relative_path) { + skipped_traversal += 1; + return None; + } + Some(convert_document(d, confidence, &project_root_abs)) + }) .collect(); + if skipped_traversal > 0 { + eprintln!( + "warning: skipped {} SCIP document(s) whose relative_path escapes project_root", + skipped_traversal + ); + } // Also import external symbols as a synthetic document. if !index.external_symbols.is_empty() { @@ -458,6 +471,36 @@ fn build_document_uri(project_root_abs: &str, relative_path: &str) -> String { } } +/// True when `relative_path` stays inside `project_root_abs` after resolving +/// `..` / `.` segments. Used to drop SCIP documents that reference files +/// outside the project tree — scip-go, for example, sometimes emits entries +/// pointing into `$GOCACHE/../Library/Caches/go-build/...` which would be +/// ingested verbatim by the daemon's path-based indexer. 
+/// +/// Pure string-level normalization (does not stat) so it works for SCIP +/// imports on machines other than the one that produced the index. +/// Returns `true` when `project_root_abs` is empty (no anchor to validate +/// against — relative-form URIs are deferred to the daemon). +fn relative_path_is_within_root(project_root_abs: &str, relative_path: &str) -> bool { + if project_root_abs.is_empty() { + return true; + } + let mut depth: i32 = 0; + for seg in relative_path.split('/') { + match seg { + "" | "." => continue, + ".." => { + depth -= 1; + if depth < 0 { + return false; + } + } + _ => depth += 1, + } + } + true +} + fn convert_document(doc: scip::Document, confidence: u8, project_root_abs: &str) -> OwnedDelta { let uri = build_document_uri(project_root_abs, &doc.relative_path); let content_hash = sha256_hex(doc.relative_path.as_bytes()); @@ -1252,4 +1295,39 @@ mod tests { let uri = build_document_uri("/repo", "/src/foo.rs"); assert_eq!(uri, "lip://local//repo/src/foo.rs"); } + + // ── v2.3.2 path-traversal guard (Issue #3) ────────────────────────────── + + #[test] + fn relative_path_within_root_accepts_normal_paths() { + assert!(relative_path_is_within_root("/repo", "src/foo.rs")); + assert!(relative_path_is_within_root("/repo", "a/b/c/d.go")); + assert!(relative_path_is_within_root("/repo", "./src/foo.rs")); + assert!(relative_path_is_within_root("/repo", "a/./b.rs")); + } + + #[test] + fn relative_path_within_root_rejects_traversal() { + // scip-go can emit documents pointing into $GOCACHE. Reject them. + assert!(!relative_path_is_within_root( + "/Users/lisa/Work/Projects/CKB/src", + "../../../../Library/Caches/go-build/abc/def.go" + )); + assert!(!relative_path_is_within_root("/repo", "../outside.rs")); + // Even with intermediate descent, net escape is rejected. 
+ assert!(!relative_path_is_within_root("/repo", "a/../../outside.rs")); + } + + #[test] + fn relative_path_within_root_allows_internal_parent_segments() { + // `a/b/../c.rs` stays inside the root — don't over-reject. + assert!(relative_path_is_within_root("/repo", "a/b/../c.rs")); + } + + #[test] + fn relative_path_within_root_empty_root_defers_to_daemon() { + // Without an anchor, we can't validate — accept and let the daemon + // resolve via registered roots. + assert!(relative_path_is_within_root("", "../../outside.rs")); + } } diff --git a/website/src/pages/docs/spec.mdx b/website/src/pages/docs/spec.mdx index 1c35189..05bf488 100644 --- a/website/src/pages/docs/spec.mdx +++ b/website/src/pages/docs/spec.mdx @@ -1092,7 +1092,7 @@ lip-protocol/ - [x] **`BatchQueryNearestByText`** — embed N query strings in one HTTP round-trip and return one nearest-neighbour list per query. Replaces N sequential `QueryNearestByText` calls. - [x] **`QueryNearestBySymbol`** — find symbols semantically similar to a given `lip://` URI. The daemon embeds the symbol's display name, signature, and docs on demand and searches the per-symbol embedding store. - [x] **`BatchAnnotationGet`** — retrieve an annotation key for multiple symbol URIs under a single db lock. Replaces N sequential `AnnotationGet` calls. -- [x] **`IndexChanged` push notification** — emitted to all active sessions after every successful `Delta::Upsert`. Carries `indexed_files` count and `affected_uris`. Enables precise cache invalidation without polling `QueryIndexStatus`. +- [x] **`IndexChanged` push notification** — emitted to every active session *other than the one that produced the `Delta::Upsert`* via the broadcast channel. Internally wrapped in a `Notification { source_session: Option, message: ServerMessage }` envelope so the emitting session skips its own echoes (Tier 2 upgrades use `source_session: None` and reach every session). Carries `indexed_files` count and `affected_uris`. 
Enables precise cache invalidation without polling `QueryIndexStatus`. - [x] **`Handshake` / `HandshakeResult`** — clients send `Handshake { client_version }` on connect; daemon replies with `daemon_version` (semver) and `protocol_version` (monotonic integer, currently `1`). Version drift between independently updated daemon and clients is detectable at connect time. - [x] **`--managed` flag** (`lip daemon start --managed`) — spawns a parent-process watchdog that calls `std::process::exit(0)` when the parent process exits. Designed for IDE integrations (CKB, VS Code extension) that manage the daemon as a subprocess. - [x] **`EmbeddingBatch` URI routing** — `lip://` URIs now route to `symbol_embeddings` (new field); `file://` URIs continue to use `file_embeddings`. Enables per-symbol dense vector search via `QueryNearestBySymbol`. From f648a254a78db88f03e8cd7ff927bd3262ff19ec Mon Sep 17 00:00:00 2001 From: Lisa Date: Fri, 24 Apr 2026 13:53:03 +0200 Subject: [PATCH 13/18] fix: Phase-3 fallback for tier-1-form caller URIs + split edges_src diagnostic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug D (CKB testdrive follow-up): when the tier-1 back-fill resolver's `translate` and `name_to_symbols` indexes both miss for a caller name, the back-fill preserves the raw tier-1 URI (`lip://local//#`) as the caller in `callee_to_callers`. `def_index` was never populated for that URI — only SCIP occurrences register there — so Phase 3 of `blast_radius_for` skipped every such caller and Phase 4 emitted 100% blank `symbol_uri` in the CKB testdrive. Phase 3 now falls back to deriving the file URI by stripping the `#` fragment when `def_index` misses and the caller URI carries the `lip://local/` scheme, using the caller URI verbatim as `symbol_uri`. No double-indexing required. 
Regression test imports a caller file with no SCIP symbols against an on-disk source so the resolver must miss, then asserts the ImpactItem carries the full tier-1 caller URI rather than a blank. Also split the LIP_DEBUG_EDGES `upsert_precomputed` log into `scip_pairs` / `tier1_pairs` — the previous `pairs=N` total was ambiguous between "N from SCIP" (→ ScipOnly) and "0 from SCIP, N from back-fill" (→ ScipWithTier1Edges), masking upstream SCIP producer drift as LIP regression. Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 4 +- bindings/rust/src/query_graph/db.rs | 139 +++++++++++++++++++++++++++- 2 files changed, 137 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48431e9..8e6dcec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ All notable changes to this project are documented here. ## [2.3.2] — 2026-04-24 -**CKB testdrive follow-up.** Five correctness fixes discovered after v2.3.1 shipped and CKB began consuming `EnrichedBlastRadius` end-to-end. `protocol_version` stays at `2`; the only schema change moves an existing field between structurally-nested records and is wire-compatible via `#[serde(flatten)]`. +**CKB testdrive follow-up.** Six correctness fixes discovered after v2.3.1 shipped and CKB began consuming `EnrichedBlastRadius` end-to-end. `protocol_version` stays at `2`; the only schema change moves an existing field between structurally-nested records and is wire-compatible via `#[serde(flatten)]`. ### Changed @@ -24,6 +24,8 @@ All notable changes to this project are documented here. - **SCIP-descriptor / tier-1-identifier name-fragment mismatch in `callee_name_to_callers`.** Tier-1 extractor indexes plain identifiers (`SearchSymbols`); SCIP descriptors carry suffix sigils (`SearchSymbols().`, `MyField.`, `Foo:`). Phase-2 BFS in `blast_radius_for` did `extract_name(callee)` without stripping the sigils, so cross-provider lookups always missed even when both providers had indexed the same function. 
Added `normalize_callee_name(fragment)` — truncates at the first `(`, then trims trailing non-identifier chars — and applied it at all four `callee_name_to_callers` insert sites plus the BFS lookup site, so SCIP and tier-1 callees now share keys. Unit test `normalize_callee_name_strips_scip_descriptor_suffixes` covers the six canonical SCIP descriptor shapes. +- **Blank `symbol_uri` when tier-1 back-fill preserves a raw caller URI.** After the back-fill resolver falls back to `edge.from_uri` (both `translate` and `name_to_symbols` miss for the caller name — e.g. a caller function not captured as a SCIP `SymbolInformation`), `callee_to_callers` stores the raw tier-1 URI `lip://local//#`. `def_index` was never populated for that URI (it only records SCIP occurrences), so Phase 3 of `blast_radius_for` skipped every such caller and Phase 4 fell through to the file-level fallback — producing 100 % blank `symbol_uri` in the CKB testdrive against real projects. Phase 3 now derives the file URI by stripping the `#` fragment when `def_index` misses and the caller URI has the `lip://local/` scheme, using the caller URI verbatim as `symbol_uri`. No double-indexing required. Regression test `blast_radius_phase3_fallback_for_tier1_caller_uri` imports a caller file with no SCIP symbols (forcing the resolver miss) against an on-disk source whose tier-1 edge walks to a SCIP-indexed target, then asserts the `ImpactItem` carries the full tier-1 caller URI rather than a blank. + ### Added - **`LIP_DEBUG_EDGES=1` diagnostic gating.** `upsert_file_precomputed`, `blast_radius_for` Phase-2 BFS, and `write_message` (wire output) emit focused `[lip-debug-edges]` traces to stderr when the env var is set. Zero-cost and silent when unset. The wire log now reports `has_edges_source` / `edges_source_count` / `body_bytes` + 500-char head instead of a truncated 2 KB tail, so edges_source presence on the wire can be confirmed without scrolling through multi-kilobyte bodies. 
diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index f20ac89..7c81ffc 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -832,6 +832,10 @@ impl LipDatabase { // already in `def_index`, so we can look them up by name and rewrite // the tier-1 caller URI in-place. Callees stay tier-1 because the BFS // walks `callee_name_to_callers` via name fragment regardless. + // Snapshot the SCIP-origin count before any back-fill can append. + // The diagnostic at the end splits scip_pairs vs tier1_pairs so the + // log unambiguously explains which branch produced which edges. + let scip_pairs = pairs.len(); let edges_src = if !pairs.is_empty() { EdgesSource::ScipOnly } else if let Some(path) = crate::daemon::watcher::uri_to_path(&uri) { @@ -933,11 +937,14 @@ impl LipDatabase { .map(|v| v == "1") .unwrap_or(false) { + let tier1_pairs = pairs.len().saturating_sub(scip_pairs); eprintln!( - "[lip-debug-edges] upsert_precomputed uri={} edges_src={:?} pairs={}", + "[lip-debug-edges] upsert_precomputed uri={} edges_src={:?} pairs={} scip_pairs={} tier1_pairs={}", uri, edges_src, - pairs.len() + pairs.len(), + scip_pairs, + tier1_pairs ); } self.file_edges_source.insert(uri.clone(), edges_src); @@ -1504,10 +1511,31 @@ impl LipDatabase { if caller_sym == symbol_uri { continue; // skip the target itself } - if let Some((file_uri, _)) = self.def_index.get(caller_sym) { - let prev_dist = file_distance.get(file_uri).copied().unwrap_or(u32::MAX); + // Prefer def_index when present — it's the authoritative mapping. + // Fall back to deriving the file URI from the caller URI itself when + // the caller came from tier-1 back-fill (which inserts edges keyed + // by `lip://local/#` without populating def_index). 
+ let file_uri_opt = self + .def_index + .get(caller_sym) + .map(|(f, _)| f.clone()) + .or_else(|| { + if caller_sym.starts_with("lip://local/") { + let hash_idx = caller_sym.rfind('#')?; + let candidate = &caller_sym[..hash_idx]; + if self.file_inputs.contains_key(candidate) { + Some(candidate.to_owned()) + } else { + None + } + } else { + None + } + }); + if let Some(file_uri) = file_uri_opt { + let prev_dist = file_distance.get(&file_uri).copied().unwrap_or(u32::MAX); file_distance.insert(file_uri.clone(), sym_dist.min(prev_dist)); - sym_items.push((caller_sym.clone(), file_uri.clone(), sym_dist)); + sym_items.push((caller_sym.clone(), file_uri, sym_dist)); } } @@ -4331,6 +4359,107 @@ impl Greeter { ); } + // v2.3.2 Issue #2 / Bug D — Phase-3 fallback for tier-1-form caller URIs. + // + // When the tier-1 back-fill resolver's `translate` map AND the global + // `name_to_symbols` index both miss for a caller name, the back-fill + // preserves the raw tier-1 URI (`lip://local//#`) as the + // caller key in `callee_to_callers`. That URI is NOT in `def_index` + // (def_index is populated only from SCIP occurrences). Phase 3 must + // therefore fall back to deriving the file URI by stripping the + // `#` fragment, or the caller gets dropped and every ImpactItem + // degrades to the file-level fallback with a blank `symbol_uri`. + #[test] + fn blast_radius_phase3_fallback_for_tier1_caller_uri() { + use crate::schema::{OwnedRange, OwnedSymbolInfo, SymbolKind}; + + let dir = tempfile::tempdir().expect("tempdir"); + let caller_path = dir.path().join("caller.rs"); + let target_path = dir.path().join("target.rs"); + // `orphan` is picked up by the tier-1 extractor but deliberately + // omitted from the caller file's SCIP symbols below, so the back-fill + // resolver for the caller side must fall back to the raw tier-1 URI. 
+ std::fs::write(&caller_path, "fn orphan() { target(); }\n").unwrap(); + std::fs::write(&target_path, "pub fn target() {}\n").unwrap(); + let caller_abs = caller_path.to_string_lossy(); + let target_abs = target_path.to_string_lossy(); + let caller_uri = format!("lip://local//{}", caller_abs.trim_start_matches('/')); + let target_uri = format!("lip://local//{}", target_abs.trim_start_matches('/')); + // SCIP descriptor form for target — matches what scip-go/scip-clang emit. + let sym_target = format!("{target_uri}#target()"); + // The raw tier-1 URI that the back-fill will keep for the caller + // because `orphan` is neither in caller's display_name set nor in + // `name_to_symbols` from any other file. + let tier1_caller_sym = format!("{caller_uri}#orphan"); + + let target_syms = vec![OwnedSymbolInfo { + uri: sym_target.clone(), + kind: SymbolKind::Function, + display_name: "target".to_owned(), + confidence_score: 90, + is_exported: true, + ..Default::default() + }]; + let target_occs = vec![OwnedOccurrence { + symbol_uri: sym_target.clone(), + range: OwnedRange { + start_line: 0, + start_char: 7, + end_line: 0, + end_char: 13, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }]; + + let mut db = LipDatabase::new(); + db.upsert_file_precomputed( + target_uri.clone(), + "rust".to_owned(), + "t1".to_owned(), + target_syms, + target_occs, + vec![], + ); + // Caller imported with NO SCIP symbols — forces the back-fill + // resolver to miss for `orphan` and keep the raw tier-1 URI. 
+ db.upsert_file_precomputed( + caller_uri.clone(), + "rust".to_owned(), + "c1".to_owned(), + vec![], + vec![], + vec![], + ); + + let result = db.blast_radius_for(&sym_target); + let all_items: Vec<_> = result + .direct_items + .iter() + .chain(result.transitive_items.iter()) + .collect(); + + // Option (b) fallback: the tier-1 caller URI survives Phase 3 and + // is emitted as a symbol-level ImpactItem, not a file-level blank. + assert!( + all_items + .iter() + .any(|i| i.symbol_uri == tier1_caller_sym && i.file_uri == caller_uri), + "Phase 3 must fall back to stripping `#` when def_index misses \ + for a tier-1-form caller URI; got items: {:?}", + all_items + ); + assert!( + all_items.iter().all(|i| !i.symbol_uri.is_empty()), + "no ImpactItem should carry a blank symbol_uri under the option-(b) \ + fallback; got items: {:?}", + all_items + ); + } + #[test] fn blast_radius_batch_file_uri_populated() { let mut db = LipDatabase::new(); From dbf0a3eb30380d72adeb17cf291ebfa01a2e90d0 Mon Sep 17 00:00:00 2001 From: Lisa Date: Fri, 24 Apr 2026 17:39:23 +0200 Subject: [PATCH 14/18] =?UTF-8?q?feat:=20v2.3.3=20=E2=80=94=20QueryOutgoin?= =?UTF-8?q?gImpact=20(forward-direction=20twin=20of=20QueryBlastRadiusSymb?= =?UTF-8?q?ol)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Additive RPC so CKB can query the forward call-graph direction with the same enriched envelope and edges_source provenance gating as blast radius. BFS over caller_to_callees, depth clamped 1..=8, NODE_LIMIT=200. Symmetric Bug-D-style #-strip fallback on the callee side. Semantic enrichment via SemanticImpactItem with Static/Semantic/Both tagging (symbol embedding preferred, file embedding fallback). edges_source lives on OutgoingImpactStatic so CKB can apply the same EdgesSourceEmpty → skip fold gate. 
Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 10 + Cargo.lock | 6 +- Cargo.toml | 2 +- bindings/rust/src/daemon/session.rs | 19 ++ bindings/rust/src/query_graph/db.rs | 387 ++++++++++++++++++++++++- bindings/rust/src/query_graph/types.rs | 133 +++++++++ 6 files changed, 551 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e6dcec..4fff706 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,16 @@ All notable changes to this project are documented here. ## [Unreleased] +## [2.3.3] — 2026-04-24 + +**Outgoing-impact symmetry.** Adds a single additive RPC so CKB can query the forward direction of the call graph with the same enriched envelope and provenance gating that `QueryBlastRadiusSymbol` already provides for the reverse direction. `protocol_version` stays at `2`; pre-v2.3.3 daemons reply `UnknownMessage`. + +### Added + +- **`QueryOutgoingImpact { symbol_uri, depth?, min_score? }` → `OutgoingImpactResult { result: Option }`** — forward-direction twin of `QueryBlastRadiusSymbol`. BFS walks `caller_to_callees` starting from `symbol_uri`, splits direct vs. transitive hops, and wraps the static result in an envelope flattened with `#[serde(flatten)] static_result: OutgoingImpactStatic` so `edges_source` lives on the inner struct (matching the v2.3.2 shape for blast radius). `depth` is clamped to `1..=8` with a default of 8; `NODE_LIMIT=200` bounds the BFS frontier and trips `truncated: true` on overflow. Semantic enrichment reuses `SemanticImpactItem { source: Static | Semantic | Both }`: symbol-level embedding is preferred, with file-level embedding as the fallback seed, and static-hit files are tagged `Both` when their URI also appears in the nearest-embedding set. The Bug-D-style `#`-strip fallback from v2.3.2 Phase 3 is applied symmetrically on the callee side, so tier-1 URIs with no `def_index` entry still resolve to their file URI instead of producing blank `symbol_uri`. 
`edges_source: Option` on `OutgoingImpactStatic` mirrors blast radius so CKB can apply the same `EdgesSourceEmpty → skip fold` provenance gate. Advertised as `query_outgoing_impact` in `HandshakeResult.supported_messages`; round-trip tests `query_outgoing_impact_round_trips`, `query_outgoing_impact_is_batchable`, `outgoing_impact_result_round_trips` cover the wire shape, and db-level tests `outgoing_impact_direct_and_transitive` + `outgoing_impact_phase3_fallback_for_tier1_callee_uri` cover BFS correctness and the Phase-3 fallback. + +--- + ## [2.3.2] — 2026-04-24 **CKB testdrive follow-up.** Six correctness fixes discovered after v2.3.1 shipped and CKB began consuming `EnrichedBlastRadius` end-to-end. `protocol_version` stays at `2`; the only schema change moves an existing field between structurally-nested records and is wire-compatible via `#[serde(flatten)]`. diff --git a/Cargo.lock b/Cargo.lock index 25fb35e..95ccffa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1074,7 +1074,7 @@ checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "lip-cli" -version = "2.3.2" +version = "2.3.3" dependencies = [ "anyhow", "clap", @@ -1093,7 +1093,7 @@ dependencies = [ [[package]] name = "lip-core" -version = "2.3.2" +version = "2.3.3" dependencies = [ "anyhow", "criterion", @@ -1130,7 +1130,7 @@ dependencies = [ [[package]] name = "lip-registry" -version = "2.3.2" +version = "2.3.3" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 1cb3599..845005a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ ] [workspace.package] -version = "2.3.2" +version = "2.3.3" edition = "2021" rust-version = "1.78" authors = ["Lisa Welsch "] diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index 6058e9b..769f853 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -469,6 +469,16 @@ impl Session { ServerMessage::OutgoingCallsResult 
{ edges, truncated } } + ClientMessage::QueryOutgoingImpact { + symbol_uri, + depth, + min_score, + } => { + let mut db = self.db.lock().await; + let result = db.outgoing_impact_for(&symbol_uri, depth, min_score); + ServerMessage::OutgoingImpactResult { result } + } + ClientMessage::QueryWorkspaceSymbols { query, limit, @@ -2086,6 +2096,15 @@ fn process_query_sync( ok(ServerMessage::OutgoingCallsResult { edges, truncated }) } + ClientMessage::QueryOutgoingImpact { + symbol_uri, + depth, + min_score, + } => { + let result = db.outgoing_impact_for(&symbol_uri, depth, min_score); + ok(ServerMessage::OutgoingImpactResult { result }) + } + ClientMessage::QueryWorkspaceSymbols { query, limit, diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index 7c81ffc..1c3c076 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -13,8 +13,8 @@ use std::sync::Arc; use crate::indexer::{language::Language, Tier1Indexer}; use crate::query_graph::types::{ - ApiSurface, BlastRadiusResult, EdgesSource, EnrichedBlastRadius, ImpactItem, ImpactSource, - RiskLevel, SemanticImpactItem, SimilarSymbol, + ApiSurface, BlastRadiusResult, EdgesSource, EnrichedBlastRadius, EnrichedOutgoingImpact, + ImpactItem, ImpactSource, OutgoingImpactStatic, RiskLevel, SemanticImpactItem, SimilarSymbol, }; use crate::schema::EdgeKind; use crate::schema::{ @@ -1919,6 +1919,219 @@ impl LipDatabase { (edges, truncated) } + /// Forward-direction symbol impact with optional semantic enrichment (v2.3.3). + /// + /// Symmetric to [`Self::blast_radius_for_symbol`]. Runs a forward BFS + /// over `caller_to_callees` starting at `symbol_uri`, groups callees + /// by distance (direct = 1, transitive >= 2), and surfaces the same + /// [`EdgesSource`] provenance as the incoming-direction query. + /// + /// `depth` clamps to `[1, 8]`; `None` defaults to 8. `min_score` gates + /// semantic enrichment (embedding NN on the target's embedding). 
The + /// same tier-1-URI fallback used in Phase 3 of `blast_radius_for` + /// (strip `#` when `def_index` misses) applies here so callees + /// emitted by the Tier-1 back-fill resolve to `ImpactItem` rows with + /// a non-empty `symbol_uri`. + /// + /// Returns `None` when the symbol has no known defining file. + pub fn outgoing_impact_for( + &mut self, + symbol_uri: &str, + depth: Option, + min_score: Option, + ) -> Option { + const NODE_LIMIT: usize = 200; + let depth = depth.unwrap_or(8).clamp(1, 8); + + let canon_symbol = self.canonicalize_uri(symbol_uri); + let symbol_uri = canon_symbol.as_str(); + + // Resolve the symbol's defining file. Fail closed if we can't — + // downstream enrichment needs it, and the caller can't distinguish + // "zero callees" from "unknown symbol" without this signal. + let file_uri = self.def_index.get(symbol_uri).map(|(f, _)| f.clone())?; + if !self.file_inputs.contains_key(file_uri.as_str()) { + return None; + } + let threshold = min_score.unwrap_or(0.6); + + // ── Forward BFS over caller_to_callees ─────────────────────────── + let mut callee_distance: HashMap = HashMap::new(); + let mut truncated = false; + { + let mut frontier: Vec = vec![symbol_uri.to_owned()]; + let mut visited: HashSet = HashSet::new(); + visited.insert(symbol_uri.to_owned()); + + 'bfs: for hop in 1..=depth { + let mut next: Vec = Vec::new(); + for caller in &frontier { + let Some(callees) = self.caller_to_callees.get(caller) else { + continue; + }; + for callee in callees { + if callee == symbol_uri { + continue; // skip self-cycles back to the seed + } + if callee_distance.len() > NODE_LIMIT { + truncated = true; + break 'bfs; + } + let prev = callee_distance.get(callee).copied().unwrap_or(u32::MAX); + if hop < prev { + callee_distance.insert(callee.clone(), hop); + } + if visited.insert(callee.clone()) { + next.push(callee.clone()); + } + } + } + if next.is_empty() { + break; + } + frontier = next; + } + } + + // ── Map each callee to its defining file 
───────────────────────── + // + // Mirrors Phase 3 of `blast_radius_for`: prefer `def_index`, fall + // back to stripping `#` from a `lip://local/` URI when the + // tier-1 back-fill kept a raw caller/callee URI that was never + // registered in `def_index` (v2.3.2 Bug D, symmetric direction). + let mut direct_items: Vec = Vec::new(); + let mut transitive_items: Vec = Vec::new(); + let mut seen: HashSet<(String, String)> = HashSet::new(); + for (callee_sym, &dist) in &callee_distance { + let resolved_file = self + .def_index + .get(callee_sym) + .map(|(f, _)| f.clone()) + .or_else(|| { + if callee_sym.starts_with("lip://local/") { + let hash_idx = callee_sym.rfind('#')?; + let candidate = &callee_sym[..hash_idx]; + if self.file_inputs.contains_key(candidate) { + Some(candidate.to_owned()) + } else { + None + } + } else { + None + } + }); + let Some(callee_file) = resolved_file else { + continue; // unresolved external symbol — drop rather than emit blank + }; + if !seen.insert((callee_file.clone(), callee_sym.clone())) { + continue; + } + let item = ImpactItem { + file_uri: callee_file, + symbol_uri: callee_sym.clone(), + distance: dist, + confidence: ImpactItem::confidence_at(dist), + }; + if dist == 1 { + direct_items.push(item); + } else { + transitive_items.push(item); + } + } + + // Deterministic ordering. + direct_items.sort_by(|a, b| { + a.file_uri + .cmp(&b.file_uri) + .then(a.symbol_uri.cmp(&b.symbol_uri)) + }); + transitive_items.sort_by(|a, b| { + a.distance + .cmp(&b.distance) + .then(a.file_uri.cmp(&b.file_uri)) + .then(a.symbol_uri.cmp(&b.symbol_uri)) + }); + + let edges_source = self.file_edges_source.get(&file_uri).copied(); + + let static_result = OutgoingImpactStatic { + target_uri: symbol_uri.to_owned(), + direct_items, + transitive_items, + edges_source, + truncated, + }; + + // ── Semantic enrichment ────────────────────────────────────────── + // + // Same seed as `blast_radius_for_symbol`: the target's own + // embedding. 
Per-symbol preferred, file-level fallback. The + // `source` tagging (`Static | Semantic | Both`) references the set + // of *callee files* we already reached statically, so a semantic + // hit already confirmed by the call graph flips to `Both`. + let mut semantic_items: Vec = Vec::new(); + if min_score.is_some() { + let static_files: HashSet = static_result + .direct_items + .iter() + .chain(static_result.transitive_items.iter()) + .map(|i| i.file_uri.clone()) + .collect(); + + if let Some(sym_embedding) = self.symbol_embeddings.get(symbol_uri).cloned() { + let neighbours = + self.nearest_symbol_by_vector(&sym_embedding, 20, Some(symbol_uri), None); + for n in neighbours { + if n.score < threshold { + continue; + } + let hit_file = self + .def_index + .get(&n.uri) + .map(|(f, _)| f.clone()) + .unwrap_or_else(|| n.uri.clone()); + let source = if static_files.contains(&hit_file) { + ImpactSource::Both + } else { + ImpactSource::Semantic + }; + semantic_items.push(SemanticImpactItem { + file_uri: hit_file, + symbol_uri: n.uri, + similarity: n.score, + source, + }); + } + } else if let Some(file_embedding) = self.file_embeddings.get(&file_uri).cloned() { + let neighbours = self.nearest_by_vector( + &file_embedding, + 20, + Some(&file_uri), + None, + Some(threshold), + ); + for neighbour in neighbours { + let source = if static_files.contains(&neighbour.uri) { + ImpactSource::Both + } else { + ImpactSource::Semantic + }; + semantic_items.push(SemanticImpactItem { + file_uri: neighbour.uri, + symbol_uri: String::new(), + similarity: neighbour.score, + source, + }); + } + } + } + + Some(EnrichedOutgoingImpact { + static_result, + semantic_items, + }) + } + /// Find the symbol URI whose occurrence range contains `(line, col)` in `uri`. /// /// Returns `None` if no occurrence covers the given position. @@ -4460,6 +4673,176 @@ impl Greeter { ); } + // v2.3.3 — QueryOutgoingImpact basic forward-BFS test. 
Two files with + // a single cross-file call chain; assert direct vs transitive split + // and that `edges_source` is surfaced from the target's file. + #[test] + fn outgoing_impact_direct_and_transitive() { + use crate::schema::{OwnedGraphEdge, OwnedRange, OwnedSymbolInfo, SymbolKind}; + + let mut db = LipDatabase::new(); + let root_uri = "lip://local//abs/root.rs".to_owned(); + let mid_uri = "lip://local//abs/mid.rs".to_owned(); + let leaf_uri = "lip://local//abs/leaf.rs".to_owned(); + let sym_root = format!("{root_uri}#root"); + let sym_mid = format!("{mid_uri}#mid"); + let sym_leaf = format!("{leaf_uri}#leaf"); + + let defn_occ = |sym: &str, range: OwnedRange| OwnedOccurrence { + symbol_uri: sym.to_owned(), + range, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }; + let mk_sym = |uri: &str, name: &str| OwnedSymbolInfo { + uri: uri.to_owned(), + kind: SymbolKind::Function, + display_name: name.to_owned(), + confidence_score: 90, + is_exported: true, + ..Default::default() + }; + let range = OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 1, + }; + // root → mid → leaf + let root_edge = OwnedGraphEdge { + from_uri: sym_root.clone(), + to_uri: sym_mid.clone(), + kind: EdgeKind::Calls, + at_range: range.clone(), + }; + let mid_edge = OwnedGraphEdge { + from_uri: sym_mid.clone(), + to_uri: sym_leaf.clone(), + kind: EdgeKind::Calls, + at_range: range.clone(), + }; + + db.upsert_file_precomputed( + leaf_uri.clone(), + "rust".to_owned(), + "l1".to_owned(), + vec![mk_sym(&sym_leaf, "leaf")], + vec![defn_occ(&sym_leaf, range.clone())], + vec![], + ); + db.upsert_file_precomputed( + mid_uri.clone(), + "rust".to_owned(), + "m1".to_owned(), + vec![mk_sym(&sym_mid, "mid")], + vec![defn_occ(&sym_mid, range.clone())], + vec![mid_edge], + ); + db.upsert_file_precomputed( + root_uri.clone(), + "rust".to_owned(), + "r1".to_owned(), + vec![mk_sym(&sym_root, "root")], + 
vec![defn_occ(&sym_root, range.clone())], + vec![root_edge], + ); + + let result = db + .outgoing_impact_for(&sym_root, Some(4), None) + .expect("root should resolve"); + + let direct: Vec<_> = result.static_result.direct_items.iter().collect(); + let trans: Vec<_> = result.static_result.transitive_items.iter().collect(); + assert_eq!(direct.len(), 1, "expected one direct callee (mid)"); + assert_eq!(direct[0].symbol_uri, sym_mid); + assert_eq!(direct[0].distance, 1); + assert_eq!(trans.len(), 1, "expected one transitive callee (leaf)"); + assert_eq!(trans[0].symbol_uri, sym_leaf); + assert_eq!(trans[0].distance, 2); + assert_eq!( + result.static_result.edges_source, + Some(EdgesSource::ScipOnly), + "edges_source should reflect root.rs (ScipOnly — pre-computed edges)" + ); + assert!(result.semantic_items.is_empty(), "min_score=None → no enrichment"); + } + + // v2.3.3 — Bug-D-symmetric test: when the tier-1 back-fill keeps a + // callee in raw `lip://local//#` form (resolver misses in + // both translate + name_to_symbols), outgoing_impact_for must strip + // the `#` fragment to derive file_uri instead of dropping the + // callee entirely. + #[test] + fn outgoing_impact_phase3_fallback_for_tier1_callee_uri() { + use crate::schema::{OwnedRange, OwnedSymbolInfo, SymbolKind}; + + let dir = tempfile::tempdir().expect("tempdir"); + let caller_path = dir.path().join("caller.rs"); + // `orphan` is a free-standing callee name unknown to name_to_symbols, + // so the tier-1 back-fill resolver must fall through to the raw + // tier-1 URI `caller_uri#orphan` as the callee key. 
+ std::fs::write(&caller_path, "fn entry() { orphan(); }\n").unwrap(); + let caller_abs = caller_path.to_string_lossy(); + let caller_uri = format!("lip://local//{}", caller_abs.trim_start_matches('/')); + let sym_entry = format!("{caller_uri}#entry()"); + let tier1_callee = format!("{caller_uri}#orphan"); + + let caller_syms = vec![OwnedSymbolInfo { + uri: sym_entry.clone(), + kind: SymbolKind::Function, + display_name: "entry".to_owned(), + confidence_score: 90, + is_exported: true, + ..Default::default() + }]; + let caller_occs = vec![OwnedOccurrence { + symbol_uri: sym_entry.clone(), + range: OwnedRange { + start_line: 0, + start_char: 3, + end_line: 0, + end_char: 8, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }]; + + let mut db = LipDatabase::new(); + // Caller: empty SCIP edges → tier-1 back-fill runs over on-disk + // source, keeps `orphan` as a raw tier-1 URI since resolver misses. + db.upsert_file_precomputed( + caller_uri.clone(), + "rust".to_owned(), + "c1".to_owned(), + caller_syms, + caller_occs, + vec![], + ); + + let result = db + .outgoing_impact_for(&sym_entry, Some(2), None) + .expect("caller entry symbol should resolve"); + let direct = &result.static_result.direct_items; + assert!( + direct + .iter() + .any(|i| i.symbol_uri == tier1_callee && i.file_uri == caller_uri), + "tier-1-form callee must survive via #-strip fallback; got: {:?}", + direct + ); + assert!( + direct.iter().all(|i| !i.symbol_uri.is_empty()), + "outgoing direct items must never carry blank symbol_uri; got: {:?}", + direct + ); + } + #[test] fn blast_radius_batch_file_uri_populated() { let mut db = LipDatabase::new(); diff --git a/bindings/rust/src/query_graph/types.rs b/bindings/rust/src/query_graph/types.rs index f0f85b0..a5455d5 100644 --- a/bindings/rust/src/query_graph/types.rs +++ b/bindings/rust/src/query_graph/types.rs @@ -119,6 +119,40 @@ pub struct OutgoingCallEdge { pub to_uri: 
String, } +/// Static forward-impact result for `QueryOutgoingImpact` (v2.3.3). +/// +/// Symmetric to [`BlastRadiusResult`] — runs a forward BFS over +/// `caller_to_callees` starting at `target_uri`, groups callees by +/// distance (direct = 1, transitive >= 2), and surfaces the same +/// [`EdgesSource`] provenance the incoming-direction query carries. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct OutgoingImpactStatic { + /// Symbol URI the forward BFS started from. + pub target_uri: String, + /// Direct callees (distance = 1). + pub direct_items: Vec, + /// Transitive callees (distance >= 2). + pub transitive_items: Vec, + /// Call-edge provenance for the *target's defining file*. `None` + /// when the daemon has no edges recorded for that file. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub edges_source: Option, + /// `true` when the BFS stopped early at the node cap (not on depth exhaustion). + pub truncated: bool, +} + +/// Forward-impact result with optional semantic enrichment (v2.3.3). +/// Symmetric envelope to [`EnrichedBlastRadius`]. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EnrichedOutgoingImpact { + #[serde(flatten)] + pub static_result: OutgoingImpactStatic, + /// Callees/files surfaced via embedding similarity to the target. + /// `source: Static | Semantic | Both` marks overlap with the static + /// call graph the same way [`EnrichedBlastRadius`] does. + pub semantic_items: Vec, +} + /// How the client's query matched a workspace symbol's display name (v2.3 #5). /// Discriminator only — not a ranking signal; the numeric `score` on /// [`RankedSymbol`] is what callers sort by. @@ -323,6 +357,12 @@ pub enum ServerMessage { edges: Vec, truncated: bool, }, + /// Response to [`ClientMessage::QueryOutgoingImpact`] (v2.3.3). + /// `result` is `None` when the symbol's defining file is not indexed.
+ OutgoingImpactResult { + #[serde(default, skip_serializing_if = "Option::is_none")] + result: Option, + }, WorkspaceSymbolsResult { symbols: Vec, /// v2.3 Feature #5: per-symbol ranking information. @@ -868,6 +908,22 @@ pub enum ClientMessage { /// to bound response size on pathological graphs. depth: u32, }, + /// Forward-direction symbol impact with optional semantic enrichment + /// (v2.3.3). Symmetric to [`ClientMessage::QueryBlastRadiusSymbol`] — + /// same envelope shape, same threshold semantics, same `edges_source` + /// provenance. Walks `caller_to_callees` instead of `callee_to_callers`. + QueryOutgoingImpact { + symbol_uri: String, + /// BFS depth. `None` or values outside [1,8] clamp to 8 (the safety + /// ceiling). Clients pass smaller depths to bound response size in + /// latency-sensitive workflows. + #[serde(default, skip_serializing_if = "Option::is_none")] + depth: Option, + /// Cosine-similarity threshold for semantic enrichment. `None` + /// skips enrichment entirely. Matches `QueryBlastRadiusSymbol`. + #[serde(default, skip_serializing_if = "Option::is_none")] + min_score: Option, + }, QueryWorkspaceSymbols { query: String, limit: Option, @@ -1377,6 +1433,7 @@ impl ClientMessage { "query_blast_radius_batch", "query_blast_radius_symbol", "query_outgoing_calls", + "query_outgoing_impact", "query_workspace_symbols", "query_document_symbols", "query_dead_symbols", @@ -1454,6 +1511,7 @@ impl ClientMessage { ClientMessage::QueryBlastRadiusBatch { .. } => "query_blast_radius_batch", ClientMessage::QueryBlastRadiusSymbol { .. } => "query_blast_radius_symbol", ClientMessage::QueryOutgoingCalls { .. } => "query_outgoing_calls", + ClientMessage::QueryOutgoingImpact { .. } => "query_outgoing_impact", ClientMessage::QueryWorkspaceSymbols { .. } => "query_workspace_symbols", ClientMessage::QueryDocumentSymbols { .. } => "query_document_symbols", ClientMessage::QueryDeadSymbols { .. 
} => "query_dead_symbols", @@ -1803,6 +1861,76 @@ mod tests { assert_eq!(uri, "file:///src/lib.rs"); } + // ── v2.3.3 round-trip tests ─────────────────────────────────────── + #[test] + fn query_outgoing_impact_round_trips() { + let msg = ClientMessage::QueryOutgoingImpact { + symbol_uri: "lip://local/src/lib.rs#foo".into(), + depth: Some(3), + min_score: Some(0.7), + }; + let rt = round_trip_client(&msg); + let ClientMessage::QueryOutgoingImpact { + symbol_uri, + depth, + min_score, + } = rt + else { + panic!("wrong variant"); + }; + assert_eq!(symbol_uri, "lip://local/src/lib.rs#foo"); + assert_eq!(depth, Some(3)); + assert_eq!(min_score, Some(0.7)); + } + + #[test] + fn query_outgoing_impact_is_batchable() { + assert!(ClientMessage::QueryOutgoingImpact { + symbol_uri: String::new(), + depth: None, + min_score: None, + } + .is_batchable()); + } + + #[test] + fn outgoing_impact_result_round_trips() { + let msg = ServerMessage::OutgoingImpactResult { + result: Some(EnrichedOutgoingImpact { + static_result: OutgoingImpactStatic { + target_uri: "lip://local//abs/lib.rs#foo".into(), + direct_items: vec![ImpactItem { + file_uri: "lip://local//abs/callee.rs".into(), + symbol_uri: "lip://local//abs/callee.rs#bar".into(), + distance: 1, + confidence: ImpactItem::confidence_at(1), + }], + transitive_items: vec![], + edges_source: Some(EdgesSource::ScipWithTier1Edges), + truncated: false, + }, + semantic_items: vec![SemanticImpactItem { + file_uri: "lip://local//abs/other.rs".into(), + symbol_uri: "lip://local//abs/other.rs#baz".into(), + similarity: 0.82, + source: ImpactSource::Semantic, + }], + }), + }; + let json = serde_json::to_string(&msg).expect("serialise"); + let rt: ServerMessage = serde_json::from_str(&json).expect("deserialise"); + let ServerMessage::OutgoingImpactResult { result: Some(r) } = rt else { + panic!("wrong variant"); + }; + assert_eq!(r.static_result.direct_items.len(), 1); + assert_eq!( + r.static_result.edges_source, + 
Some(EdgesSource::ScipWithTier1Edges) + ); + assert_eq!(r.semantic_items.len(), 1); + assert_eq!(r.semantic_items[0].source, ImpactSource::Semantic); + } + // ── v2.3.1 round-trip tests ─────────────────────────────────────── #[test] fn register_project_root_round_trips() { @@ -1967,6 +2095,11 @@ mod tests { symbol_uri: String::new(), depth: 1, }, + ClientMessage::QueryOutgoingImpact { + symbol_uri: String::new(), + depth: None, + min_score: None, + }, ClientMessage::QueryWorkspaceSymbols { query: String::new(), limit: None, From 474b1818b4f9ab446d886e254da406a2480a0927 Mon Sep 17 00:00:00 2001 From: Lisa Date: Fri, 24 Apr 2026 18:13:01 +0200 Subject: [PATCH 15/18] =?UTF-8?q?feat:=20v2.3.4=20=E2=80=94=20module=5Fid?= =?UTF-8?q?=20on=20ImpactItem=20+=20SemanticImpactItem?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three-tier resolution (slice URI prefix → SCIP package descriptor → language-appropriate manifest walk), resolved once at upsert time and stored on FileInput. Surfaces on every ImpactItem and SemanticImpactItem built by blast_radius_for / blast_radius_for_symbol / blast_radius_batch and outgoing_impact_for so CKB's cross-module risk classifier gets a useful grouping key instead of collapsing to ModuleCount=0. Manifest coverage: Cargo.toml, go.mod, package.json, pyproject.toml, setup.py, pubspec.yaml. Unsupported languages (C/C++/Kotlin/Swift/Java) return None. Field is #[serde(default, skip_serializing_if = None)], so the wire shape stays byte-identical for emitters that don't populate it. 
Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 16 + Cargo.lock | 6 +- Cargo.toml | 2 +- bindings/rust/src/query_graph/db.rs | 263 ++++++++++ bindings/rust/src/query_graph/mod.rs | 1 + bindings/rust/src/query_graph/module_id.rs | 551 +++++++++++++++++++++ bindings/rust/src/query_graph/types.rs | 12 + 7 files changed, 847 insertions(+), 4 deletions(-) create mode 100644 bindings/rust/src/query_graph/module_id.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fff706..cb9d739 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,22 @@ All notable changes to this project are documented here. ## [Unreleased] +## [2.3.4] — 2026-04-24 + +**Module-level grouping on impact items.** Adds `module_id: Option<String>` to `ImpactItem` and `SemanticImpactItem` so consumers whose risk classifier weights cross-module blast (CKB's `RecomputeBlastRadius.ModuleCount`) get a useful value instead of the conservative zero that the unioned static-plus-LIP set previously collapsed to. `protocol_version` stays at `2`; the field is `#[serde(default, skip_serializing_if = "Option::is_none")]`, so the wire shape stays byte-identical for emitters that don't populate it and deserialises cleanly on pre-v2.3.4 clients. + +### Added + +- **`ImpactItem.module_id` and `SemanticImpactItem.module_id`** — resolved once at upsert time, stored on `FileInput`, surfaced on every `ImpactItem` / `SemanticImpactItem` built by `blast_radius_for`, `blast_radius_for_symbol`, `blast_radius_batch`, and `outgoing_impact_for`. Three-tier resolution, first hit wins: + + 1. **Slice URI prefix** — `lip://<manager>/<package>@<version>/...` → `"<manager>/<package>"`. Covers mounted dependency slices. + 2. **SCIP package descriptor** — the first `<manager> <package-name>` pair parsed from any SCIP symbol attached to the file → `"<manager>/<package-name>"`. Covers every file imported via `upsert_file_precomputed` whose symbols carry real SCIP metadata (scip-go, scip-typescript, scip-rs, etc.). Rejects `local <id>` sentinels and empty-package descriptors. + 3.
**Manifest walk** — upward walk (depth-capped at 12) from the file's directory, looking for a language-appropriate manifest: `Cargo.toml` (Rust, `[package] name`), `go.mod` (Go, `module <path>`), `package.json` (TypeScript / JavaScript / TSX / JSX, top-level `"name"`), `pyproject.toml` (Python, `[project].name` or `[tool.poetry].name`), `setup.py` (Python, `name="…"` in setup call), `pubspec.yaml` (Dart, top-level `name:`). Parse failures, I/O failures, and unsupported languages (C / C++ / Kotlin / Swift / Java) return `None` rather than propagating. + + Unit-tested per parser (`parse_cargo_toml_extracts_crate_name`, `parse_go_mod_extracts_module_path`, `parse_package_json_ignores_name_inside_values`, `parse_pyproject_toml_{project,poetry}_section`, `parse_setup_py_double_and_single_quotes`, `parse_pubspec_yaml_{name,ignores_nested_name}`, plus Cargo's workspace-only and dependency-section edge cases). Resolver-level tests cover the priority ordering (`resolve_prefers_slice_uri_over_scip`, `resolve_falls_back_to_scip_when_no_slice`, `resolve_walks_manifest_for_tier1_rust_file`, `resolve_returns_none_when_unsupported_language_and_no_scip`). Integration tests (`blast_radius_surfaces_module_id_from_scip_descriptor`, `blast_radius_surfaces_module_id_from_cargo_toml_walk`, `outgoing_impact_surfaces_module_id`) confirm the field reaches the wire through both RPCs. + +--- + ## [2.3.3] — 2026-04-24 **Outgoing-impact symmetry.** Adds a single additive RPC so CKB can query the forward direction of the call graph with the same enriched envelope and provenance gating that `QueryBlastRadiusSymbol` already provides for the reverse direction. `protocol_version` stays at `2`; pre-v2.3.3 daemons reply `UnknownMessage`.
diff --git a/Cargo.lock b/Cargo.lock index 95ccffa..a5010aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1074,7 +1074,7 @@ checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "lip-cli" -version = "2.3.3" +version = "2.3.4" dependencies = [ "anyhow", "clap", @@ -1093,7 +1093,7 @@ dependencies = [ [[package]] name = "lip-core" -version = "2.3.3" +version = "2.3.4" dependencies = [ "anyhow", "criterion", @@ -1130,7 +1130,7 @@ dependencies = [ [[package]] name = "lip-registry" -version = "2.3.3" +version = "2.3.4" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 845005a..b7005dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ ] [workspace.package] -version = "2.3.3" +version = "2.3.4" edition = "2021" rust-version = "1.78" authors = ["Lisa Welsch "] diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index 1c3c076..e9b7499 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -149,6 +149,12 @@ struct FileInput { /// Content hash supplied by the caller (e.g. from `OwnedDocument.content_hash`). /// Used by `stale_files` so Merkle sync works even when `text` is empty. content_hash: String, + /// v2.3.4 — module grouping identifier, resolved at upsert time. + /// Source priority: slice URI > SCIP descriptor > language-appropriate + /// manifest walk. See [`crate::query_graph::module_id`]. `None` for + /// files whose language has no manifest convention and whose URI carries + /// no slice or SCIP metadata. + module_id: Option, } #[derive(Debug)] @@ -531,11 +537,19 @@ impl LipDatabase { /// Register or update a file. Bumps the global revision and invalidates /// cached derived data for `uri`. + /// Return the module grouping id stored for `file_uri`, if any (v2.3.4). 
+ fn module_id_for(&self, file_uri: &str) -> Option { + self.file_inputs + .get(file_uri) + .and_then(|fi| fi.module_id.clone()) + } + pub fn upsert_file(&mut self, uri: String, text: String, language: String) { let uri = self.canonicalize_uri(&uri); self.revision += 1; let rev = self.revision; let content_hash = sha256_hex(text.as_bytes()); + let module_id = crate::query_graph::module_id::resolve_module_id(&uri, &language, &[]); self.file_inputs.insert( uri.clone(), FileInput { @@ -544,6 +558,7 @@ impl LipDatabase { revision: rev, precomputed: false, content_hash, + module_id, }, ); // Invalidate the direct derived caches. api_cache is intentionally kept @@ -677,6 +692,8 @@ impl LipDatabase { let uri = self.canonicalize_uri(&uri); self.revision += 1; let rev = self.revision; + let module_id = + crate::query_graph::module_id::resolve_module_id(&uri, &language, &symbols); self.file_inputs.insert( uri.clone(), FileInput { @@ -685,6 +702,7 @@ impl LipDatabase { revision: rev, precomputed: true, content_hash, + module_id, }, ); @@ -1556,6 +1574,7 @@ impl LipDatabase { symbol_uri: sym.clone(), distance: *dist, confidence: ImpactItem::confidence_at(*dist), + module_id: self.module_id_for(file), }; affected_files_set.insert(file.clone()); if *dist == 1 { @@ -1579,6 +1598,7 @@ impl LipDatabase { symbol_uri: String::new(), distance, confidence: ImpactItem::confidence_at(distance), + module_id: self.module_id_for(file_uri), }; affected_files_set.insert(file_uri.clone()); if distance == 1 { @@ -1743,11 +1763,13 @@ impl LipDatabase { } else { ImpactSource::Semantic }; + let module_id = self.module_id_for(&hit_file); semantic_items.push(SemanticImpactItem { file_uri: hit_file, symbol_uri: n.uri, similarity: n.score, source, + module_id, }); } } else if let Some(file_embedding) = @@ -1766,11 +1788,13 @@ impl LipDatabase { } else { ImpactSource::Semantic }; + let module_id = self.module_id_for(&neighbour.uri); semantic_items.push(SemanticImpactItem { file_uri: neighbour.uri, 
symbol_uri: String::new(), similarity: neighbour.score, source, + module_id, }); } } @@ -1828,11 +1852,13 @@ impl LipDatabase { } else { ImpactSource::Semantic }; + let module_id = self.module_id_for(&hit_file); semantic_items.push(SemanticImpactItem { file_uri: hit_file, symbol_uri: n.uri, similarity: n.score, source, + module_id, }); } } else if let Some(file_embedding) = self.file_embeddings.get(&file_uri).cloned() { @@ -1849,11 +1875,13 @@ impl LipDatabase { } else { ImpactSource::Semantic }; + let module_id = self.module_id_for(&neighbour.uri); semantic_items.push(SemanticImpactItem { file_uri: neighbour.uri, symbol_uri: String::new(), similarity: neighbour.score, source, + module_id, }); } } @@ -2026,11 +2054,13 @@ impl LipDatabase { if !seen.insert((callee_file.clone(), callee_sym.clone())) { continue; } + let module_id = self.module_id_for(&callee_file); let item = ImpactItem { file_uri: callee_file, symbol_uri: callee_sym.clone(), distance: dist, confidence: ImpactItem::confidence_at(dist), + module_id, }; if dist == 1 { direct_items.push(item); @@ -2095,11 +2125,13 @@ impl LipDatabase { } else { ImpactSource::Semantic }; + let module_id = self.module_id_for(&hit_file); semantic_items.push(SemanticImpactItem { file_uri: hit_file, symbol_uri: n.uri, similarity: n.score, source, + module_id, }); } } else if let Some(file_embedding) = self.file_embeddings.get(&file_uri).cloned() { @@ -2116,11 +2148,13 @@ impl LipDatabase { } else { ImpactSource::Semantic }; + let module_id = self.module_id_for(&neighbour.uri); semantic_items.push(SemanticImpactItem { file_uri: neighbour.uri, symbol_uri: String::new(), similarity: neighbour.score, source, + module_id, }); } } @@ -5715,4 +5749,233 @@ impl Greeter { "removed file must not appear in invalidation results" ); } + + // v2.3.4 — ImpactItem.module_id surfaces on blast-radius results when + // the file was imported via SCIP with a parseable package descriptor. 
+ #[test] + fn blast_radius_surfaces_module_id_from_scip_descriptor() { + use crate::schema::{OwnedRange, OwnedSymbolInfo, SymbolKind}; + + let caller_uri = "lip://local//abs/caller.rs".to_owned(); + let target_uri = "lip://local//abs/target.rs".to_owned(); + // SCIP-descriptor-form symbols: the package component ("cargo my-crate") + // is what `resolve_module_id` tier-2 parses. + let sym_caller = "scip-rs cargo my-crate 0.1.0 caller.rs#caller().".to_owned(); + let sym_target = "scip-rs cargo my-crate 0.1.0 target.rs#target().".to_owned(); + + let mk_sym = |uri: &str| OwnedSymbolInfo { + uri: uri.to_owned(), + kind: SymbolKind::Function, + display_name: uri + .rsplit('#') + .next() + .unwrap_or("") + .trim_end_matches("().") + .to_owned(), + confidence_score: 90, + is_exported: true, + ..Default::default() + }; + let defn_occ = |sym: &str| OwnedOccurrence { + symbol_uri: sym.to_owned(), + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 1, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }; + let edge = OwnedGraphEdge { + from_uri: sym_caller.clone(), + to_uri: sym_target.clone(), + kind: EdgeKind::Calls, + at_range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 1, + }, + }; + + let mut db = LipDatabase::new(); + db.upsert_file_precomputed( + caller_uri.clone(), + "rust".to_owned(), + "c1".to_owned(), + vec![mk_sym(&sym_caller)], + vec![defn_occ(&sym_caller)], + vec![edge], + ); + db.upsert_file_precomputed( + target_uri.clone(), + "rust".to_owned(), + "t1".to_owned(), + vec![mk_sym(&sym_target)], + vec![defn_occ(&sym_target)], + vec![], + ); + + let result = db.blast_radius_for(&sym_target); + let direct = result + .direct_items + .iter() + .find(|i| i.file_uri == caller_uri) + .expect("caller should appear as direct impact"); + assert_eq!( + direct.module_id.as_deref(), + Some("cargo/my-crate"), + "module_id must be derived 
from the SCIP package descriptor; got {:?}", + direct.module_id + ); + } + + // v2.3.4 — when no SCIP or slice metadata is present, the manifest walk + // fills module_id from Cargo.toml for tier-1-indexed Rust files. + #[test] + fn blast_radius_surfaces_module_id_from_cargo_toml_walk() { + let dir = tempfile::tempdir().expect("tempdir"); + std::fs::write( + dir.path().join("Cargo.toml"), + "[package]\nname = \"my-local-crate\"\n", + ) + .unwrap(); + let src_dir = dir.path().join("src"); + std::fs::create_dir(&src_dir).unwrap(); + let caller_path = src_dir.join("caller.rs"); + let target_path = src_dir.join("target.rs"); + std::fs::write(&caller_path, "fn caller() { target(); }\n").unwrap(); + std::fs::write(&target_path, "pub fn target() {}\n").unwrap(); + let caller_uri = format!( + "lip://local/{}", + caller_path.to_string_lossy().trim_start_matches('/') + ); + let target_uri = format!( + "lip://local/{}", + target_path.to_string_lossy().trim_start_matches('/') + ); + let caller_uri = format!("lip://local//{}", &caller_uri["lip://local/".len()..]); + let target_uri = format!("lip://local//{}", &target_uri["lip://local/".len()..]); + + let mut db = LipDatabase::new(); + db.upsert_file( + target_uri.clone(), + std::fs::read_to_string(&target_path).unwrap(), + "rust".to_owned(), + ); + db.upsert_file( + caller_uri.clone(), + std::fs::read_to_string(&caller_path).unwrap(), + "rust".to_owned(), + ); + + // Look up target's symbol — the tier-1 extractor emits `#target`. + let sym_target = format!("{target_uri}#target"); + let result = db.blast_radius_for(&sym_target); + let all_items: Vec<_> = result + .direct_items + .iter() + .chain(result.transitive_items.iter()) + .collect(); + assert!( + !all_items.is_empty(), + "tier-1 blast radius should include caller.rs" + ); + // Every item in this test comes from a file under the crate root, + // so all module_ids must resolve to "my-local-crate". 
+ for item in &all_items { + assert_eq!( + item.module_id.as_deref(), + Some("my-local-crate"), + "manifest-walk should fill module_id for tier-1-indexed Rust files; got {:?}", + item + ); + } + } + + // v2.3.4 — the forward twin (QueryOutgoingImpact) also fills module_id, + // confirming the lookup is symmetric with blast radius. + #[test] + fn outgoing_impact_surfaces_module_id() { + use crate::schema::{OwnedGraphEdge, OwnedRange, OwnedSymbolInfo, SymbolKind}; + + let root_uri = "lip://local//abs/root.rs".to_owned(); + let leaf_uri = "lip://local//abs/leaf.rs".to_owned(); + // SCIP descriptors differ per file so we can verify the lookup is + // per-file, not per-query. + let sym_root = "scip-rs cargo pkg-a 0.1.0 root.rs#root().".to_owned(); + let sym_leaf = "scip-rs cargo pkg-b 0.1.0 leaf.rs#leaf().".to_owned(); + + let mk_sym = |uri: &str, name: &str| OwnedSymbolInfo { + uri: uri.to_owned(), + kind: SymbolKind::Function, + display_name: name.to_owned(), + confidence_score: 90, + is_exported: true, + ..Default::default() + }; + let defn_occ = |sym: &str| OwnedOccurrence { + symbol_uri: sym.to_owned(), + range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 1, + }, + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }; + let edge = OwnedGraphEdge { + from_uri: sym_root.clone(), + to_uri: sym_leaf.clone(), + kind: EdgeKind::Calls, + at_range: OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 1, + }, + }; + + let mut db = LipDatabase::new(); + db.upsert_file_precomputed( + root_uri.clone(), + "rust".to_owned(), + "r1".to_owned(), + vec![mk_sym(&sym_root, "root")], + vec![defn_occ(&sym_root)], + vec![edge], + ); + db.upsert_file_precomputed( + leaf_uri.clone(), + "rust".to_owned(), + "l1".to_owned(), + vec![mk_sym(&sym_leaf, "leaf")], + vec![defn_occ(&sym_leaf)], + vec![], + ); + + let result = db + .outgoing_impact_for(&sym_root, Some(4), None) 
+ .expect("root resolves"); + let leaf_item = result + .static_result + .direct_items + .iter() + .find(|i| i.file_uri == leaf_uri) + .expect("leaf is a direct callee"); + assert_eq!( + leaf_item.module_id.as_deref(), + Some("cargo/pkg-b"), + "outgoing_impact must carry the callee's module_id; got {:?}", + leaf_item.module_id + ); + } } diff --git a/bindings/rust/src/query_graph/mod.rs b/bindings/rust/src/query_graph/mod.rs index d3b21f8..c7cabb8 100644 --- a/bindings/rust/src/query_graph/mod.rs +++ b/bindings/rust/src/query_graph/mod.rs @@ -35,6 +35,7 @@ //! roadmap. pub mod db; +pub(crate) mod module_id; pub mod types; pub use db::LipDatabase; diff --git a/bindings/rust/src/query_graph/module_id.rs b/bindings/rust/src/query_graph/module_id.rs new file mode 100644 index 0000000..8b629c9 --- /dev/null +++ b/bindings/rust/src/query_graph/module_id.rs @@ -0,0 +1,551 @@ +//! Module-identifier resolution for `ImpactItem.module_id` (v2.3.4). +//! +//! Resolves a stable "which module does this file belong to" string that CKB's +//! risk classifier uses to weight cross-module blast. Three tiers, tried in +//! order, first hit wins: +//! +//! 1. **Third-party slice URI** — `lip://<manager>/<package>@<version>/...` +//! → `"<manager>/<package>"`. Covers mounted dependency slices. +//! 2. **SCIP symbol descriptor** — the `<manager> <package-name>` pair from any SCIP +//! symbol's package descriptor. Covers every file imported via +//! `upsert_file_precomputed` with real SCIP metadata. +//! 3. **Manifest walk** — upward walk from the file's directory looking for a +//! language-appropriate manifest (Cargo.toml, go.mod, package.json, +//! pyproject.toml / setup.py, pubspec.yaml). Covers tier-1-only local +//! files. +//! +//! Unsupported languages (C/C++/Kotlin/Swift/Java) return `None` from the +//! manifest walk; they still get a value from tiers 1 or 2 if applicable. +//! Everything is best-effort and never blocks indexing: parse errors and I/O +//! failures return `None` rather than propagating.
+ +use std::path::{Path, PathBuf}; + +use crate::schema::OwnedSymbolInfo; + +/// Maximum upward hops during manifest resolution. Twelve is enough for any +/// realistic monorepo (file depth from manifest rarely exceeds 6–8). +const MANIFEST_WALK_DEPTH_CAP: usize = 12; + +/// Resolve the module identifier for a file, trying three tiers in order. +/// +/// `scip_symbols` may be empty for tier-1-only files; the second tier is +/// skipped in that case. +pub(crate) fn resolve_module_id( + uri: &str, + language: &str, + scip_symbols: &[OwnedSymbolInfo], +) -> Option { + if let Some(id) = from_slice_uri(uri) { + return Some(id); + } + for sym in scip_symbols { + if let Some(id) = from_scip_symbol(&sym.uri) { + return Some(id); + } + } + if let Some(path) = crate::daemon::watcher::uri_to_path(uri) { + return from_manifest_walk(&path, language); + } + None +} + +/// Tier 1: slice URIs carry the manager and package in the URI itself. +/// +/// `lip://cargo/serde@1.0.0/...` → `"cargo/serde"`. +/// `lip://local/...` is NOT a slice — returns `None`. +fn from_slice_uri(uri: &str) -> Option { + let rest = uri.strip_prefix("lip://")?; + if rest.starts_with("local/") || rest == "local" { + return None; + } + let first_slash = rest.find('/')?; + let manager = &rest[..first_slash]; + let after_manager = &rest[first_slash + 1..]; + // Package extends to the first `@` (version marker) or `/` (path) — whichever comes first. + let pkg_end = after_manager + .find(|c: char| c == '@' || c == '/') + .unwrap_or(after_manager.len()); + let package = &after_manager[..pkg_end]; + if manager.is_empty() || package.is_empty() { + return None; + } + Some(format!("{manager}/{package}")) +} + +/// Tier 2: SCIP symbols are space-separated ` `. +/// +/// Returns `"/"` when all four header tokens are present and +/// non-sentinel. SCIP's `local ` short-form has only two tokens and +/// returns `None`, as do empty-sentinel packages (`. . .`). 
+fn from_scip_symbol(symbol: &str) -> Option { + let mut parts = symbol.split_whitespace(); + let _scheme = parts.next()?; + let manager = parts.next()?; + let name = parts.next()?; + let _version = parts.next()?; + if manager == "." || name == "." || manager.is_empty() || name.is_empty() { + return None; + } + Some(format!("{manager}/{name}")) +} + +/// Tier 3: walk upward from the file's directory looking for a manifest +/// recognised for `language`. Returns the manifest's "name" value on first +/// hit. +fn from_manifest_walk(file_path: &Path, language: &str) -> Option { + let parsers = manifests_for_language(language); + if parsers.is_empty() { + return None; + } + let mut current: PathBuf = file_path.parent()?.to_path_buf(); + for _ in 0..MANIFEST_WALK_DEPTH_CAP { + for (filename, parser) in &parsers { + let candidate = current.join(filename); + if candidate.is_file() { + if let Ok(text) = std::fs::read_to_string(&candidate) { + if let Some(name) = parser(&text) { + return Some(name); + } + } + } + } + match current.parent() { + Some(p) => current = p.to_path_buf(), + None => break, + } + } + None +} + +type ManifestParser = fn(&str) -> Option; + +fn manifests_for_language(language: &str) -> Vec<(&'static str, ManifestParser)> { + match language.to_ascii_lowercase().as_str() { + "rust" => vec![("Cargo.toml", parse_cargo_toml)], + "go" => vec![("go.mod", parse_go_mod)], + "typescript" | "javascript" | "tsx" | "jsx" | "typescriptreact" | "javascriptreact" => { + vec![("package.json", parse_package_json)] + } + "python" => vec![ + ("pyproject.toml", parse_pyproject_toml), + ("setup.py", parse_setup_py), + ], + "dart" => vec![("pubspec.yaml", parse_pubspec_yaml)], + _ => vec![], + } +} + +/// `[package]` section, `name = "foo"` line. Stops at the first subsequent +/// section header so workspace roots whose `[workspace] members = [...]` +/// precedes `[package]` still parse. 
+fn parse_cargo_toml(text: &str) -> Option { + let mut in_package = false; + for line in text.lines() { + let t = strip_toml_comment(line).trim(); + if t.starts_with('[') && t.ends_with(']') { + in_package = t == "[package]"; + continue; + } + if !in_package { + continue; + } + if let Some(rest) = t.strip_prefix("name") { + if let Some(name) = parse_assignment_value(rest) { + return Some(name); + } + } + } + None +} + +/// `module github.com/foo/bar` — first `module` directive wins. +fn parse_go_mod(text: &str) -> Option { + for line in text.lines() { + let t = strip_go_comment(line).trim(); + if let Some(rest) = t.strip_prefix("module") { + let rest = rest.trim_start(); + if rest.is_empty() { + continue; + } + let name = rest.trim().trim_matches('"'); + if !name.is_empty() { + return Some(name.to_owned()); + } + } + } + None +} + +/// JSON "name": "foo" at top level. Conservative: bails if the value is not +/// a double-quoted string. Doesn't attempt full JSON parsing — package.json +/// may contain comments in workspaces that ship `package.json5`, but the +/// common case is well-formed JSON. +fn parse_package_json(text: &str) -> Option { + let mut idx = 0; + let bytes = text.as_bytes(); + while idx < bytes.len() { + if let Some(k) = find_json_key(&text[idx..], "name") { + let after = &text[idx + k..]; + if let Some(value) = parse_json_string_value(after) { + if !value.is_empty() { + return Some(value); + } + } + idx += k + 1; + } else { + break; + } + } + None +} + +/// Look for `"name": "..."` — returns the offset *past* the key, at the colon +/// or beyond. +fn find_json_key(text: &str, key: &str) -> Option { + let pattern = format!("\"{key}\""); + let mut search_start = 0; + while let Some(pos) = text[search_start..].find(&pattern) { + let abs = search_start + pos; + // Ensure the next non-whitespace char is `:` (so this is a key, not a value). 
+ let after = &text[abs + pattern.len()..]; + let trimmed = after.trim_start(); + if trimmed.starts_with(':') { + return Some(abs + pattern.len()); + } + search_start = abs + pattern.len(); + } + None +} + +fn parse_json_string_value(after_key: &str) -> Option { + let colon_idx = after_key.find(':')?; + let after_colon = &after_key[colon_idx + 1..]; + let trimmed = after_colon.trim_start(); + if !trimmed.starts_with('"') { + return None; + } + let body = &trimmed[1..]; + // No escape handling — package names never contain backslashes or quotes. + let end = body.find('"')?; + Some(body[..end].to_owned()) +} + +/// `[project]` or `[tool.poetry]` section, `name = "foo"` line. +fn parse_pyproject_toml(text: &str) -> Option { + let mut section = String::new(); + for line in text.lines() { + let t = strip_toml_comment(line).trim(); + if t.starts_with('[') && t.ends_with(']') { + section = t[1..t.len() - 1].to_owned(); + continue; + } + if section != "project" && section != "tool.poetry" { + continue; + } + if let Some(rest) = t.strip_prefix("name") { + if let Some(name) = parse_assignment_value(rest) { + return Some(name); + } + } + } + None +} + +/// `setup(name="foo", ...)` — only tolerates the single common form. +fn parse_setup_py(text: &str) -> Option { + let mut idx = 0; + while let Some(pos) = text[idx..].find("name") { + let abs = idx + pos; + let after = &text[abs + 4..]; + let trimmed = after.trim_start(); + if !trimmed.starts_with('=') { + idx = abs + 4; + continue; + } + let after_eq = trimmed[1..].trim_start(); + let quote = after_eq.chars().next()?; + if quote != '"' && quote != '\'' { + idx = abs + 4; + continue; + } + let body = &after_eq[1..]; + if let Some(end) = body.find(quote) { + let name = &body[..end]; + if !name.is_empty() { + return Some(name.to_owned()); + } + } + idx = abs + 4; + } + None +} + +/// Top-level `name: foo` line. Pub files are shallow YAML; full-YAML parsing +/// is overkill. 
+fn parse_pubspec_yaml(text: &str) -> Option { + for line in text.lines() { + // Ignore indented lines (nested keys). + if line.starts_with(' ') || line.starts_with('\t') { + continue; + } + let t = strip_yaml_comment(line).trim_end(); + if let Some(rest) = t.strip_prefix("name:") { + let name = rest.trim().trim_matches('"').trim_matches('\''); + if !name.is_empty() { + return Some(name.to_owned()); + } + } + } + None +} + +// ── small helpers ────────────────────────────────────────────────────────── + +fn strip_toml_comment(line: &str) -> &str { + match line.find('#') { + Some(i) => &line[..i], + None => line, + } +} + +fn strip_go_comment(line: &str) -> &str { + match line.find("//") { + Some(i) => &line[..i], + None => line, + } +} + +fn strip_yaml_comment(line: &str) -> &str { + match line.find('#') { + Some(i) => &line[..i], + None => line, + } +} + +/// Parse the RHS of a TOML `name = "foo"` assignment. `rest` is the slice +/// starting right after the `name` keyword. +fn parse_assignment_value(rest: &str) -> Option { + let r = rest.trim_start(); + let r = r.strip_prefix('=')?; + let r = r.trim(); + // Strip surrounding single or double quotes, if any. 
+ let unquoted = r + .strip_prefix('"') + .and_then(|s| s.strip_suffix('"')) + .or_else(|| r.strip_prefix('\'').and_then(|s| s.strip_suffix('\''))) + .unwrap_or(r); + if unquoted.is_empty() { + None + } else { + Some(unquoted.to_owned()) + } +} + +// ── tests ────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + fn sym(uri: &str) -> OwnedSymbolInfo { + OwnedSymbolInfo { + uri: uri.to_owned(), + display_name: String::new(), + kind: crate::schema::SymbolKind::Function, + documentation: None, + signature: None, + confidence_score: 0, + relationships: vec![], + runtime_p99_ms: None, + call_rate_per_s: None, + taint_labels: vec![], + blast_radius: 0, + is_exported: false, + ..Default::default() + } + } + + #[test] + fn slice_uri_returns_manager_slash_package() { + assert_eq!( + from_slice_uri("lip://cargo/serde@1.0.0/src/lib.rs#Deserialize"), + Some("cargo/serde".to_owned()) + ); + assert_eq!( + from_slice_uri("lip://npm/react@18.2.0/index.js"), + Some("npm/react".to_owned()) + ); + assert_eq!( + from_slice_uri("lip://gomod/github.com/foo/bar@v1.0/pkg/foo.go"), + Some("gomod/github.com".to_owned()) + ); + } + + #[test] + fn slice_uri_rejects_local_scheme() { + assert_eq!(from_slice_uri("lip://local/src/main.rs"), None); + assert_eq!(from_slice_uri("lip://local//Users/a/proj/src/main.rs"), None); + } + + #[test] + fn scip_symbol_extracts_manager_and_name() { + assert_eq!( + from_scip_symbol("scip-go gomod github.com/foo/bar v1.0 internal/query/SearchSymbols()."), + Some("gomod/github.com/foo/bar".to_owned()) + ); + assert_eq!( + from_scip_symbol("scip-typescript npm react 18.2.0 src/`App.tsx`/App#"), + Some("npm/react".to_owned()) + ); + } + + #[test] + fn scip_symbol_rejects_local_and_sentinels() { + assert_eq!(from_scip_symbol("local 42"), None); + assert_eq!(from_scip_symbol("scip-go . . . 
foo/bar."), None); + assert_eq!(from_scip_symbol(""), None); + } + + #[test] + fn parse_cargo_toml_extracts_crate_name() { + let toml = r#" +[package] +name = "my-crate" +version = "0.1.0" +"#; + assert_eq!(parse_cargo_toml(toml), Some("my-crate".to_owned())); + } + + #[test] + fn parse_cargo_toml_ignores_dependency_names() { + // `name` under `[dependencies]` must not be confused with the package name. + let toml = r#" +[dependencies] +name = "should-not-match" + +[package] +name = "real-crate" +"#; + assert_eq!(parse_cargo_toml(toml), Some("real-crate".to_owned())); + } + + #[test] + fn parse_cargo_toml_workspace_only_returns_none() { + let toml = r#" +[workspace] +members = ["a", "b"] +"#; + assert_eq!(parse_cargo_toml(toml), None); + } + + #[test] + fn parse_go_mod_extracts_module_path() { + let gomod = "module github.com/foo/bar\n\ngo 1.21\n"; + assert_eq!( + parse_go_mod(gomod), + Some("github.com/foo/bar".to_owned()) + ); + } + + #[test] + fn parse_package_json_extracts_name() { + let json = r#"{ + "name": "@scope/pkg", + "version": "1.0.0" +}"#; + assert_eq!(parse_package_json(json), Some("@scope/pkg".to_owned())); + } + + #[test] + fn parse_package_json_ignores_name_inside_values() { + // "name" appearing inside a description value must not hijack the parse. 
+ let json = r#"{ + "description": "a pkg whose name is special", + "name": "real-name" +}"#; + assert_eq!(parse_package_json(json), Some("real-name".to_owned())); + } + + #[test] + fn parse_pyproject_toml_project_section() { + let toml = r#" +[project] +name = "my-py-pkg" +version = "0.1" +"#; + assert_eq!(parse_pyproject_toml(toml), Some("my-py-pkg".to_owned())); + } + + #[test] + fn parse_pyproject_toml_poetry_section() { + let toml = r#" +[tool.poetry] +name = "poetry-pkg" +"#; + assert_eq!(parse_pyproject_toml(toml), Some("poetry-pkg".to_owned())); + } + + #[test] + fn parse_setup_py_double_and_single_quotes() { + assert_eq!( + parse_setup_py("setup(name=\"pkg-a\", version=\"1.0\")"), + Some("pkg-a".to_owned()) + ); + assert_eq!( + parse_setup_py("setup(name='pkg-b')"), + Some("pkg-b".to_owned()) + ); + } + + #[test] + fn parse_pubspec_yaml_name() { + let yaml = "name: my_dart_pkg\nversion: 0.1.0\n"; + assert_eq!(parse_pubspec_yaml(yaml), Some("my_dart_pkg".to_owned())); + } + + #[test] + fn parse_pubspec_yaml_ignores_nested_name() { + let yaml = "dependencies:\n name: not-this\nname: root_pkg\n"; + assert_eq!(parse_pubspec_yaml(yaml), Some("root_pkg".to_owned())); + } + + #[test] + fn resolve_prefers_slice_uri_over_scip() { + let scip_sym = sym("scip-go gomod github.com/foo/bar v1.0 pkg/Baz()."); + let id = resolve_module_id( + "lip://cargo/serde@1.0.0/src/lib.rs", + "rust", + &[scip_sym], + ); + assert_eq!(id, Some("cargo/serde".to_owned())); + } + + #[test] + fn resolve_falls_back_to_scip_when_no_slice() { + let scip_sym = sym("scip-go gomod github.com/foo/bar v1.0 pkg/Baz()."); + let id = resolve_module_id("lip://local/pkg/foo.go", "go", &[scip_sym]); + assert_eq!(id, Some("gomod/github.com/foo/bar".to_owned())); + } + + #[test] + fn resolve_walks_manifest_for_tier1_rust_file() { + let tmp = tempfile::tempdir().unwrap(); + let cargo = tmp.path().join("Cargo.toml"); + std::fs::write(&cargo, "[package]\nname = \"walker-crate\"\n").unwrap(); + let src_dir = 
tmp.path().join("src"); + std::fs::create_dir(&src_dir).unwrap(); + let main = src_dir.join("main.rs"); + std::fs::write(&main, "fn main() {}\n").unwrap(); + + let uri = format!("lip://local/{}", main.display()); + let id = resolve_module_id(&uri, "rust", &[]); + assert_eq!(id, Some("walker-crate".to_owned())); + } + + #[test] + fn resolve_returns_none_when_unsupported_language_and_no_scip() { + let id = resolve_module_id("lip://local/foo.c", "c", &[]); + assert_eq!(id, None); + } +} diff --git a/bindings/rust/src/query_graph/types.rs b/bindings/rust/src/query_graph/types.rs index a5455d5..6b2aad4 100644 --- a/bindings/rust/src/query_graph/types.rs +++ b/bindings/rust/src/query_graph/types.rs @@ -49,6 +49,13 @@ pub struct ImpactItem { /// Confidence that this dependency is real. /// Decreases with distance: 0.95 → 0.85 → 0.75 → 0.50 (floor). pub confidence: f32, + /// v2.3.4 — stable module grouping key for this file, used by risk + /// classifiers that weight cross-module blast. Resolved at upsert time + /// from the slice URI prefix, the SCIP symbol's package descriptor, or + /// a language-appropriate manifest (Cargo.toml, go.mod, package.json, + /// pyproject.toml, pubspec.yaml). `None` when no source yields a value. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub module_id: Option, } impl ImpactItem { @@ -183,6 +190,9 @@ pub struct SemanticImpactItem { /// Cosine similarity in [0.0, 1.0]. pub similarity: f32, pub source: ImpactSource, + /// v2.3.4 — module grouping key for this file. See [`ImpactItem::module_id`]. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub module_id: Option, } /// A single nearest-neighbor hit returned by `ServerMessage::NearestResult`. 
@@ -1904,6 +1914,7 @@ mod tests { symbol_uri: "lip://local//abs/callee.rs#bar".into(), distance: 1, confidence: ImpactItem::confidence_at(1), + module_id: None, }], transitive_items: vec![], edges_source: Some(EdgesSource::ScipWithTier1Edges), @@ -1914,6 +1925,7 @@ mod tests { symbol_uri: "lip://local//abs/other.rs#baz".into(), similarity: 0.82, source: ImpactSource::Semantic, + module_id: None, }], }), }; From 87f5f0bf5ffd00252165f97a002a85ffe74ebfb4 Mon Sep 17 00:00:00 2001 From: Lisa Date: Sat, 25 Apr 2026 00:23:04 +0200 Subject: [PATCH 16/18] =?UTF-8?q?fix:=20v2.3.5=20=E2=80=94=20forward-direc?= =?UTF-8?q?tion=20name-bridge=20(caller=5Fname=5Fto=5Fcallees)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Forward-direction twin of v2.3.2's callee_name_to_callers. Index keyed by normalize_callee_name(extract_name(from_uri)); populated at all three edge-insertion sites (regular tier-1 upsert, SCIP pre-computed edges, SCIP-empty tier-1 back-fill) and pruned in remove_file_call_edges. outgoing_impact_for's BFS now consults both caller_to_callees (URI-exact) and caller_name_to_callees (name-bridge) on every hop, matching Phase 2 of blast_radius_for. Closes the asymmetry where QueryOutgoingImpact seeded from a SCIP descriptor URI (e.g. pkg#Engine#AnalyzeImpact().) returned empty direct_items because the tier-1 back-fill had kept the raw tier-1 caller URI when the method name was ambiguous across the codebase (translate-map miss + name_to_symbols multi-hit fallthrough). Regression test outgoing_impact_name_bridge_for_tier1_caller_uri. All 438 tests pass. 
Co-Authored-By: Claude Sonnet 4.6 --- Cargo.lock | 6 +- Cargo.toml | 2 +- README.md | 2 +- bindings/rust/src/query_graph/db.rs | 163 +++++++++++++++++++++++++++- docs/LIP_SPEC.mdx | 91 +++++++++++++++- website/src/pages/docs/daemon.mdx | 9 +- website/src/pages/docs/spec.mdx | 48 +++++++- 7 files changed, 306 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a5010aa..acb9be6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1074,7 +1074,7 @@ checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "lip-cli" -version = "2.3.4" +version = "2.3.5" dependencies = [ "anyhow", "clap", @@ -1093,7 +1093,7 @@ dependencies = [ [[package]] name = "lip-core" -version = "2.3.4" +version = "2.3.5" dependencies = [ "anyhow", "criterion", @@ -1130,7 +1130,7 @@ dependencies = [ [[package]] name = "lip-registry" -version = "2.3.4" +version = "2.3.5" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index b7005dd..dc8a9a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ ] [workspace.package] -version = "2.3.4" +version = "2.3.5" edition = "2021" rust-version = "1.78" authors = ["Lisa Welsch "] diff --git a/README.md b/README.md index 838e49b..18012cb 100644 --- a/README.md +++ b/README.md @@ -379,7 +379,7 @@ Requires Rust 1.78+. No system `protoc` required. ## Status -v2.3.1 — CKB import-landing fix: `RegisterProjectRoot` + daemon-side canonical URI resolution, `EdgesSource` provenance on blast radius results, tier-1 edge back-fill when SCIP imports carry none, `lip import` emits canonical `lip://local///` URIs, and `lip import --verify` round-trips a sample after push. 
v2.3 — CKB structural-parity bundle: rich symbol metadata (`signature_normalized`, `modifiers`, `visibility`, `container_name`, `extraction_tier`, `modifiers_source`), reference classification (`ReferenceKind` + `is_test` on every occurrence), `QueryBlastRadiusSymbol` (single-symbol blast radius), `QueryOutgoingCalls` (forward call-graph BFS), and ranked / filtered `QueryWorkspaceSymbols` (`kind_filter`, `scope`, `modifier_filter`, tiered `ranked` output). All additive; `protocol_version` stays at `2`. v2.2 — function-level blast radius, `ReindexStale`, `BatchFileStatus`, `QueryAbiHash`, Tier 1.5 Datalog inference, Tier 2 exponential backoff. v2.1 — `StreamContext` (token-budgeted RAG context streaming); `protocol_version` bumped to `2`. v2.0 — `ExplainMatch`, model provenance on embeddings. v1.9: `filter` glob + `min_score` on all NN calls, `GetCentroid`, `QueryStaleEmbeddings`. v1.8: `FindBoundaries`, `SemanticDiff`, `QueryNearestInStore` (cross-repo federation), `QueryNoveltyScore`, `ExtractTerminology`, `PruneDeleted`. v1.7: 6 semantic retrieval primitives. v1.6: `ReindexFiles`, `Similarity`, `QueryExpansion`, `Cluster`, `ExportEmbeddings`. Wire format is JSON. +v2.3.5 — forward-direction name-bridge symmetry: new `caller_name_to_callees` index mirrors v2.3.2's `callee_name_to_callers` at every edge-insertion site and in `remove_file_call_edges`; `outgoing_impact_for`'s BFS now consults both URI-exact and name-bridge indexes on every hop (matching Phase 2 of `blast_radius_for`). Closes the asymmetry where `QueryOutgoingImpact` seeded from a SCIP descriptor URI returned empty `direct_items` because the tier-1 back-fill had keyed callers in raw tier-1 form for name-overloaded methods. 
v2.3.4 — module-level grouping on impact items: `ImpactItem.module_id` and `SemanticImpactItem.module_id` (`Option`), resolved once at upsert time via three-tier precedence (slice URI prefix → SCIP package descriptor → upward manifest walk for `Cargo.toml` / `go.mod` / `package.json` / `pyproject.toml` / `setup.py` / `pubspec.yaml`, depth-capped at 12). Unlocks CKB's `RecomputeBlastRadius.ModuleCount` for non-sliced LIP-only traffic. v2.3.3 — outgoing-impact symmetry: `QueryOutgoingImpact` (forward-direction twin of `QueryBlastRadiusSymbol` with the same `EnrichedOutgoingImpact` envelope, `edges_source` provenance, and semantic enrichment via `SemanticImpactItem { source: Static | Semantic | Both }`). v2.3.2 — CKB testdrive follow-up: `edges_source` moved onto `BlastRadiusResult` (so plain `QueryBlastRadius` carries provenance), tier-1 back-fill URIs translate to SCIP descriptor form (same-file and cross-file), path-traversal guard on SCIP document ingestion, `callee_name_to_callers` normalization (strips SCIP descriptor sigils), and Phase-3 blank-`symbol_uri` fallback. v2.3.1 — CKB import-landing fix: `RegisterProjectRoot` + daemon-side canonical URI resolution, `EdgesSource` provenance on blast radius results, tier-1 edge back-fill when SCIP imports carry none, `lip import` emits canonical `lip://local///` URIs, and `lip import --verify` round-trips a sample after push. v2.3 — CKB structural-parity bundle: rich symbol metadata (`signature_normalized`, `modifiers`, `visibility`, `container_name`, `extraction_tier`, `modifiers_source`), reference classification (`ReferenceKind` + `is_test` on every occurrence), `QueryBlastRadiusSymbol` (single-symbol blast radius), `QueryOutgoingCalls` (forward call-graph BFS), and ranked / filtered `QueryWorkspaceSymbols` (`kind_filter`, `scope`, `modifier_filter`, tiered `ranked` output). All additive; `protocol_version` stays at `2`. 
v2.2 — function-level blast radius, `ReindexStale`, `BatchFileStatus`, `QueryAbiHash`, Tier 1.5 Datalog inference, Tier 2 exponential backoff. v2.1 — `StreamContext` (token-budgeted RAG context streaming); `protocol_version` bumped to `2`. v2.0 — `ExplainMatch`, model provenance on embeddings. v1.9: `filter` glob + `min_score` on all NN calls, `GetCentroid`, `QueryStaleEmbeddings`. v1.8: `FindBoundaries`, `SemanticDiff`, `QueryNearestInStore` (cross-repo federation), `QueryNoveltyScore`, `ExtractTerminology`, `PruneDeleted`. v1.7: 6 semantic retrieval primitives. v1.6: `ReindexFiles`, `Similarity`, `QueryExpansion`, `Cluster`, `ExportEmbeddings`. Wire format is JSON. --- diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index e9b7499..c4f5d6d 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -214,6 +214,16 @@ pub struct LipDatabase { /// file is stored here under key `"foo"`, so blast_radius on the canonical /// definition `lip://local/Y#foo` still finds all callers. callee_name_to_callers: HashMap>, + /// Forward-direction twin of `callee_name_to_callers` (v2.3.5). + /// CPG name index: caller display_name → [callee symbol_uris]. Bridges + /// the tier-1 back-fill's URI-form mismatches during `outgoing_impact_for` + /// the same way the reverse bridge serves `blast_radius_for`: when the + /// SCIP descriptor caller URI (`Engine#AnalyzeImpact().`) misses the + /// URI-exact `caller_to_callees` key (which the back-fill ended up + /// writing under the raw tier-1 form), the BFS falls through to this + /// name-fragment keyed bridge. Keys are normalised via + /// `normalize_callee_name(extract_name(from_uri))`. + caller_name_to_callees: HashMap>, /// Pre-built symbols from mounted dependency slices (Tier 3, score=100). /// Keyed by symbol URI. Not derived from source text — set directly by `mount_slice`. 
mounted_symbols: HashMap, @@ -271,6 +281,7 @@ impl LipDatabase { file_call_edges: HashMap::new(), name_to_symbols: HashMap::new(), callee_name_to_callers: HashMap::new(), + caller_name_to_callees: HashMap::new(), mounted_symbols: HashMap::new(), mounted_packages: HashMap::new(), file_consumed_names: HashMap::new(), @@ -630,6 +641,17 @@ impl LipDatabase { .or_default() .push(edge.from_uri.clone()); } + // Forward twin: enables outgoing_impact_for to seed from a + // SCIP descriptor URI when the back-fill kept the raw + // tier-1 caller URI. v2.3.5. + let caller_name = + normalize_callee_name(extract_name(&edge.from_uri)).to_owned(); + if !caller_name.is_empty() { + self.caller_name_to_callees + .entry(caller_name) + .or_default() + .push(edge.to_uri.clone()); + } pairs.push((edge.from_uri.clone(), edge.to_uri.clone())); } let src = if pairs.is_empty() { @@ -833,6 +855,13 @@ impl LipDatabase { .or_default() .push(edge.from_uri.clone()); } + let caller_name = normalize_callee_name(extract_name(&edge.from_uri)).to_owned(); + if !caller_name.is_empty() { + self.caller_name_to_callees + .entry(caller_name) + .or_default() + .push(edge.to_uri.clone()); + } pairs.push((edge.from_uri.clone(), edge.to_uri.clone())); } @@ -933,6 +962,14 @@ impl LipDatabase { .or_default() .push(from_uri.clone()); } + let caller_name = + normalize_callee_name(extract_name(&from_uri)).to_owned(); + if !caller_name.is_empty() { + self.caller_name_to_callees + .entry(caller_name) + .or_default() + .push(to_uri.clone()); + } pairs.push((from_uri, to_uri)); filled = true; } @@ -1048,6 +1085,13 @@ impl LipDatabase { self.callee_name_to_callers.remove(callee_name); } } + let caller_name = normalize_callee_name(extract_name(&from)); + if let Some(callees) = self.caller_name_to_callees.get_mut(caller_name) { + callees.retain(|c| *c != to); + if callees.is_empty() { + self.caller_name_to_callees.remove(caller_name); + } + } } } } @@ -1984,6 +2028,14 @@ impl LipDatabase { let threshold = 
min_score.unwrap_or(0.6); // ── Forward BFS over caller_to_callees ─────────────────────────── + // + // Mirrors Phase 2 of `blast_radius_for`: at each hop, consult both + // the URI-exact `caller_to_callees` index and the name-bridge + // `caller_name_to_callees`. The bridge catches the case where the + // seed (or an intermediate caller) is a SCIP descriptor URI while + // the tier-1 back-fill kept the raw tier-1 form as the index key — + // symmetric to how the reverse direction bridges file-local callee + // URIs to scip-form seeds. v2.3.5. let mut callee_distance: HashMap = HashMap::new(); let mut truncated = false; { @@ -1994,10 +2046,22 @@ impl LipDatabase { 'bfs: for hop in 1..=depth { let mut next: Vec = Vec::new(); for caller in &frontier { - let Some(callees) = self.caller_to_callees.get(caller) else { - continue; - }; - for callee in callees { + // URI-exact callees. + let mut direct_callees: Vec = self + .caller_to_callees + .get(caller) + .cloned() + .unwrap_or_default(); + // Name-bridge callees: normalise the caller's fragment + // so a SCIP descriptor (`AnalyzeImpact().`) collides + // with the tier-1-indexed `AnalyzeImpact`. + let caller_name = normalize_callee_name(extract_name(caller)); + if !caller_name.is_empty() { + if let Some(extra) = self.caller_name_to_callees.get(caller_name) { + direct_callees.extend(extra.iter().cloned()); + } + } + for callee in &direct_callees { if callee == symbol_uri { continue; // skip self-cycles back to the seed } @@ -4877,6 +4941,97 @@ impl Greeter { ); } + // v2.3.5 — forward-direction twin of the `callee_name_to_callers` bridge. + // When a caller symbol is registered under a SCIP descriptor URI + // (`pkg#Engine#AnalyzeImpact().`) but the pre-computed call edges key + // the caller in tier-1 form (`...#AnalyzeImpact`), the seed lookup in + // `caller_to_callees` misses. `outgoing_impact_for` must fall through + // to `caller_name_to_callees` via the normalised name fragment. 
+ #[test] + fn outgoing_impact_name_bridge_for_tier1_caller_uri() { + use crate::schema::{OwnedGraphEdge, OwnedRange, OwnedSymbolInfo, SymbolKind}; + + let mut db = LipDatabase::new(); + let caller_file = "lip://local//abs/engine.go".to_owned(); + let callee_file = "lip://local//abs/leaf.go".to_owned(); + // SCIP descriptor form for the caller (Go method on Engine receiver). + // `extract_name` + `normalize_callee_name` collapses this to + // `"AnalyzeImpact"` — matching the tier-1-form caller URI below. + let scip_caller_sym = format!("{caller_file}#Engine#AnalyzeImpact()."); + // Raw tier-1 caller URI that the back-fill resolver's fallthrough + // would keep when the same-file `translate` map and the global + // `name_to_symbols` index both miss for an overloaded method name. + let tier1_caller_sym = format!("{caller_file}#AnalyzeImpact"); + let callee_sym = format!("{callee_file}#leaf()."); + + let range = OwnedRange { + start_line: 0, + start_char: 0, + end_line: 0, + end_char: 1, + }; + let defn_occ = |sym: &str| OwnedOccurrence { + symbol_uri: sym.to_owned(), + range: range.clone(), + confidence_score: 90, + role: Role::Definition, + override_doc: None, + kind: ReferenceKind::Unknown, + is_test: false, + }; + let mk_sym = |uri: &str, name: &str| OwnedSymbolInfo { + uri: uri.to_owned(), + kind: SymbolKind::Function, + display_name: name.to_owned(), + confidence_score: 90, + is_exported: true, + ..Default::default() + }; + + // Callee registered cleanly in SCIP form so Phase-3 resolution can + // map it to a file via `def_index` rather than the #-strip fallback. + db.upsert_file_precomputed( + callee_file.clone(), + "go".to_owned(), + "c1".to_owned(), + vec![mk_sym(&callee_sym, "leaf")], + vec![defn_occ(&callee_sym)], + vec![], + ); + // Caller file: SCIP symbol + definition (feeds def_index with the + // SCIP descriptor), but the pre-computed edge keys the caller in + // tier-1 form. 
This is the shape the tier-1 back-fill produces + // when the caller name is ambiguous across the codebase. + let edge = OwnedGraphEdge { + from_uri: tier1_caller_sym.clone(), + to_uri: callee_sym.clone(), + kind: EdgeKind::Calls, + at_range: range.clone(), + }; + db.upsert_file_precomputed( + caller_file.clone(), + "go".to_owned(), + "e1".to_owned(), + vec![mk_sym(&scip_caller_sym, "AnalyzeImpact")], + vec![defn_occ(&scip_caller_sym)], + vec![edge], + ); + + // Query using the SCIP descriptor URI — URI-exact seed lookup in + // `caller_to_callees` misses (key is the tier-1 form), so the + // forward BFS must bridge via `caller_name_to_callees`. + let enriched = db + .outgoing_impact_for(&scip_caller_sym, None, None) + .expect("outgoing_impact_for must return Some for a known symbol"); + let direct = &enriched.static_result.direct_items; + assert!( + direct.iter().any(|i| i.symbol_uri == callee_sym), + "forward name-bridge must surface the callee from a SCIP-form seed; \ + got direct_items: {:?}", + direct + ); + } + #[test] fn blast_radius_batch_file_uri_populated() { let mut db = LipDatabase::new(); diff --git a/docs/LIP_SPEC.mdx b/docs/LIP_SPEC.mdx index 91a92f6..96335e0 100644 --- a/docs/LIP_SPEC.mdx +++ b/docs/LIP_SPEC.mdx @@ -807,6 +807,10 @@ lip.query.blast_radius_batch(changed_file_uris, min_score?) → { }] } +// ImpactItem / SemanticItem shape (v2.3.4 added `module_id`): +// ImpactItem { file_uri, symbol_uri, distance, confidence, module_id? } +// SemanticItem { file_uri, symbol_uri, similarity, source, module_id? } + enum EdgesSource { "tier1", // edges came from the tree-sitter extractor "scip_with_tier1_edges", // SCIP import that had no edges; daemon back-filled @@ -840,6 +844,17 @@ empty) will carry the matched symbol, with no wire format change required. stays structural-only. Semantic callers are additive — they inform the human reviewer ("8 callers (+3 semantically coupled)") but do not inflate risk scores. 
+**`module_id` (v2.3.4).** Every `ImpactItem` and `SemanticItem` optionally carries +`module_id: Option<String>` identifying the module the impacted file belongs to. +Resolved once at upsert time (stored on `FileInput`) with three-tier precedence: +(1) slice URI prefix `lip://<manager>/<package>@<version>/…` → `"<manager>/<package>"`; +(2) first `<manager> <package>` pair parsed from any SCIP symbol attached to the file; +(3) upward manifest walk (depth-capped at 12) for `Cargo.toml`, `go.mod`, `package.json`, +`pyproject.toml`, `setup.py`, or `pubspec.yaml`. Unsupported languages (C / C++ / Kotlin +/ Swift / Java without a manifest) return `None`. CKB's `RecomputeBlastRadius.ModuleCount` +uses this to distinguish "47 dependents across 12 modules" from the conservative +zero it previously derived from the unioned set. + #### 8.1.2 Symbol-level blast radius ``` @@ -885,6 +900,46 @@ Traversal is capped at 200 nodes; the `truncated` flag reports when the cap fire Useful for "what does this function reach?" queries — tracing a public API's downstream behaviour, auditing service calls, generating call-graph visualisations. +#### 8.1.4 Outgoing impact (enriched forward traversal) + +``` +lip.query.outgoing_impact(symbol_uri, depth?, min_score?) → { + result: null | EnrichedOutgoingImpact +} + +// EnrichedOutgoingImpact flattens OutgoingImpactStatic via `serde(flatten)`, +// so edges_source lives on the inner struct (mirroring v2.3.2 blast radius): +// { +// direct_items: [ImpactItem], +// transitive_items: [ImpactItem], +// truncated: bool, +// edges_source: EdgesSource, +// semantic_items: [SemanticImpactItem], // only when min_score is set +// } +``` + +Forward-direction twin of `QueryBlastRadiusSymbol` (§8.1.2). BFS walks +`caller_to_callees` starting from `symbol_uri`, splits direct vs. transitive hops, +and returns the same enriched envelope that blast-radius queries use — including +`edges_source` provenance and semantic enrichment when `min_score` is set.
`depth` +clamps to `[1, 8]` with a default of 8; `NODE_LIMIT = 200` caps the frontier and +trips `truncated: true` on overflow. + +From v2.3.5, each frontier hop also consults `caller_name_to_callees` — +a forward-direction name-bridge keyed by the normalised caller fragment. +Restores full symmetry with `blast_radius_for`'s Phase 2, so a SCIP +descriptor seed (e.g. `pkg#Engine#AnalyzeImpact().`) still returns callees +when the tier-1 back-fill keyed the caller in raw tier-1 form because the +method name was ambiguous across the codebase. + +Semantic enrichment prefers symbol-level embeddings with a file-level fallback seed; +`SemanticImpactItem.source` is `Both` when a static hit's file also appears in the +nearest-embedding set, `Static` or `Semantic` otherwise. Tier-1 URIs with no +`def_index` entry fall through to the file URI (strip `#` fragment) rather +than producing blank `symbol_uri` — the same Phase-3 fallback applied on the callee +side that v2.3.2 introduced for blast radius. `ImpactItem` / `SemanticImpactItem` +carry `module_id` (§8.1.1) so CKB can attribute module counts to callees. + ### 8.2 Taint tracking Symbols can be annotated with `taint_labels` (e.g. `["PII", "UNSAFE_IO"]`). LIP @@ -1374,6 +1429,40 @@ lip-protocol/ - [x] **Tier 1.5 Datalog inference** — `LipDatabase::run_tier1_5_inference()` fixed-point loop: (1) callee elevation when all callers ≥ 80 confidence → raise to 65; (2) exported leaf symbols with ≥ 40 confidence → +5 capped at 65. Never lowers confidence; ceiling 65 leaves headroom for Tier 2. - [x] **Tier 2 backoff recovery** — language server backends recover from transient crashes with exponential backoff (2–300 s, permanent disable only after 8 failures). `BackoffState { failure_count, available_after }` per backend. Replaces immediate permanent disable on first crash. +### v2.3.1 — CKB import landing fix ✓ + +- [x] **`RegisterProjectRoot { root }`** — idempotent filesystem-root registration. 
Lets the daemon resolve relative `lip://local/` URIs against the absolute-form records emitted by the tier-1 indexer and by `lip import`. Longest matching root wins when multiple are registered. Advertised as `register_project_root` in `HandshakeResult.supported_messages`; pre-v2.3.1 daemons reply `UnknownMessage`. `RegisterTier3Source` now auto-registers its `source.project_root` so SCIP imports need no second round-trip. +- [x] **`EdgesSource` provenance on blast radius** — `EnrichedBlastRadius` gains `edges_source: Option` with four variants (`Tier1 | ScipWithTier1Edges | ScipOnly | Empty`). Consumers that maintain their own fallback path can detect when LIP has no structural edges and route around us. `#[serde(default, skip_serializing_if = Option::is_none)]`. +- [x] **Tier-1 edge back-fill on SCIP imports** — `upsert_file_precomputed` now runs the tree-sitter tier-1 extractor over the file on disk when the incoming SCIP document carries no call edges. Produces `edges_source = ScipWithTier1Edges` on success, `Empty` when the file is unreadable or yields no calls. Fills the gap where `scip-go` inconsistently emits call edges and `scip-clang` omits them entirely. +- [x] **`lip import --verify`** — after pushing deltas, samples up to 10 documents and round-trips `QueryFileStatus` (expecting `indexed = true`) plus a `QueryWorkspaceSymbols` probe scoped to the file. Exits non-zero on any mismatch so CI catches silent import drops. +- [x] **`lip import` URI scheme** — imported documents now use `lip://local/` (when `Metadata.project_root` is absent) or `lip://local//` with the canonical doubled slash (when it is present), replacing the previous `file:///` form that silently failed to match any CKB query. 
+- [x] **`LipDatabase::canonicalize_uri`** — every public query- and mutation-surface method canonicalises its URI argument through `registered_roots` before hitting the input/embedding/def/sym/occ maps, so relative and absolute lip-local forms of the same file resolve to the same record. +- [x] **Self-echo deadlock fix** — broadcast notifications now carry `source_session: Option`; the drain loop skips envelopes whose `source_session` matches its own. Tier 2 upgrades emit with `source_session: None` so they still reach every session. Regression test `daemon_bulk_precomputed_import_does_not_deadlock` pushes 200 precomputed deltas through a single session. + +### v2.3.2 — CKB testdrive follow-up ✓ + +- [x] **`edges_source` moved from `EnrichedBlastRadius` onto `BlastRadiusResult`** — so non-enriched `QueryBlastRadius` (not just the batch / symbol variants) carries call-edge provenance. `EnrichedBlastRadius` still surfaces it through `#[serde(flatten)] static_result: BlastRadiusResult`, so the JSON wire shape is unchanged. Round-trip test `edges_source_survives_all_response_envelopes` asserts the field is emitted in every response variant. +- [x] **Tier-1 back-fill URIs translate to SCIP descriptor form** — same-file and cross-file. Tier-1 emits fragment URIs in plain-identifier form (`#NewExporter`) while scip-go / scip-typescript emit descriptor form (`#NewExporter()`, `#Component.`). The back-fill now builds a same-file `display_name → SCIP-uri` map and, for cross-file callees, falls back to a global `name_to_symbols` index populated at SCIP-import time. Regression tests `tier1_backfill_translates_caller_uri_to_scip_fragment` and `tier1_backfill_resolves_cross_file_callee_via_name_index`. +- [x] **Path-traversal guard on SCIP document ingestion** — scip-go ships documents whose `relative_path` points outside the project tree (`../../../../Library/Caches/go-build/…`). 
`convert_document` now rejects any document whose net depth falls below the project root under pure string-level normalization. Emits `warning: skipped N SCIP document(s) whose relative_path escapes project_root`. +- [x] **Double `lip://local/` prefix in `callee_to_callers` keys** — `SymbolExtractor::lip_uri` stripped `file://` but not an existing `lip://local/` prefix, so tier-1 back-fills against imported-with-canonical-key files produced double-prefixed edge URIs. Fixed by detecting the `lip://local/` prefix and appending `#` directly. +- [x] **SCIP-descriptor / tier-1-identifier name-fragment mismatch in `callee_name_to_callers`** — tier-1 indexes plain identifiers (`SearchSymbols`); SCIP carries sigils (`SearchSymbols().`, `MyField.`, `Foo:`). Added `normalize_callee_name(fragment)` and applied it at all four insert sites plus the BFS lookup site. Unit test `normalize_callee_name_strips_scip_descriptor_suffixes` covers the six canonical SCIP descriptor shapes. +- [x] **Blank `symbol_uri` when tier-1 back-fill preserves a raw caller URI** — Phase 3 of `blast_radius_for` now derives the file URI by stripping the `#` fragment when `def_index` misses and the caller URI has the `lip://local/` scheme, using the caller URI verbatim as `symbol_uri`. Regression test `blast_radius_phase3_fallback_for_tier1_caller_uri`. +- [x] **`LIP_DEBUG_EDGES=1` diagnostic gating** — zero-cost when unset. The wire log now reports `has_edges_source` / `edges_source_count` / `body_bytes` + 500-char head instead of a truncated 2 KB tail. + +### v2.3.3 — Outgoing-impact symmetry ✓ + +- [x] **`QueryOutgoingImpact { symbol_uri, depth?, min_score? } → OutgoingImpactResult { result: Option }`** — forward-direction twin of `QueryBlastRadiusSymbol`. BFS walks `caller_to_callees` starting from `symbol_uri`, splits direct vs. 
transitive hops, and wraps the static result in an envelope flattened with `#[serde(flatten)] static_result: OutgoingImpactStatic` so `edges_source` lives on the inner struct (matching the v2.3.2 shape for blast radius). `depth` is clamped to `1..=8` with a default of 8; `NODE_LIMIT = 200` bounds the BFS frontier and trips `truncated: true` on overflow. Semantic enrichment reuses `SemanticImpactItem { source: Static | Semantic | Both }`: symbol-level embedding is preferred, with file-level embedding as the fallback seed, and static-hit files are tagged `Both` when their URI also appears in the nearest-embedding set. The v2.3.2 Phase 3 `#`-strip fallback is applied symmetrically on the callee side, so tier-1 URIs with no `def_index` entry still resolve to their file URI instead of producing blank `symbol_uri`. Advertised as `query_outgoing_impact` in `HandshakeResult.supported_messages`. Spec §8.1.4. + +### v2.3.4 — Module-level grouping on impact items ✓ + +- [x] **`ImpactItem.module_id` and `SemanticImpactItem.module_id`** — `Option<String>`, resolved once at upsert time (stored on `FileInput`), surfaced on every `ImpactItem` / `SemanticImpactItem` built by `blast_radius_for`, `blast_radius_for_symbol`, `blast_radius_batch`, and `outgoing_impact_for`. Three-tier resolution, first hit wins: (1) slice URI prefix `lip://<manager>/<package>@<version>/…` → `"<manager>/<package>"`; (2) first `<manager> <package>` pair parsed from any SCIP symbol attached to the file (rejects `local <id>` sentinels and empty-package descriptors); (3) upward manifest walk, depth-capped at 12, for `Cargo.toml` (Rust, `[package] name`), `go.mod` (Go, `module <path>`), `package.json` (TypeScript / JavaScript / TSX / JSX, top-level `"name"`), `pyproject.toml` (Python, `[project].name` or `[tool.poetry].name`), `setup.py` (Python, `name="…"`), `pubspec.yaml` (Dart, top-level `name:`). Parse failures, I/O failures, and unsupported languages (C / C++ / Kotlin / Swift / Java) return `None`.
Unit-tested per parser; integration tests `blast_radius_surfaces_module_id_from_scip_descriptor`, `blast_radius_surfaces_module_id_from_cargo_toml_walk`, `outgoing_impact_surfaces_module_id` confirm the field reaches the wire. `protocol_version` stays at `2`; `#[serde(default, skip_serializing_if = "Option::is_none")]` keeps the shape byte-identical for emitters that don't populate it. + +### v2.3.5 — Forward-direction name-bridge symmetry ✓ + +- [x] **`caller_name_to_callees` index** — forward-direction twin of v2.3.2's `callee_name_to_callers`. Keyed by `normalize_callee_name(extract_name(from_uri))`; maps the caller's plain-identifier fragment to every callee URI of every `Calls` edge the database has indexed. Populated at all three edge-insertion sites (regular tier-1 upsert in `upsert_file`, SCIP pre-computed edges in `upsert_file_precomputed`, the SCIP-empty tier-1 back-fill) and pruned in `remove_file_call_edges`, mirroring the reverse index byte-for-byte. +- [x] **Symmetric BFS in `outgoing_impact_for`** — every frontier hop now consults both `caller_to_callees` (URI-exact) and `caller_name_to_callees` (name-bridge via `normalize_callee_name(extract_name(caller))`), exactly matching Phase 2 of `blast_radius_for`. The union of the two lookups feeds the visited/distance machinery; self-cycles back to the seed are still skipped. Closes the asymmetry where `QueryOutgoingImpact` seeded from a SCIP descriptor URI (e.g. `pkg#Engine#AnalyzeImpact().`) returned empty `direct_items` because the SCIP tier-1 back-fill had kept the raw tier-1 caller URI (`…#AnalyzeImpact`) as the key when the method name was ambiguous across the codebase (translate-map miss + `name_to_symbols` miss on multi-hit). Blocks pre-v2.3.5 CKB integration of `QueryOutgoingImpact` for any Go / TypeScript / Python repo with name-overloaded methods. 
+- [x] **Regression coverage** — `outgoing_impact_name_bridge_for_tier1_caller_uri`: pre-computed edge keyed under tier-1 caller form (`…#AnalyzeImpact`); symbol definition registered under SCIP descriptor form (`…#Engine#AnalyzeImpact().`); `outgoing_impact_for(scip_descriptor)` must surface the callee via the name-bridge fallback. All 438 existing tests continue to pass — no behavioural change for URI-exact hits, only a strictly-additive bridge for misses. + ### v1.2 — In progress - [ ] FlatBuffers binary IPC — replace JSON wire framing with generated FlatBuffers tables @@ -1486,5 +1575,5 @@ via the `flatbuffers` crate, aligning with LIP's reference implementation langua --- -*LIP Specification v2.3.0 · April 2026 · MIT License* +*LIP Specification v2.3.5 · April 2026 · MIT License* *Lisa Welsch* diff --git a/website/src/pages/docs/daemon.mdx b/website/src/pages/docs/daemon.mdx index 07f22b2..c94a356 100644 --- a/website/src/pages/docs/daemon.mdx +++ b/website/src/pages/docs/daemon.mdx @@ -92,9 +92,12 @@ Each connection handles one request/response pair. 
The protocol is synchronous p | `QueryDefinition` | Find definition at (uri, line, col) | | `QueryReferences` | Find all references to a symbol URI | | `QueryHover` | Hover info at (uri, line, col) | -| `QueryBlastRadius` | Blast radius for a symbol URI | +| `QueryBlastRadius` | Blast radius for a symbol URI (carries `edges_source` since v2.3.2) | | `QueryBlastRadiusBatch` | Batch blast radius for changed files, with optional semantic enrichment (§8.1.1) | -| `QueryWorkspaceSymbols` | Search symbols by name | +| `QueryBlastRadiusSymbol` | Single-symbol blast radius — returns `EnrichedBlastRadius` or `None` (§8.1.2) | +| `QueryOutgoingCalls` | Forward call-graph BFS via `caller_to_callees` (§8.1.3) | +| `QueryOutgoingImpact` | Enriched forward traversal — twin of `QueryBlastRadiusSymbol` with `edges_source` + semantic enrichment (§8.1.4) | +| `QueryWorkspaceSymbols` | Search symbols by name; ranked + kind/scope/modifier filters since v2.3 | | `QueryDocumentSymbols` | List symbols in a file | | `QueryDeadSymbols` | Find unreferenced symbols | | `AnnotationGet/Set/List` | Persistent symbol annotations | @@ -106,6 +109,8 @@ Each connection handles one request/response pair. The protocol is synchronous p | `ReindexStale` | Atomic reindex if stale — re-reads only URIs that are unindexed or older than `max_age_seconds` | | `BatchFileStatus` | Query index status for multiple files in one round-trip (batchable) | | `QueryAbiHash` | SHA-256 over a file's exported API surface — stable recompilation trigger (batchable) | +| `RegisterProjectRoot` | Register a filesystem root so the daemon resolves `lip://local/` URIs (v2.3.1) | +| `RegisterTier3Source` | Declare provenance (tool name / version / project root) for a SCIP-import batch | | `BatchQuery` | Multiple queries in one round-trip | **Acknowledgment:** Every `Delta` receives a `DeltaAck { seq, accepted }` response, eliminating the fire-and-forget drift that LSP is known for. 
diff --git a/website/src/pages/docs/spec.mdx b/website/src/pages/docs/spec.mdx index 05bf488..1afa8db 100644 --- a/website/src/pages/docs/spec.mdx +++ b/website/src/pages/docs/spec.mdx @@ -9,7 +9,7 @@ import Badge from '../../components/Badge.astro' # LIP — Linked Incremental Protocol -**Design Document & Specification v2.0.1** · MIT License +**Design Document & Specification v2.3.5** · MIT License --- @@ -117,7 +117,7 @@ LIP is designed around three axioms: ## 2. Comparison Matrix -| Property | LSP 3.17 | SCIP | **LIP v2.0.1** | +| Property | LSP 3.17 | SCIP | **LIP v2.3.5** | |---|---|---|---| | Wire format | JSON-RPC 2.0 | Protobuf 3 | **FlatBuffers (zero-copy)** | | Framing | HTTP Content-Length | n/a (file) | **4-byte length prefix** | @@ -1168,6 +1168,48 @@ lip-protocol/ - [x] **Tier 1.5 Datalog inference** — `LipDatabase::run_tier1_5_inference()` runs a fixed-point loop applying two conservative rules: (1) if every direct caller of a symbol is at confidence ≥ 80, raise the callee to 65; (2) exported symbols with no local callers are raised by 5 points (capped at 65). Never lowers confidence; ceiling 65 leaves headroom for Tier 2. - [x] **Tier 2 backoff recovery** — language server backends now recover from transient crashes with exponential backoff (2–300 s, permanent disable only after 8 consecutive failures). `BackoffState { failure_count, available_after }` tracks per-backend state. Replaces immediate permanent disable on first crash. +### v2.3 — CKB structural-parity bundle ✓ + +- [x] **Rich symbol metadata** — `OwnedSymbolInfo` gains `signature_normalized`, `modifiers`, `visibility` + `visibility_confidence`, `container_name`, `extraction_tier`, and `modifiers_source`. Populated by all Tier-1 extractors (Rust / TypeScript / Python / Swift / Kotlin) and by the SCIP importer (via upstream-compatible `enclosing_symbol = 8` + prefix-parsed modifiers). 
+- [x] **Reference classification** — `OwnedOccurrence` gains `kind: ReferenceKind` (`Unknown` / `Call` / `Read` / `Write` / `Type` / `Implements` / `Extends`) + `is_test: bool`. Tier-1 classifier uses tree-sitter parent/field lookup; SCIP import/export round-trips via `SymbolRole::ReadAccess | WriteAccess | Test`. +- [x] **`QueryBlastRadiusSymbol { symbol_uri, min_score? } → BlastRadiusSymbolResult { result: Option }`** — single-symbol wrapper around `blast_radius_for_symbol`. Returns `None` for unknown or unindexed symbols so callers can distinguish "zero impact" from "no data." +- [x] **`QueryOutgoingCalls { symbol_uri, depth } → OutgoingCallsResult { edges, truncated }`** — forward call-graph BFS via a new `caller_to_callees` index. Depth clamped `[1, 8]`; NODE_LIMIT = 200. +- [x] **Ranked & filtered workspace symbols** — `QueryWorkspaceSymbols` adds `kind_filter`, `scope`, `modifier_filter`; `WorkspaceSymbolsResult` adds `ranked: Vec` with tiered scoring (`Exact = 1.0` / `Prefix = 0.8` / `Fuzzy = 0.5`). `ranked` is `skip_if_empty`; empty query preserves pre-v2.3 semantics. +- [x] **All additive.** `protocol_version` stays at `2`; every new field is `#[serde(default, skip_serializing_if = …)]`; every new message is advertised via `HandshakeResult.supported_messages`. + +### v2.3.1 — CKB import landing fix ✓ + +- [x] **`RegisterProjectRoot { root }`** — idempotent filesystem-root registration. Lets the daemon resolve relative `lip://local/` URIs against the absolute-form records emitted by the tier-1 indexer and by `lip import`. Longest matching root wins. Advertised as `register_project_root` in `HandshakeResult.supported_messages`. `RegisterTier3Source` now auto-registers its `source.project_root`. +- [x] **`EdgesSource` provenance on blast radius** — `EnrichedBlastRadius` gains `edges_source: Option` with four variants (`Tier1 | ScipWithTier1Edges | ScipOnly | Empty`). 
Consumers that maintain their own fallback path can now detect when LIP has no structural edges and route around us. +- [x] **Tier-1 edge back-fill on SCIP imports** — `upsert_file_precomputed` now falls back to running the tree-sitter tier-1 extractor over the file on disk when the incoming SCIP document has no call edges. Produces `edges_source = ScipWithTier1Edges` on success. Fills the gap where `scip-go` inconsistently emits call edges and `scip-clang` omits them entirely. +- [x] **`lip import --verify`** — after pushing deltas, samples up to 10 documents and round-trips `QueryFileStatus` + `QueryWorkspaceSymbols`. Exits non-zero on any mismatch so CI catches silent import drops. +- [x] **`lip import` URI scheme** — imported documents now use `lip://local/` or the canonical doubled-slash `lip://local//` form, replacing the previous `file:///` form that silently failed to match any CKB query. +- [x] **`LipDatabase::canonicalize_uri`** — every public query- and mutation-surface method canonicalises its URI argument through `registered_roots`, so relative and absolute lip-local forms resolve to the same record. +- [x] **Self-echo deadlock fix** — broadcast notifications now carry `source_session: Option`; the drain loop skips envelopes whose `source_session` matches its own. Regression test `daemon_bulk_precomputed_import_does_not_deadlock` pushes 200 precomputed deltas through a single session. + +### v2.3.2 — CKB testdrive follow-up ✓ + +- [x] **`edges_source` moved from `EnrichedBlastRadius` onto `BlastRadiusResult`** — so non-enriched `QueryBlastRadius` carries call-edge provenance. `EnrichedBlastRadius` still surfaces it via `#[serde(flatten)] static_result: BlastRadiusResult`, so the JSON wire shape is unchanged. Round-trip test `edges_source_survives_all_response_envelopes`. +- [x] **Tier-1 back-fill URIs translate to SCIP descriptor form** — same-file and cross-file. Tier-1 emits `#NewExporter`; SCIP carries `#NewExporter()` / `#Component.`. 
The back-fill now builds a same-file `display_name → SCIP-uri` map and, for cross-file callees, falls back to a global `name_to_symbols` index populated at SCIP-import time. +- [x] **Path-traversal guard on SCIP document ingestion** — rejects documents whose net depth falls below the project root under pure string-level normalization. Catches scip-go's `../../../../Library/Caches/go-build/…` drift. +- [x] **Double `lip://local/` prefix in `callee_to_callers` keys** — `SymbolExtractor::lip_uri` now detects the `lip://local/` prefix and appends `#` directly instead of re-wrapping. +- [x] **SCIP-descriptor / tier-1-identifier name-fragment mismatch** — added `normalize_callee_name(fragment)` (truncates at first `(`, trims trailing non-identifier chars) and applied it at all four `callee_name_to_callers` insert sites plus the BFS lookup site, so SCIP and tier-1 callees share keys. Unit test `normalize_callee_name_strips_scip_descriptor_suffixes`. +- [x] **Blank `symbol_uri` fallback** — Phase 3 of `blast_radius_for` derives the file URI by stripping the `#` fragment when `def_index` misses and the caller URI has the `lip://local/` scheme, using the caller URI verbatim as `symbol_uri`. Regression test `blast_radius_phase3_fallback_for_tier1_caller_uri`. +- [x] **`LIP_DEBUG_EDGES=1` diagnostic gating** — zero-cost when unset. The wire log reports `has_edges_source` / `edges_source_count` / `body_bytes` + 500-char head. + +### v2.3.3 — Outgoing-impact symmetry ✓ + +- [x] **`QueryOutgoingImpact { symbol_uri, depth?, min_score? } → OutgoingImpactResult { result: Option }`** — forward-direction twin of `QueryBlastRadiusSymbol`. BFS walks `caller_to_callees` starting from `symbol_uri`, splits direct vs. transitive hops, and wraps the static result in an envelope flattened with `#[serde(flatten)] static_result: OutgoingImpactStatic` so `edges_source` lives on the inner struct (matching the v2.3.2 shape for blast radius). 
`depth` clamps to `1..=8` (default 8); `NODE_LIMIT=200` bounds the BFS frontier. Semantic enrichment reuses `SemanticImpactItem { source: Static | Semantic | Both }`: symbol-level embedding is preferred, with file-level embedding as the fallback seed. The v2.3.2 Phase 3 `#`-strip fallback is applied symmetrically on the callee side. Advertised as `query_outgoing_impact` in `HandshakeResult.supported_messages`. + +### v2.3.4 — Module-level grouping on impact items ✓ + +- [x] **`ImpactItem.module_id` and `SemanticImpactItem.module_id`** — `Option`, resolved once at upsert time (stored on `FileInput`), surfaced on every `ImpactItem` / `SemanticImpactItem` built by `blast_radius_for`, `blast_radius_for_symbol`, `blast_radius_batch`, and `outgoing_impact_for`. Three-tier resolution, first hit wins: (1) slice URI prefix `lip:///@/…` → `"/"`; (2) first ` ` pair parsed from any SCIP symbol attached to the file (rejects `local ` sentinels); (3) upward manifest walk (depth-capped at 12) for `Cargo.toml`, `go.mod`, `package.json`, `pyproject.toml`, `setup.py`, `pubspec.yaml`. Parse failures, I/O failures, and unsupported languages (C / C++ / Kotlin / Swift / Java) return `None`. Unlocks CKB's `RecomputeBlastRadius.ModuleCount` for non-sliced LIP-only traffic. `protocol_version` stays at `2`; `#[serde(default, skip_serializing_if = "Option::is_none")]`. + +### v2.3.5 — Forward-direction name-bridge symmetry ✓ + +- [x] **`caller_name_to_callees` index** — forward-direction twin of v2.3.2's `callee_name_to_callers`. Keyed by `normalize_callee_name(extract_name(from_uri))`; populated at all three edge-insertion sites (regular tier-1 upsert, SCIP pre-computed edges, SCIP-empty tier-1 back-fill) and pruned in `remove_file_call_edges`. +- [x] **Symmetric BFS in `outgoing_impact_for`** — every frontier hop now consults both `caller_to_callees` (URI-exact) and `caller_name_to_callees` (name-bridge), exactly matching Phase 2 of `blast_radius_for`. 
Closes the asymmetry where `QueryOutgoingImpact` seeded from a SCIP descriptor URI (e.g. `pkg#Engine#AnalyzeImpact().`) returned empty `direct_items` because the tier-1 back-fill had kept the raw tier-1 caller URI when the method name was ambiguous across the codebase. Before v2.3.5 this gap blocked CKB integration of `QueryOutgoingImpact` for Go / TypeScript / Python repos with name-overloaded methods. Regression test `outgoing_impact_name_bridge_for_tier1_caller_uri`. + ### v1.2 — In progress - [ ] FlatBuffers binary IPC — replace JSON wire framing with generated FlatBuffers tables @@ -1280,5 +1322,5 @@ via the `flatbuffers` crate, aligning with LIP's reference implementation langua --- -*LIP Specification v2.0.1 · April 2026 · MIT License* +*LIP Specification v2.3.5 · April 2026 · MIT License* *Lisa Welsch* From cad9f550958604f68f15c2115685dabb317044bb Mon Sep 17 00:00:00 2001 From: Lisa Date: Sat, 25 Apr 2026 00:47:16 +0200 Subject: [PATCH 17/18] chore: cargo fmt + clippy fixes (Rust 1.95) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure CI hygiene — no behavioural change. Rust 1.95 added several clippy lints that trip on pre-existing idioms in the v2.3 codebase: - unnecessary_map_or: is_some_and over map_or(false, …) - unnecessary_sort_by: sort_by_key + std::cmp::Reverse - manual_pattern_char_cmp: .find(['@', '/']) over closure - cloned_ref_to_slice_refs: std::slice::from_ref for single-element slices Plus cargo fmt across 19 files to align with current rustfmt output.
Co-Authored-By: Claude Sonnet 4.6 --- bindings/rust/src/daemon/session.rs | 24 ++-- bindings/rust/src/daemon/tier2_manager.rs | 5 +- bindings/rust/src/indexer/symbol_extractor.rs | 88 +++++++----- bindings/rust/src/indexer/tier1.rs | 24 ++-- bindings/rust/src/indexer/tier2/dart_ls.rs | 7 +- bindings/rust/src/indexer/tier2/enrich.rs | 120 +++++++++++++--- bindings/rust/src/indexer/tier2/gopls.rs | 7 +- bindings/rust/src/indexer/tier2/kotlin.rs | 7 +- bindings/rust/src/indexer/tier2/py_ls.rs | 7 +- .../rust/src/indexer/tier2/rust_analyzer.rs | 7 +- bindings/rust/src/indexer/tier2/swift_ls.rs | 7 +- bindings/rust/src/indexer/tier2/ts_server.rs | 3 +- bindings/rust/src/query_graph/db.rs | 53 ++++--- bindings/rust/src/query_graph/module_id.rs | 22 ++- bindings/rust/src/schema/signature.rs | 5 +- bindings/rust/src/schema/types.rs | 5 +- bindings/rust/src/schema/visibility.rs | 26 +--- bindings/rust/tests/integration.rs | 90 +++++++----- tools/lip-cli/src/cmd/import.rs | 132 ++++++++++++------ 19 files changed, 414 insertions(+), 225 deletions(-) diff --git a/bindings/rust/src/daemon/session.rs b/bindings/rust/src/daemon/session.rs index 769f853..21f6cd5 100644 --- a/bindings/rust/src/daemon/session.rs +++ b/bindings/rust/src/daemon/session.rs @@ -190,8 +190,7 @@ impl Session { if notification.source_session == Some(self.session_id) { continue; } - if let Err(e) = - write_message(&mut stream, &notification.message).await + if let Err(e) = write_message(&mut stream, &notification.message).await { error!("write error (notification): {e}"); break; @@ -462,9 +461,12 @@ impl Session { let (pairs, truncated) = db.outgoing_calls(&symbol_uri, depth); let edges = pairs .into_iter() - .map(|(from_uri, to_uri)| { - crate::query_graph::types::OutgoingCallEdge { from_uri, to_uri } - }) + .map( + |(from_uri, to_uri)| crate::query_graph::types::OutgoingCallEdge { + from_uri, + to_uri, + }, + ) .collect(); ServerMessage::OutgoingCallsResult { edges, truncated } } @@ -2069,8 +2071,7 @@ fn
process_query_sync( changed_file_uris, min_score, } => { - let (results, not_indexed_uris) = - db.blast_radius_batch(&changed_file_uris, min_score); + let (results, not_indexed_uris) = db.blast_radius_batch(&changed_file_uris, min_score); ok(ServerMessage::BlastRadiusBatchResult { results, not_indexed_uris, @@ -2089,9 +2090,12 @@ fn process_query_sync( let (pairs, truncated) = db.outgoing_calls(&symbol_uri, depth); let edges = pairs .into_iter() - .map(|(from_uri, to_uri)| { - crate::query_graph::types::OutgoingCallEdge { from_uri, to_uri } - }) + .map( + |(from_uri, to_uri)| crate::query_graph::types::OutgoingCallEdge { + from_uri, + to_uri, + }, + ) .collect(); ok(ServerMessage::OutgoingCallsResult { edges, truncated }) } diff --git a/bindings/rust/src/daemon/tier2_manager.rs b/bindings/rust/src/daemon/tier2_manager.rs index 4b7d7aa..98419e9 100644 --- a/bindings/rust/src/daemon/tier2_manager.rs +++ b/bindings/rust/src/daemon/tier2_manager.rs @@ -999,7 +999,10 @@ mod tests { } let envelope = notify_rx.try_recv().expect("should receive a broadcast"); - assert_eq!(envelope.source_session, None, "Tier 2 upgrades are system-originated"); + assert_eq!( + envelope.source_session, None, + "Tier 2 upgrades are system-originated" + ); match envelope.message { ServerMessage::SymbolUpgraded { uri, diff --git a/bindings/rust/src/indexer/symbol_extractor.rs b/bindings/rust/src/indexer/symbol_extractor.rs index c3aea44..72030d8 100644 --- a/bindings/rust/src/indexer/symbol_extractor.rs +++ b/bindings/rust/src/indexer/symbol_extractor.rs @@ -132,9 +132,7 @@ impl<'a> SymbolExtractor<'a> { (self.language, pk), (Language::Rust, "call_expression" | "macro_invocation") | ( - Language::TypeScript - | Language::JavaScript - | Language::JavaScriptReact, + Language::TypeScript | Language::JavaScript | Language::JavaScriptReact, "call_expression" | "new_expression" ) | (Language::Python, "call") @@ -266,8 +264,9 @@ impl<'a> SymbolExtractor<'a> { let name = self.node_text(&name_node); if 
!name.is_empty() { let modifiers = self.rust_modifiers(&node); - let is_exported = - modifiers.iter().any(|m| m == "pub" || m.starts_with("pub(")); + let is_exported = modifiers + .iter() + .any(|m| m == "pub" || m.starts_with("pub(")); let (vis, vc) = visibility::infer(name, &modifiers, self.language); let container = self.rust_container(&node); let signature = self.rust_signature(&node); @@ -364,8 +363,7 @@ impl<'a> SymbolExtractor<'a> { return None; } let body = node.child_by_field_name("body")?; - let text = - std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + let text = std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; Some(text.trim().to_owned()) } @@ -559,8 +557,7 @@ impl<'a> SymbolExtractor<'a> { return None; } let body = node.child_by_field_name("body")?; - let text = - std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + let text = std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; Some(text.trim().to_owned()) } @@ -681,8 +678,7 @@ impl<'a> SymbolExtractor<'a> { return None; } let body = node.child_by_field_name("body")?; - let text = - std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + let text = std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; // Strip the trailing `:` that separates signature from body. let trimmed = text.trim().trim_end_matches(':').trim_end(); Some(trimmed.to_owned()) @@ -815,7 +811,14 @@ impl<'a> SymbolExtractor<'a> { /// Collect Dart modifier keywords from a declaration node's direct children. 
fn dart_modifiers(&self, node: &Node) -> Vec { const KEYWORDS: &[&str] = &[ - "static", "abstract", "final", "const", "external", "factory", "late", "covariant", + "static", + "abstract", + "final", + "const", + "external", + "factory", + "late", + "covariant", ]; collect_matching_keywords(*node, KEYWORDS) } @@ -835,7 +838,7 @@ impl<'a> SymbolExtractor<'a> { .filter_map(|i| n.named_child(i)) .find(|c| c.kind() == "identifier") .map(|c| self.node_text(&c).trim().to_owned()); - if name.as_deref().map_or(false, |s| !s.is_empty()) { + if name.as_deref().is_some_and(|s| !s.is_empty()) { return name; } } @@ -851,8 +854,7 @@ impl<'a> SymbolExtractor<'a> { let body = node .child_by_field_name("body") .or_else(|| node.child_by_field_name("function_body"))?; - let text = - std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + let text = std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; Some(text.trim().to_owned()) } @@ -1107,7 +1109,9 @@ impl<'a> SymbolExtractor<'a> { return self.c_declarator_name(child); } for i in 0..node.named_child_count() { - let Some(c) = node.named_child(i) else { continue }; + let Some(c) = node.named_child(i) else { + continue; + }; if matches!( c.kind(), "identifier" @@ -1251,8 +1255,11 @@ impl<'a> SymbolExtractor<'a> { for i in 0..node.child_count() { let Some(child) = node.child(i) else { continue }; match child.kind() { - "storage_class_specifier" | "type_qualifier" | "function_specifier" - | "virtual_function_specifier" | "explicit_function_specifier" => { + "storage_class_specifier" + | "type_qualifier" + | "function_specifier" + | "virtual_function_specifier" + | "explicit_function_specifier" => { let t = self.node_text(&child).trim().to_owned(); if !t.is_empty() { mods.push(t); @@ -1271,8 +1278,7 @@ impl<'a> SymbolExtractor<'a> { return None; } let body = node.child_by_field_name("body")?; - let text = - std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + 
let text = std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; Some(text.trim().to_owned()) } @@ -1672,8 +1678,7 @@ impl<'a> SymbolExtractor<'a> { _ => SymbolKind::TypeAlias, }) .unwrap_or(SymbolKind::TypeAlias); - let (vis, vc) = - visibility::infer(&name, &[], self.language); + let (vis, vc) = visibility::infer(&name, &[], self.language); out.push(OwnedSymbolInfo { uri: self.lip_uri(&name), display_name: name.clone(), @@ -1699,8 +1704,7 @@ impl<'a> SymbolExtractor<'a> { if let Some(name_node) = spec.child_by_field_name("name") { let name = self.node_text(&name_node).to_owned(); if !name.is_empty() { - let (vis, vc) = - visibility::infer(&name, &[], self.language); + let (vis, vc) = visibility::infer(&name, &[], self.language); out.push(OwnedSymbolInfo { uri: self.lip_uri(&name), display_name: name.clone(), @@ -1734,8 +1738,7 @@ impl<'a> SymbolExtractor<'a> { return None; } let body = node.child_by_field_name("body")?; - let text = - std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + let text = std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; Some(text.trim().to_owned()) } @@ -1753,9 +1756,7 @@ impl<'a> SymbolExtractor<'a> { let ty = p.child_by_field_name("type")?; let ident = match ty.kind() { "type_identifier" => Some(ty), - "pointer_type" => ty - .named_child(0) - .filter(|c| c.kind() == "type_identifier"), + "pointer_type" => ty.named_child(0).filter(|c| c.kind() == "type_identifier"), _ => None, }; return ident.map(|n| self.node_text(&n).trim().to_owned()); @@ -1945,8 +1946,7 @@ impl<'a> SymbolExtractor<'a> { } // Kotlin bodies come via field "body" (block or expression). let body = node.child_by_field_name("body")?; - let text = - std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + let text = std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; // Strip trailing `=` that introduces an expression body. 
let trimmed = text.trim().trim_end_matches('=').trim_end(); Some(trimmed.to_owned()) @@ -2113,8 +2113,7 @@ impl<'a> SymbolExtractor<'a> { return None; } let body = node.child_by_field_name("body")?; - let text = - std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; + let text = std::str::from_utf8(&self.source[node.start_byte()..body.start_byte()]).ok()?; Some(text.trim().to_owned()) } @@ -2272,9 +2271,26 @@ fn kotlin_is_exported(node: Node) -> bool { /// Collect Kotlin modifier keywords from the `modifiers` child node. fn kotlin_modifiers(node: Node) -> Vec { const KEYWORDS: &[&str] = &[ - "private", "protected", "internal", "public", "abstract", "final", "open", "override", - "suspend", "inline", "external", "data", "sealed", "enum", "companion", "lateinit", - "const", "operator", "infix", "tailrec", + "private", + "protected", + "internal", + "public", + "abstract", + "final", + "open", + "override", + "suspend", + "inline", + "external", + "data", + "sealed", + "enum", + "companion", + "lateinit", + "const", + "operator", + "infix", + "tailrec", ]; for i in 0..node.named_child_count() { if let Some(child) = node.named_child(i) { diff --git a/bindings/rust/src/indexer/tier1.rs b/bindings/rust/src/indexer/tier1.rs index 510c9ff..08ed52f 100644 --- a/bindings/rust/src/indexer/tier1.rs +++ b/bindings/rust/src/indexer/tier1.rs @@ -303,7 +303,10 @@ mod tests { #[test] fn rust_signature_and_normalized() { - let syms = sym("pub fn add(x: i32, y: i32) -> i32 { x + y }", Language::Rust); + let syms = sym( + "pub fn add(x: i32, y: i32) -> i32 { x + y }", + Language::Rust, + ); let s = find(&syms, "add"); assert_eq!( s.signature.as_deref(), @@ -919,7 +922,10 @@ mod tests { #[test] fn ts_exported_function_modifier() { use crate::schema::Visibility; - let syms = sym("export function send(x: number): void {}", Language::TypeScript); + let syms = sym( + "export function send(x: number): void {}", + Language::TypeScript, + ); let s = find(&syms, 
"send"); assert!(s.modifiers.iter().any(|m| m == "export")); assert_eq!(s.visibility, Some(Visibility::Public)); @@ -944,13 +950,13 @@ mod tests { #[test] fn go_func_visibility_from_name() { use crate::schema::Visibility; - let syms = sym("package p\nfunc Exported(x int) bool { return true }", Language::Go); + let syms = sym( + "package p\nfunc Exported(x int) bool { return true }", + Language::Go, + ); let s = find(&syms, "Exported"); assert_eq!(s.visibility, Some(Visibility::Public)); - assert_eq!( - s.signature.as_deref(), - Some("func Exported(x int) bool") - ); + assert_eq!(s.signature.as_deref(), Some("func Exported(x int) bool")); } #[test] @@ -1041,9 +1047,7 @@ mod tests { // `obj.method()` — the property identifier is the callee. let src = "function demo(obj: any) { obj.method(); }"; let occs_list = occs(src, Language::TypeScript); - let callee = occs_list - .iter() - .find(|o| o.symbol_uri.contains("#method")); + let callee = occs_list.iter().find(|o| o.symbol_uri.contains("#method")); if let Some(c) = callee { assert_eq!( c.kind, diff --git a/bindings/rust/src/indexer/tier2/dart_ls.rs b/bindings/rust/src/indexer/tier2/dart_ls.rs index de96aec..857d1a6 100644 --- a/bindings/rust/src/indexer/tier2/dart_ls.rs +++ b/bindings/rust/src/indexer/tier2/dart_ls.rs @@ -293,7 +293,12 @@ impl DartBackend { is_exported, ..Default::default() }; - enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Dart); + enrich_v23( + &mut info, + sig.as_deref(), + sym.container.clone(), + Language::Dart, + ); symbols.push(info); } diff --git a/bindings/rust/src/indexer/tier2/enrich.rs b/bindings/rust/src/indexer/tier2/enrich.rs index 9ce29fb..9aefc9d 100644 --- a/bindings/rust/src/indexer/tier2/enrich.rs +++ b/bindings/rust/src/indexer/tier2/enrich.rs @@ -89,31 +89,92 @@ fn modifier_keywords(lang: Language) -> &'static [&'static str] { Language::Rust => &[ "pub", "const", "async", "unsafe", "extern", "static", "mut", "default", "move", ], - Language::TypeScript - 
| Language::JavaScript - | Language::JavaScriptReact => &[ - "export", "default", "async", "static", "readonly", "public", "private", "protected", - "abstract", "declare", "override", "const", "let", "var", + Language::TypeScript | Language::JavaScript | Language::JavaScriptReact => &[ + "export", + "default", + "async", + "static", + "readonly", + "public", + "private", + "protected", + "abstract", + "declare", + "override", + "const", + "let", + "var", ], Language::Python => &["async", "def"], Language::Dart => &[ - "static", "abstract", "final", "const", "external", "factory", "late", "covariant", + "static", + "abstract", + "final", + "const", + "external", + "factory", + "late", + "covariant", "async", ], Language::Go => &["func"], Language::Kotlin => &[ - "private", "protected", "internal", "public", "abstract", "final", "open", "override", - "suspend", "inline", "external", "data", "sealed", "enum", "companion", "lateinit", - "const", "operator", "infix", "tailrec", + "private", + "protected", + "internal", + "public", + "abstract", + "final", + "open", + "override", + "suspend", + "inline", + "external", + "data", + "sealed", + "enum", + "companion", + "lateinit", + "const", + "operator", + "infix", + "tailrec", ], Language::Swift => &[ - "private", "fileprivate", "internal", "public", "open", "static", "final", "override", - "mutating", "nonmutating", "class", "required", "convenience", "lazy", "weak", - "unowned", "dynamic", + "private", + "fileprivate", + "internal", + "public", + "open", + "static", + "final", + "override", + "mutating", + "nonmutating", + "class", + "required", + "convenience", + "lazy", + "weak", + "unowned", + "dynamic", ], Language::C | Language::Cpp => &[ - "static", "extern", "const", "virtual", "override", "explicit", "inline", "constexpr", - "private", "protected", "public", "friend", "mutable", "volatile", "register", + "static", + "extern", + "const", + "virtual", + "override", + "explicit", + "inline", + "constexpr", + 
"private", + "protected", + "public", + "friend", + "mutable", + "volatile", + "register", "typedef", ], Language::Unknown => &[], @@ -158,7 +219,10 @@ mod tests { #[test] fn ts_export_async() { - let mods = extract_modifiers("export async function foo(): Promise", Language::TypeScript); + let mods = extract_modifiers( + "export async function foo(): Promise", + Language::TypeScript, + ); assert_eq!(mods, vec!["export", "async"]); } @@ -188,7 +252,10 @@ mod tests { assert_eq!(s.visibility, Some(Visibility::Public)); assert_eq!(s.visibility_confidence, Some(1.0)); assert_eq!(s.modifiers, vec!["pub".to_owned()]); - assert_eq!(s.signature_normalized.as_deref(), Some("pub fn foo() -> i32")); + assert_eq!( + s.signature_normalized.as_deref(), + Some("pub fn foo() -> i32") + ); } #[test] @@ -217,21 +284,36 @@ mod tests { #[test] fn enrich_skips_empty_container() { let mut s = sym("foo"); - enrich_v23(&mut s, Some("fn foo()"), Some(String::new()), Language::Rust); + enrich_v23( + &mut s, + Some("fn foo()"), + Some(String::new()), + Language::Rust, + ); assert!(s.container_name.is_none()); } #[test] fn enrich_python_name_convention() { let mut s = sym("_helper"); - enrich_v23(&mut s, Some("def _helper() -> None"), None, Language::Python); + enrich_v23( + &mut s, + Some("def _helper() -> None"), + None, + Language::Python, + ); assert_eq!(s.visibility, Some(Visibility::Private)); } #[test] fn enrich_ts_no_modifier_is_low_conf_internal() { let mut s = sym("foo"); - enrich_v23(&mut s, Some("function foo(): void"), None, Language::TypeScript); + enrich_v23( + &mut s, + Some("function foo(): void"), + None, + Language::TypeScript, + ); assert_eq!(s.visibility, Some(Visibility::Internal)); assert_eq!(s.visibility_confidence, Some(0.5)); } diff --git a/bindings/rust/src/indexer/tier2/gopls.rs b/bindings/rust/src/indexer/tier2/gopls.rs index 956f26f..1da47c4 100644 --- a/bindings/rust/src/indexer/tier2/gopls.rs +++ b/bindings/rust/src/indexer/tier2/gopls.rs @@ -246,7 +246,12 @@ 
impl GoplsBackend { is_exported, ..Default::default() }; - enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Go); + enrich_v23( + &mut info, + sig.as_deref(), + sym.container.clone(), + Language::Go, + ); symbols.push(info); } diff --git a/bindings/rust/src/indexer/tier2/kotlin.rs b/bindings/rust/src/indexer/tier2/kotlin.rs index 8575f0e..c91df95 100644 --- a/bindings/rust/src/indexer/tier2/kotlin.rs +++ b/bindings/rust/src/indexer/tier2/kotlin.rs @@ -250,7 +250,12 @@ impl KotlinBackend { is_exported, ..Default::default() }; - enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Kotlin); + enrich_v23( + &mut info, + sig.as_deref(), + sym.container.clone(), + Language::Kotlin, + ); symbols.push(info); } diff --git a/bindings/rust/src/indexer/tier2/py_ls.rs b/bindings/rust/src/indexer/tier2/py_ls.rs index a2b5232..0bc07bc 100644 --- a/bindings/rust/src/indexer/tier2/py_ls.rs +++ b/bindings/rust/src/indexer/tier2/py_ls.rs @@ -323,7 +323,12 @@ impl PythonBackend { is_exported, ..Default::default() }; - enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Python); + enrich_v23( + &mut info, + sig.as_deref(), + sym.container.clone(), + Language::Python, + ); symbols.push(info); } diff --git a/bindings/rust/src/indexer/tier2/rust_analyzer.rs b/bindings/rust/src/indexer/tier2/rust_analyzer.rs index 7a7a606..0dc8112 100644 --- a/bindings/rust/src/indexer/tier2/rust_analyzer.rs +++ b/bindings/rust/src/indexer/tier2/rust_analyzer.rs @@ -413,7 +413,12 @@ impl RustAnalyzerBackend { is_exported, ..Default::default() }; - enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Rust); + enrich_v23( + &mut info, + sig.as_deref(), + sym.container.clone(), + Language::Rust, + ); symbols.push(info); } diff --git a/bindings/rust/src/indexer/tier2/swift_ls.rs b/bindings/rust/src/indexer/tier2/swift_ls.rs index f43087a..aa77d73 100644 --- a/bindings/rust/src/indexer/tier2/swift_ls.rs +++ 
b/bindings/rust/src/indexer/tier2/swift_ls.rs @@ -249,7 +249,12 @@ impl SwiftBackend { is_exported, ..Default::default() }; - enrich_v23(&mut info, sig.as_deref(), sym.container.clone(), Language::Swift); + enrich_v23( + &mut info, + sig.as_deref(), + sym.container.clone(), + Language::Swift, + ); symbols.push(info); } diff --git a/bindings/rust/src/indexer/tier2/ts_server.rs b/bindings/rust/src/indexer/tier2/ts_server.rs index b99b9bb..4eb160b 100644 --- a/bindings/rust/src/indexer/tier2/ts_server.rs +++ b/bindings/rust/src/indexer/tier2/ts_server.rs @@ -289,7 +289,8 @@ impl TypeScriptBackend { .as_deref() .map(|s| s.starts_with("export")) .unwrap_or(false); - let ts_lang = if uri.ends_with(".js") || uri.ends_with(".mjs") || uri.ends_with(".cjs") { + let ts_lang = if uri.ends_with(".js") || uri.ends_with(".mjs") || uri.ends_with(".cjs") + { Language::JavaScript } else if uri.ends_with(".jsx") { Language::JavaScriptReact diff --git a/bindings/rust/src/query_graph/db.rs b/bindings/rust/src/query_graph/db.rs index c4f5d6d..7091377 100644 --- a/bindings/rust/src/query_graph/db.rs +++ b/bindings/rust/src/query_graph/db.rs @@ -355,7 +355,7 @@ impl LipDatabase { // produces the double-slash convention used by tier-1 extractors // (`lip://local//abs/path`). let mut roots: Vec<&String> = self.registered_roots.iter().collect(); - roots.sort_by(|a, b| b.len().cmp(&a.len())); + roots.sort_by_key(|r| std::cmp::Reverse(r.len())); for root in &roots { let candidate = format!("lip://local/{}/{}", root, body); if self.file_inputs.contains_key(&candidate) { @@ -644,8 +644,7 @@ impl LipDatabase { // Forward twin: enables outgoing_impact_for to seed from a // SCIP descriptor URI when the back-fill kept the raw // tier-1 caller URI. v2.3.5. 
- let caller_name = - normalize_callee_name(extract_name(&edge.from_uri)).to_owned(); + let caller_name = normalize_callee_name(extract_name(&edge.from_uri)).to_owned(); if !caller_name.is_empty() { self.caller_name_to_callees .entry(caller_name) @@ -714,8 +713,7 @@ impl LipDatabase { let uri = self.canonicalize_uri(&uri); self.revision += 1; let rev = self.revision; - let module_id = - crate::query_graph::module_id::resolve_module_id(&uri, &language, &symbols); + let module_id = crate::query_graph::module_id::resolve_module_id(&uri, &language, &symbols); self.file_inputs.insert( uri.clone(), FileInput { @@ -791,7 +789,8 @@ impl LipDatabase { // Seed sym_cache so file_symbols() returns the pre-computed symbols. let syms = Arc::new(symbols); - self.sym_cache.insert(uri.clone(), Cached::new(syms.clone(), rev)); + self.sym_cache + .insert(uri.clone(), Cached::new(syms.clone(), rev)); // v2.3.2 Issue #1 — also index SCIP defs by their `display_name` // (not just URI fragment) so tier-1 back-fill's cross-file callee @@ -954,16 +953,14 @@ impl LipDatabase { .entry(from_uri.clone()) .or_default() .push(to_uri.clone()); - let callee_name = - normalize_callee_name(extract_name(&to_uri)).to_owned(); + let callee_name = normalize_callee_name(extract_name(&to_uri)).to_owned(); if !callee_name.is_empty() { self.callee_name_to_callers .entry(callee_name) .or_default() .push(from_uri.clone()); } - let caller_name = - normalize_callee_name(extract_name(&from_uri)).to_owned(); + let caller_name = normalize_callee_name(extract_name(&from_uri)).to_owned(); if !caller_name.is_empty() { self.caller_name_to_callees .entry(caller_name) @@ -1541,8 +1538,7 @@ impl LipDatabase { // so we can eyeball the URI-form mismatch. 
if uri_hits == 0 && name_hits == 0 { let uri_keys: Vec<&String> = self.callee_to_callers.keys().take(3).collect(); - let name_keys: Vec<&String> = - self.callee_name_to_callers.keys().take(10).collect(); + let name_keys: Vec<&String> = self.callee_name_to_callers.keys().take(10).collect(); let raw = extract_name(symbol_uri); let normalized = normalize_callee_name(raw); eprintln!( @@ -1710,8 +1706,7 @@ impl LipDatabase { ); if !has { // Dump up to 5 keys so we can compare against the insert log. - let keys: Vec<&String> = - self.file_edges_source.keys().take(5).collect(); + let keys: Vec<&String> = self.file_edges_source.keys().take(5).collect(); eprintln!( "[lip-debug-edges] file_edges_source sample keys (total={}): {:?}", self.file_edges_source.len(), @@ -1943,11 +1938,7 @@ impl LipDatabase { /// Walks `caller_to_callees` up to `depth` hops. Returns a flat /// `(caller, callee)` edge list and a `truncated` flag that is `true` /// when the node cap was hit. - pub fn outgoing_calls( - &self, - symbol_uri: &str, - depth: u32, - ) -> (Vec<(String, String)>, bool) { + pub fn outgoing_calls(&self, symbol_uri: &str, depth: u32) -> (Vec<(String, String)>, bool) { const NODE_LIMIT: usize = 200; let depth = depth.clamp(1, 8); @@ -2288,8 +2279,7 @@ impl LipDatabase { pub fn set_file_embedding(&mut self, uri: &str, vector: Vec, model: &str) { let uri = self.canonicalize_uri(uri); self.file_embeddings.insert(uri.clone(), vector); - self.file_embedding_models - .insert(uri, model.to_owned()); + self.file_embedding_models.insert(uri, model.to_owned()); } /// Retrieve the stored embedding vector for a file, if any. 
@@ -3137,7 +3127,10 @@ impl LipDatabase { kind_filter: Option<&[crate::schema::SymbolKind]>, scope: Option<&str>, modifier_filter: Option<&[String]>, - ) -> (Vec, Vec) { + ) -> ( + Vec, + Vec, + ) { use crate::query_graph::types::{MatchType, RankedSymbol}; let q_lower = query.to_lowercase(); @@ -4430,8 +4423,7 @@ impl Greeter { "rust".to_owned(), ); let unknown = "file:///project/ghost.rs".to_owned(); - let (results, not_indexed) = - db.blast_radius_batch(&[unknown.clone()], None); + let (results, not_indexed) = db.blast_radius_batch(std::slice::from_ref(&unknown), None); assert!(results.is_empty()); assert_eq!(not_indexed, vec![unknown]); } @@ -4865,7 +4857,10 @@ impl Greeter { Some(EdgesSource::ScipOnly), "edges_source should reflect root.rs (ScipOnly — pre-computed edges)" ); - assert!(result.semantic_items.is_empty(), "min_score=None → no enrichment"); + assert!( + result.semantic_items.is_empty(), + "min_score=None → no enrichment" + ); } // v2.3.3 — Bug-D-symmetric test: when the tier-1 back-fill keeps a @@ -5041,8 +5036,7 @@ impl Greeter { "pub fn exported() {}".to_owned(), "rust".to_owned(), ); - let (results, not_indexed) = - db.blast_radius_batch(&[lib_uri.clone()], None); + let (results, not_indexed) = db.blast_radius_batch(std::slice::from_ref(&lib_uri), None); assert!(not_indexed.is_empty()); for entry in &results { assert_eq!(entry.file_uri, lib_uri, "file_uri must trace back to input"); @@ -5085,7 +5079,10 @@ impl Greeter { db.sym_cache.remove(&uri); let syms_cold = db.file_symbols(&uri); // Must not fall through to Tier 1 (which would parse empty text). 
- assert!(syms_cold.is_empty(), "cold precomputed cache must not run Tier-1 parser"); + assert!( + syms_cold.is_empty(), + "cold precomputed cache must not run Tier-1 parser" + ); } // ── WS3: name consumption index ─────────────────────────────────────────── diff --git a/bindings/rust/src/query_graph/module_id.rs b/bindings/rust/src/query_graph/module_id.rs index 8b629c9..70ae625 100644 --- a/bindings/rust/src/query_graph/module_id.rs +++ b/bindings/rust/src/query_graph/module_id.rs @@ -64,7 +64,7 @@ fn from_slice_uri(uri: &str) -> Option { let after_manager = &rest[first_slash + 1..]; // Package extends to the first `@` (version marker) or `/` (path) — whichever comes first. let pkg_end = after_manager - .find(|c: char| c == '@' || c == '/') + .find(['@', '/']) .unwrap_or(after_manager.len()); let package = &after_manager[..pkg_end]; if manager.is_empty() || package.is_empty() { @@ -385,13 +385,18 @@ mod tests { #[test] fn slice_uri_rejects_local_scheme() { assert_eq!(from_slice_uri("lip://local/src/main.rs"), None); - assert_eq!(from_slice_uri("lip://local//Users/a/proj/src/main.rs"), None); + assert_eq!( + from_slice_uri("lip://local//Users/a/proj/src/main.rs"), + None + ); } #[test] fn scip_symbol_extracts_manager_and_name() { assert_eq!( - from_scip_symbol("scip-go gomod github.com/foo/bar v1.0 internal/query/SearchSymbols()."), + from_scip_symbol( + "scip-go gomod github.com/foo/bar v1.0 internal/query/SearchSymbols()." 
+ ), Some("gomod/github.com/foo/bar".to_owned()) ); assert_eq!( @@ -442,10 +447,7 @@ members = ["a", "b"] #[test] fn parse_go_mod_extracts_module_path() { let gomod = "module github.com/foo/bar\n\ngo 1.21\n"; - assert_eq!( - parse_go_mod(gomod), - Some("github.com/foo/bar".to_owned()) - ); + assert_eq!(parse_go_mod(gomod), Some("github.com/foo/bar".to_owned())); } #[test] @@ -513,11 +515,7 @@ name = "poetry-pkg" #[test] fn resolve_prefers_slice_uri_over_scip() { let scip_sym = sym("scip-go gomod github.com/foo/bar v1.0 pkg/Baz()."); - let id = resolve_module_id( - "lip://cargo/serde@1.0.0/src/lib.rs", - "rust", - &[scip_sym], - ); + let id = resolve_module_id("lip://cargo/serde@1.0.0/src/lib.rs", "rust", &[scip_sym]); assert_eq!(id, Some("cargo/serde".to_owned())); } diff --git a/bindings/rust/src/schema/signature.rs b/bindings/rust/src/schema/signature.rs index ddda146..13db4b3 100644 --- a/bindings/rust/src/schema/signature.rs +++ b/bindings/rust/src/schema/signature.rs @@ -333,10 +333,7 @@ mod tests { fn nested_parens_in_fn_type_arg() { // Callback params inside generic arg: outer paren strip only // at depth 1; inner should still get name stripped. - let got = normalize_signature( - "fn foo(cb: fn(x: i32) -> bool) -> ()", - Language::Rust, - ); + let got = normalize_signature("fn foo(cb: fn(x: i32) -> bool) -> ()", Language::Rust); // Inner `x:` is at paren depth 2, so NOT stripped by the current // depth-1-only rule. Record this limitation as the expected output. 
assert_eq!(got, "fn foo(_: fn(x: i32) -> bool) -> ()"); diff --git a/bindings/rust/src/schema/types.rs b/bindings/rust/src/schema/types.rs index 6f14542..893f62a 100644 --- a/bindings/rust/src/schema/types.rs +++ b/bindings/rust/src/schema/types.rs @@ -624,7 +624,10 @@ mod tests { is_test: false, }; let json = serde_json::to_string(&occ).unwrap(); - assert!(!json.contains("\"kind\""), "kind:unknown must be skipped: {json}"); + assert!( + !json.contains("\"kind\""), + "kind:unknown must be skipped: {json}" + ); assert!( !json.contains("\"is_test\""), "is_test:false must be skipped: {json}" diff --git a/bindings/rust/src/schema/visibility.rs b/bindings/rust/src/schema/visibility.rs index c5a3eba..ace7c99 100644 --- a/bindings/rust/src/schema/visibility.rs +++ b/bindings/rust/src/schema/visibility.rs @@ -18,9 +18,9 @@ use crate::schema::Visibility; pub fn infer(name: &str, modifiers: &[String], lang: Language) -> (Visibility, u8) { match lang { Language::Rust => infer_rust(modifiers), - Language::TypeScript - | Language::JavaScript - | Language::JavaScriptReact => infer_ts_js(modifiers), + Language::TypeScript | Language::JavaScript | Language::JavaScriptReact => { + infer_ts_js(modifiers) + } Language::Python => infer_python(name), Language::Dart => infer_dart(name), Language::Go => infer_go(name), @@ -172,10 +172,7 @@ mod tests { #[test] fn rust_no_modifier_is_private() { - assert_eq!( - infer("foo", &[], Language::Rust), - (Visibility::Private, 50) - ); + assert_eq!(infer("foo", &[], Language::Rust), (Visibility::Private, 50)); } // TypeScript / JavaScript @@ -264,18 +261,12 @@ mod tests { // Go #[test] fn go_capital_is_public() { - assert_eq!( - infer("Foo", &[], Language::Go), - (Visibility::Public, 80) - ); + assert_eq!(infer("Foo", &[], Language::Go), (Visibility::Public, 80)); } #[test] fn go_lowercase_is_private() { - assert_eq!( - infer("foo", &[], Language::Go), - (Visibility::Private, 80) - ); + assert_eq!(infer("foo", &[], Language::Go), 
(Visibility::Private, 80)); } // Kotlin @@ -339,10 +330,7 @@ mod tests { #[test] fn c_no_modifier_is_public_lowconf() { - assert_eq!( - infer("foo", &[], Language::C), - (Visibility::Public, 50) - ); + assert_eq!(infer("foo", &[], Language::C), (Visibility::Public, 50)); } // Unknown diff --git a/bindings/rust/tests/integration.rs b/bindings/rust/tests/integration.rs index 9c495db..2dac431 100644 --- a/bindings/rust/tests/integration.rs +++ b/bindings/rust/tests/integration.rs @@ -310,9 +310,9 @@ async fn daemon_restart_restores_journal() { send( &mut client, &ClientMessage::QueryWorkspaceSymbols { - kind_filter: None, - scope: None, - modifier_filter: None, + kind_filter: None, + scope: None, + modifier_filter: None, query: "persisted".into(), limit: Some(10), }, @@ -1292,11 +1292,13 @@ pub fn caller() { .await .expect("send workspace query"); let callee_uri = match recv(&mut client).await.expect("recv workspace") { - ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols - .into_iter() - .find(|s| s.display_name == "callee") - .expect("expected 'callee' in workspace symbols") - .uri, + ServerMessage::WorkspaceSymbolsResult { symbols, .. } => { + symbols + .into_iter() + .find(|s| s.display_name == "callee") + .expect("expected 'callee' in workspace symbols") + .uri + } other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), }; @@ -1374,11 +1376,13 @@ async fn daemon_tier1_test_file_stamps_is_test() { .await .expect("send workspace query"); let helper_uri = match recv(&mut client).await.expect("recv workspace") { - ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols - .into_iter() - .find(|s| s.display_name == "helper") - .expect("expected 'helper' in workspace symbols") - .uri, + ServerMessage::WorkspaceSymbolsResult { symbols, .. 
} => { + symbols + .into_iter() + .find(|s| s.display_name == "helper") + .expect("expected 'helper' in workspace symbols") + .uri + } other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), }; @@ -1472,11 +1476,13 @@ pub fn caller() { .await .expect("send workspace"); let victim_uri = match recv(&mut client).await.expect("recv workspace") { - ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols - .into_iter() - .find(|s| s.display_name == "victim") - .expect("expected `victim` in workspace") - .uri, + ServerMessage::WorkspaceSymbolsResult { symbols, .. } => { + symbols + .into_iter() + .find(|s| s.display_name == "victim") + .expect("expected `victim` in workspace") + .uri + } other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), }; @@ -1499,7 +1505,11 @@ pub fn caller() { // The file that defines `victim` — the enrichment's anchor. assert_eq!(enriched.file_uri, a_uri); assert!( - enriched.static_result.affected_files.iter().any(|f| f == b_uri), + enriched + .static_result + .affected_files + .iter() + .any(|f| f == b_uri), "expected caller file {b_uri} in affected_files, got {:?}", enriched.static_result.affected_files ); @@ -1577,11 +1587,13 @@ fn c() {} .await .expect("send workspace"); let a_uri = match recv(&mut client).await.expect("recv workspace") { - ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols - .into_iter() - .find(|s| s.display_name == "a") - .expect("expected `a` in workspace symbols") - .uri, + ServerMessage::WorkspaceSymbolsResult { symbols, .. 
} => { + symbols + .into_iter() + .find(|s| s.display_name == "a") + .expect("expected `a` in workspace symbols") + .uri + } other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), }; @@ -1601,11 +1613,15 @@ fn c() {} }; assert!(!truncated, "chain is tiny; truncated must be false"); assert!( - edges.iter().any(|e| e.from_uri == a_uri && e.to_uri.ends_with("#b")), + edges + .iter() + .any(|e| e.from_uri == a_uri && e.to_uri.ends_with("#b")), "expected A→B edge; got {edges:?}", ); assert!( - edges.iter().any(|e| e.from_uri.ends_with("#b") && e.to_uri.ends_with("#c")), + edges + .iter() + .any(|e| e.from_uri.ends_with("#b") && e.to_uri.ends_with("#c")), "expected B→C edge at depth=2; got {edges:?}", ); @@ -1721,7 +1737,10 @@ pub async fn handle() {} .map(|(r, _)| r) .expect("expected Handler in ranked list"); assert!( - matches!(exact.match_type, lip_core::query_graph::types::MatchType::Exact), + matches!( + exact.match_type, + lip_core::query_graph::types::MatchType::Exact + ), "Handler should be Exact match, got {:?}", exact.match_type, ); @@ -1733,7 +1752,10 @@ pub async fn handle() {} .map(|(r, _)| r) .expect("expected HandlerFactory in ranked list"); assert!( - matches!(prefix.match_type, lip_core::query_graph::types::MatchType::Prefix), + matches!( + prefix.match_type, + lip_core::query_graph::types::MatchType::Prefix + ), "HandlerFactory should be Prefix match, got {:?}", prefix.match_type, ); @@ -2030,11 +2052,13 @@ async fn daemon_blast_radius_edges_source_tier1() { .await .unwrap(); let target_uri = match recv(&mut client).await.unwrap() { - ServerMessage::WorkspaceSymbolsResult { symbols, .. } => symbols - .into_iter() - .find(|s| s.display_name == "target") - .expect("target not in workspace") - .uri, + ServerMessage::WorkspaceSymbolsResult { symbols, .. 
} => { + symbols + .into_iter() + .find(|s| s.display_name == "target") + .expect("target not in workspace") + .uri + } other => panic!("expected WorkspaceSymbolsResult, got {other:?}"), }; diff --git a/tools/lip-cli/src/cmd/import.rs b/tools/lip-cli/src/cmd/import.rs index 231960e..8035884 100644 --- a/tools/lip-cli/src/cmd/import.rs +++ b/tools/lip-cli/src/cmd/import.rs @@ -350,10 +350,7 @@ fn collect_verify_samples(deltas: &[OwnedDelta], max: usize) -> Vec anyhow::Result<()> { +async fn run_verification(stream: &mut UnixStream, samples: &[VerifySample]) -> anyhow::Result<()> { if samples.is_empty() { eprintln!("verify: no documents to sample"); return Ok(()); @@ -402,16 +399,11 @@ async fn run_verification( match ws { ServerMessage::WorkspaceSymbolsResult { symbols, .. } if !symbols.is_empty() => {} ServerMessage::WorkspaceSymbolsResult { .. } => { - eprintln!( - "verify: probe '{name}' missing in {}", - sample.file_uri - ); + eprintln!("verify: probe '{name}' missing in {}", sample.file_uri); failures += 1; } other => { - eprintln!( - "verify: unexpected reply to QueryWorkspaceSymbols({name}): {other:?}" - ); + eprintln!("verify: unexpected reply to QueryWorkspaceSymbols({name}): {other:?}"); failures += 1; } } @@ -427,10 +419,7 @@ async fn run_verification( Ok(()) } -async fn round_trip( - stream: &mut UnixStream, - msg: &ClientMessage, -) -> anyhow::Result { +async fn round_trip(stream: &mut UnixStream, msg: &ClientMessage) -> anyhow::Result { let body = serde_json::to_vec(msg)?; stream.write_all(&(body.len() as u32).to_be_bytes()).await?; stream.write_all(&body).await?; @@ -583,9 +572,7 @@ fn convert_symbol_info( (Some(v), Some(c as f32 / 100.0)) }; - let signature_normalized = signature - .as_deref() - .map(|s| normalize_signature(s, lang)); + let signature_normalized = signature.as_deref().map(|s| normalize_signature(s, lang)); let container_name = Some(sym.enclosing_symbol.clone()) .filter(|s| !s.is_empty()) @@ -640,28 +627,83 @@ fn 
scip_language_to_lip(lang: &str) -> Language { fn parse_modifiers_from_signature(signature: &str, lang: Language) -> Vec { let keywords: &[&str] = match lang { Language::Rust => &["pub", "const", "async", "unsafe", "extern", "static", "mut"], - Language::TypeScript - | Language::JavaScript - | Language::JavaScriptReact => &[ - "export", "default", "async", "static", "readonly", "public", "private", "protected", - "abstract", "declare", "override", + Language::TypeScript | Language::JavaScript | Language::JavaScriptReact => &[ + "export", + "default", + "async", + "static", + "readonly", + "public", + "private", + "protected", + "abstract", + "declare", + "override", ], Language::Dart => &[ - "static", "abstract", "final", "const", "external", "factory", "late", "covariant", + "static", + "abstract", + "final", + "const", + "external", + "factory", + "late", + "covariant", ], Language::Kotlin => &[ - "private", "protected", "internal", "public", "abstract", "final", "open", "override", - "suspend", "inline", "external", "data", "sealed", "companion", "lateinit", "const", - "operator", "infix", "tailrec", + "private", + "protected", + "internal", + "public", + "abstract", + "final", + "open", + "override", + "suspend", + "inline", + "external", + "data", + "sealed", + "companion", + "lateinit", + "const", + "operator", + "infix", + "tailrec", ], Language::Swift => &[ - "private", "fileprivate", "internal", "public", "open", "static", "final", "override", - "mutating", "nonmutating", "required", "convenience", "lazy", "weak", "unowned", + "private", + "fileprivate", + "internal", + "public", + "open", + "static", + "final", + "override", + "mutating", + "nonmutating", + "required", + "convenience", + "lazy", + "weak", + "unowned", "dynamic", ], Language::C | Language::Cpp => &[ - "static", "extern", "const", "virtual", "override", "explicit", "inline", "constexpr", - "private", "protected", "public", "friend", "mutable", "volatile", + "static", + "extern", + "const", 
+ "virtual", + "override", + "explicit", + "inline", + "constexpr", + "private", + "protected", + "public", + "friend", + "mutable", + "volatile", ], // Python / Go / Unknown: prefix-parse not meaningful; rely on name rules. _ => &[], @@ -1087,8 +1129,7 @@ mod tests { documentation: vec!["render(): void".to_owned()], relationships: vec![], kind: scip::Kind::KMethod as i32, - enclosing_symbol: - "scip-typescript npm react 18.2.0 src/App.ts`App#".to_owned(), + enclosing_symbol: "scip-typescript npm react 18.2.0 src/App.ts`App#".to_owned(), }; let out = sym_with("typescript", proto); // Last descriptor segment of the enclosing symbol becomes container name. @@ -1175,8 +1216,7 @@ mod tests { documentation: vec!["pub fn bar()".to_owned()], relationships: vec![], kind: scip::Kind::KMethod as i32, - enclosing_symbol: - "rust-analyzer cargo mycrate 1.0 Mod/MyStruct#".to_owned(), + enclosing_symbol: "rust-analyzer cargo mycrate 1.0 Mod/MyStruct#".to_owned(), }], }], external_symbols: vec![], @@ -1189,8 +1229,7 @@ mod tests { let doc = &decoded.documents[0]; let sym = &doc.symbols[0]; assert_eq!( - sym.enclosing_symbol, - "rust-analyzer cargo mycrate 1.0 Mod/MyStruct#", + sym.enclosing_symbol, "rust-analyzer cargo mycrate 1.0 Mod/MyStruct#", "field 8 enclosing_symbol lost across prost encode/decode" ); @@ -1232,16 +1271,14 @@ mod tests { #[test] fn scip_write_wins_over_read_when_both_set() { - let bits = - scip::SymbolRole::WriteAccess as i32 | scip::SymbolRole::ReadAccess as i32; + let bits = scip::SymbolRole::WriteAccess as i32 | scip::SymbolRole::ReadAccess as i32; let o = convert_occurrence(&occ_with_roles(bits)).expect("occurrence converts"); assert_eq!(o.kind, ReferenceKind::Write); } #[test] fn scip_test_bit_sets_is_test() { - let bits = - scip::SymbolRole::Test as i32 | scip::SymbolRole::ReadAccess as i32; + let bits = scip::SymbolRole::Test as i32 | scip::SymbolRole::ReadAccess as i32; let o = convert_occurrence(&occ_with_roles(bits)).expect("occurrence converts"); 
assert!(o.is_test); assert_eq!(o.kind, ReferenceKind::Read); @@ -1250,8 +1287,7 @@ mod tests { #[test] fn scip_definition_keeps_kind_unknown() { // SCIP does not set read/write on definitions; kind stays Unknown. - let bits = - scip::SymbolRole::Definition as i32 | scip::SymbolRole::WriteAccess as i32; + let bits = scip::SymbolRole::Definition as i32 | scip::SymbolRole::WriteAccess as i32; let o = convert_occurrence(&occ_with_roles(bits)).expect("occurrence converts"); assert_eq!(o.role, Role::Definition); assert_eq!(o.kind, ReferenceKind::Unknown); @@ -1269,8 +1305,14 @@ mod tests { #[test] fn strip_file_scheme_trims_scheme_and_trailing_slash() { - assert_eq!(strip_file_scheme("file:///Users/lisa/repo"), "/Users/lisa/repo"); - assert_eq!(strip_file_scheme("file:///Users/lisa/repo/"), "/Users/lisa/repo"); + assert_eq!( + strip_file_scheme("file:///Users/lisa/repo"), + "/Users/lisa/repo" + ); + assert_eq!( + strip_file_scheme("file:///Users/lisa/repo/"), + "/Users/lisa/repo" + ); // Non-file URLs are returned verbatim (minus trailing slash). assert_eq!(strip_file_scheme("/already/absolute"), "/already/absolute"); assert_eq!(strip_file_scheme(""), ""); From d625e1c82c769e229e35cb848c69633dc6a24a8d Mon Sep 17 00:00:00 2001 From: Lisa Date: Sat, 25 Apr 2026 00:52:20 +0200 Subject: [PATCH 18/18] docs: add v2.3.5 CHANGELOG entry Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb9d739..d332980 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,20 @@ All notable changes to this project are documented here. 
## [Unreleased] +## [2.3.5] — 2026-04-25 + +**Forward-direction name-bridge symmetry.** Fixes `QueryOutgoingImpact` returning empty `direct_items` when the seed `symbol_uri` was indexed by a different provider than its callees — typically a SCIP-descriptor seed (`…#AnalyzeImpact().`) whose outgoing edges were recorded in tier-1 form (`…#AnalyzeImpact`), or vice versa. The reverse direction (`QueryBlastRadiusSymbol`) already handled this via the `callee_name_to_callers` name-bridge added in v2.3.2; the forward direction only consulted `caller_to_callees` by URI-exact match, so any caller-side SCIP/tier-1 mismatch produced zero direct hits even though `QueryBlastRadiusSymbol` on the same symbol worked. `protocol_version` stays at `2`; this is a pure correctness fix with no wire-shape change. + +### Added + +- **`caller_name_to_callees: HashMap<String, Vec<String>>`** — forward-direction twin of `callee_name_to_callers`. Populated at all three edge-insertion sites (`upsert_file`, `upsert_file_precomputed` SCIP path, `upsert_file_precomputed` tier-1 back-fill) using `normalize_callee_name(extract_name(&edge.from_uri))` so SCIP descriptor sigils (`()`, `.`, `#`, `:`) are stripped to a shared identifier key. `remove_file_call_edges` drops caller-side entries symmetrically with the existing callee-side cleanup. `outgoing_impact_for`'s BFS now consults both indexes per hop — URI-exact lookup via `caller_to_callees` plus name-bridge lookup via `caller_name_to_callees` — matching the structural shape of `blast_radius_for` Phase 2. Regression test `outgoing_impact_name_bridge_for_tier1_caller_uri` seeds a SCIP-descriptor caller whose edges were recorded in tier-1 identifier form and asserts the callee surfaces on the wire. 
+ +### Changed + +- **Rust 1.95 clippy hygiene.** Fixed five newly-enforced lints across `query_graph/db.rs`, `query_graph/module_id.rs`, and `indexer/symbol_extractor.rs`: `unnecessary_map_or`, `unnecessary_sort_by`, `manual_pattern_char_comparison`, and two `cloned_ref_to_slice_refs`. `cargo fmt --all` normalized 19 files touched by v2.3.x work. No behavior change. + +--- + ## [2.3.4] — 2026-04-24 **Module-level grouping on impact items.** Adds `module_id: Option<String>` to `ImpactItem` and `SemanticImpactItem` so consumers whose risk classifier weights cross-module blast (CKB's `RecomputeBlastRadius.ModuleCount`) get a useful value instead of the conservative zero that the unioned static-plus-LIP set previously collapsed to. `protocol_version` stays at `2`; the field is `#[serde(default, skip_serializing_if = "Option::is_none")]`, so the wire shape stays byte-identical for emitters that don't populate it and deserialises cleanly on pre-v2.3.4 clients.