From c0bd99a336e200ee0c5934d349cceee3e4a8cfcf Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 07:55:25 -0500 Subject: [PATCH 01/29] Add elbow detection and chunk deduplication utilities Introduces elbow detection utilities using the Kneedle algorithm for adaptive threshold computation in hybrid search (elbow_detection.py). Adds a high-performance chunk deduplication module with exact and substring-based deduplication logic (chunk_deduplication.py), ported from ChunkHound to Context-Engine. --- scripts/hybrid/elbow_detection.py | 249 ++++++++++++++++++++++++++ scripts/ingest/chunk_deduplication.py | 189 +++++++++++++++++++ 2 files changed, 438 insertions(+) create mode 100644 scripts/hybrid/elbow_detection.py create mode 100644 scripts/ingest/chunk_deduplication.py diff --git a/scripts/hybrid/elbow_detection.py b/scripts/hybrid/elbow_detection.py new file mode 100644 index 00000000..52fda978 --- /dev/null +++ b/scripts/hybrid/elbow_detection.py @@ -0,0 +1,249 @@ +"""Elbow detection utilities for adaptive threshold computation. + +Implements the Kneedle algorithm (Satopaa et al. 2011) for finding elbow points +in score curves. Used for adaptive threshold computation in hybrid search. + +Ported from ChunkHound to Context-Engine. 
+ +Usage: + from scripts.hybrid.elbow_detection import compute_elbow_threshold, find_elbow_kneedle + + # With raw scores + scores = [0.95, 0.88, 0.45, 0.42, 0.40] + threshold = compute_elbow_threshold(scores) + + # With search results (dicts with 'score' or 'rerank_score' keys) + results = [{"score": 0.95}, {"score": 0.88}, {"score": 0.45}] + threshold = compute_elbow_threshold(results) + + # Filter results by elbow threshold + filtered = [r for r in results if r.get("score", 0) >= threshold] +""" + +from __future__ import annotations + +import logging +from typing import Sequence, Union + +import numpy as np + +logger = logging.getLogger(__name__) + + +def find_elbow_kneedle(sorted_scores: Sequence[float]) -> int | None: + """Find elbow point in score curve using simplified Kneedle algorithm. + + Implementation based on Kneedle algorithm (Satopaa et al. 2011): + 1. Normalize scores to [0,1] + 2. Draw line from first to last point + 3. Find point with maximum perpendicular distance to line + 4. That's the elbow/knee point + + Args: + sorted_scores: Scores sorted DESCENDING (highest to lowest) + + Returns: + Index of elbow point (0-based array index), or None if no clear elbow detected. + Return value can be used to threshold: scores[:elbow_idx+1] are above elbow. 
+ + Examples: + >>> scores = [0.95, 0.92, 0.88, 0.45, 0.42, 0.40] # Clear drop at index 2 + >>> find_elbow_kneedle(scores) + 2 # Select first 3 items (indices 0, 1, 2) + + >>> scores = [0.5, 0.5, 0.5, 0.5] # All identical + >>> find_elbow_kneedle(scores) + None # No elbow + + >>> scores = [0.9, 0.8] # Too few points + >>> find_elbow_kneedle(scores) + None # Need at least 3 points + """ + if len(sorted_scores) < 3: + logger.debug("Kneedle: Too few points (<3), cannot detect elbow") + return None # Need at least 3 points for elbow + + # Extract scores as numpy array + scores = np.array(sorted_scores) + + # Normalize scores to [0, 1] + min_score = scores.min() + max_score = scores.max() + if max_score == min_score: + logger.debug("Kneedle: All scores identical, no elbow") + return None # All scores identical, no elbow + + normalized_scores = (scores - min_score) / (max_score - min_score) + + # X-axis: normalized positions [0, 1] + x = np.linspace(0, 1, len(normalized_scores)) + + # Draw line from first point to last point + # Line equation: y = mx + b + x1, y1 = x[0], normalized_scores[0] + x2, y2 = x[-1], normalized_scores[-1] + + # Handle vertical line case (shouldn't happen with normalized x) + if x2 == x1: + logger.debug("Kneedle: Vertical line case, no elbow") + return None + + m = (y2 - y1) / (x2 - x1) + b = y1 - m * x1 + + # Compute perpendicular distance from each point to line + # Formula: |mx - y + b| / sqrt(m^2 + 1) + numerator = np.abs(m * x - normalized_scores + b) + denominator = np.sqrt(m**2 + 1) + distances = numerator / denominator + + # Find point with maximum distance (that's the elbow) + elbow_idx = int(np.argmax(distances)) + + # Validate elbow is significant (distance > 1% of normalized range) + if distances[elbow_idx] < 0.01: + logger.debug( + f"Kneedle: Elbow not significant (distance={distances[elbow_idx]:.4f} < 0.01)" + ) + return None # Elbow not significant enough + + logger.debug( + f"Kneedle: Found elbow at index {elbow_idx} " + 
f"(distance={distances[elbow_idx]:.4f}, score={sorted_scores[elbow_idx]:.3f})" + ) + + # Return 0-based index (for array slicing: scores[:elbow_idx+1]) + return elbow_idx + + +def compute_elbow_threshold( + chunks_or_scores: Union[Sequence[dict], Sequence[float]], + score_key: str = "score", + fallback_score_key: str = "rerank_score", +) -> float: + """Compute elbow threshold from chunks or scores using Kneedle algorithm. + + Uses the Kneedle algorithm (Satopaa et al. 2011) to detect the elbow point + in the score distribution. Falls back to median if Kneedle fails to find + a significant elbow. + + Args: + chunks_or_scores: Either: + - List of chunks (dicts with score_key) + - List of raw float scores + score_key: Primary key to extract scores from dicts (default: "score") + fallback_score_key: Fallback key if primary not found (default: "rerank_score") + + Returns: + Threshold value (score at elbow point, or median if no elbow) + + Examples: + >>> chunks = [{'score': 0.95}, {'score': 0.88}] + >>> compute_elbow_threshold(chunks) + 0.88 + + >>> scores = [0.95, 0.88, 0.45, 0.42] + >>> compute_elbow_threshold(scores) + 0.45 + + >>> # With rerank scores + >>> chunks = [{'rerank_score': 0.95}, {'rerank_score': 0.45}] + >>> compute_elbow_threshold(chunks, score_key="rerank_score") + 0.45 + """ + # Handle empty input + if not chunks_or_scores: + return 0.5 # Default threshold + + # Extract scores from chunks or use raw scores + if isinstance(chunks_or_scores[0], dict): + # Type narrowing: if first element is dict, all are dicts + chunk_list: Sequence[dict] = chunks_or_scores # type: ignore[assignment] + scores = [] + for c in chunk_list: + # Try primary key, then fallback, then 0.0 + score = c.get(score_key) + if score is None: + score = c.get(fallback_score_key, 0.0) + scores.append(float(score)) + else: + # Type narrowing: if first element is not dict, all are floats + scores = [float(s) for s in chunks_or_scores] + + if not scores: + return 0.5 + + sorted_scores = 
sorted(scores, reverse=True) + + # Try Kneedle algorithm first + elbow_idx = find_elbow_kneedle(sorted_scores) + if elbow_idx is not None and elbow_idx < len(sorted_scores): + threshold = float(sorted_scores[elbow_idx]) + logger.debug( + f"Elbow threshold: {threshold:.3f} (Kneedle at index {elbow_idx} " + f"of {len(scores)} scores)" + ) + return threshold + + # Fallback to median if Kneedle fails + median_idx = len(sorted_scores) // 2 + threshold = float(sorted_scores[median_idx]) + logger.debug( + f"Elbow threshold: {threshold:.3f} (median fallback, " + f"Kneedle found no significant elbow in {len(scores)} scores)" + ) + return threshold + + +def filter_by_elbow( + results: Sequence[dict], + score_key: str = "score", + fallback_score_key: str = "rerank_score", + min_results: int = 1, +) -> list[dict]: + """Filter results using elbow detection for adaptive thresholding. + + Args: + results: List of result dicts with score fields + score_key: Primary key to extract scores (default: "score") + fallback_score_key: Fallback key if primary not found (default: "rerank_score") + min_results: Minimum number of results to return (default: 1) + + Returns: + Filtered list of results above elbow threshold + + Example: + >>> results = [ + ... {"id": 1, "score": 0.95}, + ... {"id": 2, "score": 0.88}, + ... {"id": 3, "score": 0.45}, # <- elbow here + ... {"id": 4, "score": 0.42}, + ... 
] + >>> filtered = filter_by_elbow(results) + >>> len(filtered) + 3 # Only items above elbow threshold (0.45) + """ + if not results: + return [] + + threshold = compute_elbow_threshold(results, score_key, fallback_score_key) + + filtered = [] + for r in results: + score = r.get(score_key) + if score is None: + score = r.get(fallback_score_key, 0.0) + if float(score) >= threshold: + filtered.append(r) + + # Ensure minimum results + if len(filtered) < min_results and len(results) >= min_results: + # Return top min_results by score + sorted_results = sorted( + results, + key=lambda x: float(x.get(score_key) or x.get(fallback_score_key, 0.0)), + reverse=True + ) + return sorted_results[:min_results] + + return filtered if filtered else results[:min_results] diff --git a/scripts/ingest/chunk_deduplication.py b/scripts/ingest/chunk_deduplication.py new file mode 100644 index 00000000..176cd817 --- /dev/null +++ b/scripts/ingest/chunk_deduplication.py @@ -0,0 +1,189 @@ +"""High-performance chunk deduplication with O(n log n) complexity. + +Two-stage deduplication: +1. Exact content matching via hash table (O(n)) +2. Substring detection via sorted interval scan (O(n log n)) + +Ported from ChunkHound to Context-Engine. 
+""" + +from __future__ import annotations + +import logging +from collections import defaultdict +from typing import Sequence, TypeVar + +import xxhash + +logger = logging.getLogger(__name__) + +T = TypeVar("T", bound=dict) + +# Specificity ranking (higher = more specific, keep over lower) +CONCEPT_SPECIFICITY = { + # Context-Engine chunk types + "function": 4, + "method": 4, + "class": 4, + "interface": 4, + "struct": 4, + "enum": 4, + "type_alias": 3, + "import": 3, + "comment": 2, + "block": 1, + "array": 1, + "structure": 0, + # CAST+ concept types (from concept_extractor) + "DEFINITION": 4, + "IMPORT": 3, + "COMMENT": 2, + "BLOCK": 1, + "STRUCTURE": 0, +} + + +def normalize_content(content: str) -> str: + """Normalize content for consistent comparison.""" + return content.replace("\r\n", "\n").replace("\r", "\n").strip() + + +def get_chunk_specificity(chunk: dict) -> int: + """Get specificity ranking for chunk's type. Higher = more specific.""" + chunk_type = chunk.get("chunk_type") or chunk.get("concept") or chunk.get("type", "") + if isinstance(chunk_type, str): + type_name = chunk_type.lower() + elif hasattr(chunk_type, "value"): + type_name = str(chunk_type.value).lower() + elif hasattr(chunk_type, "name"): + type_name = chunk_type.name.lower() + else: + type_name = str(chunk_type).lower() if chunk_type else "" + + return CONCEPT_SPECIFICITY.get(type_name, -1) + + +def deduplicate_chunks( + chunks: Sequence[T], + language: str | None = None, + content_key: str = "code", +) -> list[T]: + """Deduplicate chunks using hash-based exact match + interval-based substring detection. 
+ + Args: + chunks: List of chunk dictionaries + language: Optional language for language-specific exemptions + content_key: Key to extract content from chunks (default: "code") + + Returns: + Deduplicated list of chunks + """ + if not chunks: + return [] + + # Language exemptions: Vue and Haskell preserve duplicates + if language and language.lower() in ("vue", "vue_template", "haskell"): + return list(chunks) + + # Stage 1: Exact content deduplication via hash table (O(n)) + exact_deduplicated = _deduplicate_exact_content(chunks, content_key) + + # Stage 2: Substring detection via interval scan (O(n log n)) + final = _remove_substring_overlaps(exact_deduplicated, content_key) + + logger.debug( + f"Deduplication: {len(chunks)} -> {len(exact_deduplicated)} (exact) -> {len(final)} (substring)" + ) + + return final + + +def _deduplicate_exact_content(chunks: Sequence[T], content_key: str) -> list[T]: + """Remove chunks with identical normalized content, keeping highest specificity.""" + hash_to_chunks: dict[int, list[T]] = defaultdict(list) + + for chunk in chunks: + content = chunk.get(content_key, "") + if not content: + content = chunk.get("content", "") or chunk.get("text", "") + + normalized = normalize_content(content) + if not normalized: + continue + + content_hash = xxhash.xxh3_64(normalized.encode("utf-8")).intdigest() + hash_to_chunks[content_hash].append(chunk) + + result = [] + for chunk_list in hash_to_chunks.values(): + if len(chunk_list) == 1: + result.append(chunk_list[0]) + else: + best = max( + chunk_list, + key=lambda c: ( + get_chunk_specificity(c), + -(c.get("end_line", 0) - c.get("start_line", 0)), + ), + ) + result.append(best) + + return result + + +def _remove_substring_overlaps(chunks: Sequence[T], content_key: str) -> list[T]: + """Remove BLOCK chunks that are substrings of DEFINITION/STRUCTURE chunks.""" + definitions = [] + blocks = [] + other = [] + + for chunk in chunks: + specificity = get_chunk_specificity(chunk) + if specificity == 
1: # BLOCK-like + blocks.append(chunk) + elif specificity >= 3: # DEFINITION-like + definitions.append(chunk) + else: + other.append(chunk) + + definitions.sort(key=lambda c: c.get("start_line", 0)) + + final = other + definitions + + for block in blocks: + block_content = normalize_content( + block.get(content_key, "") or block.get("content", "") or block.get("text", "") + ) + block_start = block.get("start_line", 0) + block_end = block.get("end_line", 0) + + is_substring = False + for definition in _find_overlapping(definitions, block_start, block_end): + def_content = normalize_content( + definition.get(content_key, "") or definition.get("content", "") or definition.get("text", "") + ) + if block_content in def_content and len(block_content) < len(def_content): + is_substring = True + break + + if not is_substring: + final.append(block) + + return final + + +def _find_overlapping(sorted_chunks: list[T], query_start: int, query_end: int) -> list[T]: + """Find chunks whose line ranges overlap with [query_start, query_end].""" + overlapping = [] + for chunk in sorted_chunks: + chunk_start = chunk.get("start_line", 0) + chunk_end = chunk.get("end_line", 0) + + if chunk_end < query_start: + continue + if chunk_start > query_end: + break + + overlapping.append(chunk) + + return overlapping From d9ee48f238a4690093bef3949f2974f1cc35ed86 Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 07:58:58 -0500 Subject: [PATCH 02/29] Add improved O(n log n) chunk deduplication with substring detection Introduces a new deduplication method using O(n log n) substring detection in both CASTPlusChunker and SearchOptimizedChunker, falling back to legacy methods if unavailable. Adds deduplicate_semantic_chunks to chunk_deduplication.py for more accurate and efficient deduplication of SemanticChunk objects. 
--- scripts/ingest/cast_chunker.py | 19 ++++++++--- scripts/ingest/chunk_deduplication.py | 46 +++++++++++++++++++++++++++ scripts/ingest/search_chunker.py | 12 +++++-- 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/scripts/ingest/cast_chunker.py b/scripts/ingest/cast_chunker.py index 05155e4c..db8f0f8a 100644 --- a/scripts/ingest/cast_chunker.py +++ b/scripts/ingest/cast_chunker.py @@ -181,7 +181,7 @@ def _non_whitespace_chars(self, text: str) -> int: # Deduplication # ------------------------------------------------------------------------- def _deduplicate_chunks(self, chunks: List[SemanticChunk]) -> List[SemanticChunk]: - """Remove chunks with identical content, keeping most specific.""" + """Remove chunks with identical content, keeping most specific (legacy).""" if not self.config.deduplicate or not chunks: return chunks @@ -189,7 +189,6 @@ def _deduplicate_chunks(self, chunks: List[SemanticChunk]) -> List[SemanticChunk for chunk in chunks: key = chunk.content.strip() if key in seen_content: - # Keep the more specific one (DEFINITION > BLOCK > COMMENT) existing = seen_content[key] priority = {ConceptType.DEFINITION: 3, ConceptType.BLOCK: 2, ConceptType.COMMENT: 1, ConceptType.IMPORT: 2, @@ -201,6 +200,18 @@ def _deduplicate_chunks(self, chunks: List[SemanticChunk]) -> List[SemanticChunk return list(seen_content.values()) + def _deduplicate_chunks_v2( + self, chunks: List[SemanticChunk], language: str + ) -> List[SemanticChunk]: + """O(n log n) deduplication with substring detection.""" + if not self.config.deduplicate or not chunks: + return chunks + try: + from scripts.ingest.chunk_deduplication import deduplicate_semantic_chunks + return deduplicate_semantic_chunks(chunks, language) + except ImportError: + return self._deduplicate_chunks(chunks) + # ------------------------------------------------------------------------- # Merge Logic # ------------------------------------------------------------------------- @@ -604,8 +615,8 @@ def chunk( 
parent=None, )] - # Step 2: Deduplicate - chunks = self._deduplicate_chunks(chunks) + # Step 2: Deduplicate (O(n log n) with substring detection) + chunks = self._deduplicate_chunks_v2(chunks, language) # Step 3: Group by concept type by_concept: Dict[ConceptType, List[SemanticChunk]] = {} diff --git a/scripts/ingest/chunk_deduplication.py b/scripts/ingest/chunk_deduplication.py index 176cd817..4971ae49 100644 --- a/scripts/ingest/chunk_deduplication.py +++ b/scripts/ingest/chunk_deduplication.py @@ -187,3 +187,49 @@ def _find_overlapping(sorted_chunks: list[T], query_start: int, query_end: int) overlapping.append(chunk) return overlapping + + +def deduplicate_semantic_chunks( + chunks: Sequence, + language: str | None = None, +) -> list: + """Deduplicate SemanticChunk objects using O(n log n) algorithm. + + Converts SemanticChunk dataclass objects to dicts, deduplicates, + and returns the original objects. + + Args: + chunks: List of SemanticChunk objects (with content, start_line, end_line, concept) + language: Optional language for exemptions (Vue, Haskell) + + Returns: + Deduplicated list of SemanticChunk objects + """ + if not chunks: + return [] + + chunk_dicts = [] + for i, c in enumerate(chunks): + concept = getattr(c, "concept", None) + if concept is not None: + if hasattr(concept, "value"): + concept_str = concept.value + elif hasattr(concept, "name"): + concept_str = concept.name + else: + concept_str = str(concept) + else: + concept_str = "" + + chunk_dicts.append({ + "content": getattr(c, "content", ""), + "start_line": getattr(c, "start_line", 0), + "end_line": getattr(c, "end_line", 0), + "concept": concept_str, + "_idx": i, + }) + + deduped_dicts = deduplicate_chunks(chunk_dicts, language, content_key="content") + + kept_indices = {d["_idx"] for d in deduped_dicts} + return [c for i, c in enumerate(chunks) if i in kept_indices] diff --git a/scripts/ingest/search_chunker.py b/scripts/ingest/search_chunker.py index f56b2f91..bb287087 100644 --- 
a/scripts/ingest/search_chunker.py +++ b/scripts/ingest/search_chunker.py @@ -190,7 +190,7 @@ def chunk(self, content: str, language: str) -> List[ChunkResult]: return [self._content_to_result(content, 1, len(content.splitlines()))] if self.config.deduplicate: - chunks = self._deduplicate(chunks) + chunks = self._deduplicate_v2(chunks, language) chunks = self._split_oversized(chunks, content) chunks = self._merge_compatible(chunks, content) @@ -322,7 +322,7 @@ def _classify_concept(self, text: str, kind: Optional[str]) -> ConceptType: return ConceptType.BLOCK # Default def _deduplicate(self, chunks: List[SemanticChunk]) -> List[SemanticChunk]: - """Remove chunks with identical content.""" + """Remove chunks with identical content (legacy, hash-based).""" result = [] for chunk in chunks: if chunk.content_hash not in self._seen_hashes: @@ -330,6 +330,14 @@ def _deduplicate(self, chunks: List[SemanticChunk]) -> List[SemanticChunk]: result.append(chunk) return result + def _deduplicate_v2(self, chunks: List[SemanticChunk], language: str) -> List[SemanticChunk]: + """Remove chunks using O(n log n) deduplication with substring detection.""" + try: + from scripts.ingest.chunk_deduplication import deduplicate_semantic_chunks + return deduplicate_semantic_chunks(chunks, language) + except ImportError: + return self._deduplicate(chunks) + def _split_oversized(self, chunks: List[SemanticChunk], content: str) -> List[SemanticChunk]: """Split chunks that exceed size limits.""" result = [] From 4da21771dbeaa00269dc3f27c98ee73e9e9c72d8 Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 07:59:07 -0500 Subject: [PATCH 03/29] Create termination.py --- scripts/hybrid/termination.py | 126 ++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 scripts/hybrid/termination.py diff --git a/scripts/hybrid/termination.py b/scripts/hybrid/termination.py new file mode 100644 index 00000000..6dcc85e8 --- /dev/null +++ 
b/scripts/hybrid/termination.py @@ -0,0 +1,126 @@ +"""Smart termination conditions for iterative search operations. + +Implements 5 termination conditions from ChunkHound's multi-hop strategy: +1. Time limit (default 5 seconds) +2. Result limit (default 500 chunks) +3. Candidate quality (need N+ high-scoring for expansion) +4. Score degradation (stop if tracked scores drop by threshold) +5. Minimum relevance (stop if top-N min score below threshold) +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass, field +from typing import Dict, List, Tuple, Sequence + +logger = logging.getLogger(__name__) + + +@dataclass +class TerminationConfig: + time_limit: float = 5.0 + result_limit: int = 500 + min_candidates_for_expansion: int = 5 + score_degradation_threshold: float = 0.15 + min_relevance_score: float = 0.3 + top_n_to_track: int = 5 + + +class TerminationChecker: + """Checks 5 termination conditions for iterative search operations.""" + + def __init__(self, config: TerminationConfig | None = None): + self.config = config or TerminationConfig() + self.start_time = time.perf_counter() + self.tracked_chunk_scores: Dict[str, float] = {} + self.iteration = 0 + + def reset(self) -> None: + self.start_time = time.perf_counter() + self.tracked_chunk_scores.clear() + self.iteration = 0 + + def elapsed(self) -> float: + return time.perf_counter() - self.start_time + + def check( + self, + results: Sequence[dict], + score_key: str = "score", + id_key: str = "chunk_id", + ) -> Tuple[bool, str]: + """Check all termination conditions. + + Returns: + (should_terminate, reason) - reason is empty string if should continue + """ + self.iteration += 1 + + # 1. Time limit + if self.elapsed() >= self.config.time_limit: + logger.debug(f"Termination: time limit {self.config.time_limit}s reached") + return True, "time_limit" + + # 2. 
Result limit + if len(results) >= self.config.result_limit: + logger.debug(f"Termination: result limit {self.config.result_limit} reached") + return True, "result_limit" + + # 3. Insufficient high-scoring candidates + high_scoring = [r for r in results if r.get(score_key, 0) > 0] + if len(high_scoring) < self.config.min_candidates_for_expansion: + logger.debug( + f"Termination: insufficient candidates " + f"({len(high_scoring)} < {self.config.min_candidates_for_expansion})" + ) + return True, "insufficient_candidates" + + # Sort by score descending + sorted_results = sorted(results, key=lambda x: -x.get(score_key, 0)) + top_n = sorted_results[:self.config.top_n_to_track] + + # 4. Score degradation - track specific chunks across iterations + if self.tracked_chunk_scores: + max_drop = 0.0 + for chunk_id, prev_score in self.tracked_chunk_scores.items(): + current_score = next( + (r.get(score_key, 0) for r in results if r.get(id_key) == chunk_id), + 0.0 + ) + if current_score < prev_score: + max_drop = max(max_drop, prev_score - current_score) + + if max_drop >= self.config.score_degradation_threshold: + logger.debug( + f"Termination: score degradation {max_drop:.3f} >= " + f"{self.config.score_degradation_threshold}" + ) + return True, "score_degradation" + + # Update tracked scores for next iteration + self.tracked_chunk_scores.clear() + for r in top_n: + chunk_id = r.get(id_key) + if chunk_id: + self.tracked_chunk_scores[chunk_id] = r.get(score_key, 0) + + # 5. 
Minimum relevance - stop if top-N min score too low + if top_n: + min_score = min(r.get(score_key, 0) for r in top_n) + if min_score < self.config.min_relevance_score: + logger.debug( + f"Termination: min relevance {min_score:.3f} < " + f"{self.config.min_relevance_score}" + ) + return True, "min_relevance" + + return False, "" + + def get_stats(self) -> Dict[str, any]: + return { + "iterations": self.iteration, + "elapsed_seconds": round(self.elapsed(), 3), + "tracked_chunks": len(self.tracked_chunk_scores), + } From 19993f56216bcab913b48420e4cf95516acefe71 Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 08:49:49 -0500 Subject: [PATCH 04/29] Integrate unified language mappings and improve analysis Adds concept-based extraction to ast_analyzer using declarative tree-sitter queries for 32+ languages, supporting universal concepts (definition, block, comment, import, structure). Updates language mapping classes for correct language keys, improves Go and TypeScript import queries, and enhances Redis connection handling with pooling and retries. Introduces elbow detection filtering for hybrid search, adds multi-hop chunk similarity search in Qdrant, and improves upload bundle manifest validation. Includes comprehensive tests for language mappings and analyzer integration. 
--- .env.example | 5 + docs/CONFIGURATION.md | 1 + scripts/ast_analyzer.py | 533 +++++++++++++++++- scripts/hybrid/qdrant.py | 122 +++- scripts/hybrid/termination.py | 8 - scripts/hybrid_search.py | 26 + scripts/ingest/language_mappings/go.py | 8 +- .../ingest/language_mappings/javascript.py | 2 +- scripts/ingest/language_mappings/jsx.py | 2 +- scripts/ingest/language_mappings/svelte.py | 2 +- scripts/ingest/language_mappings/tsx.py | 2 +- .../ingest/language_mappings/typescript.py | 8 +- scripts/ingest/language_mappings/vue.py | 2 +- scripts/upload_service.py | 20 +- scripts/workspace_state.py | 44 +- tests/test_ast_analyzer_mappings.py | 533 ++++++++++++++++++ 16 files changed, 1265 insertions(+), 53 deletions(-) create mode 100644 tests/test_ast_analyzer_mappings.py diff --git a/.env.example b/.env.example index 7e23581d..562d84b6 100644 --- a/.env.example +++ b/.env.example @@ -149,6 +149,11 @@ SEMANTIC_EXPANSION_CACHE_TTL=3600 # HYBRID_RECENCY_WEIGHT=0.1 # RERANK_EXPAND=1 +# Elbow detection filter: adaptive threshold based on score distribution (Kneedle algorithm) +# Filters out low-relevance results by detecting the "elbow" point in the score curve +# Improves precision by only returning results above the natural relevance drop-off +# HYBRID_ELBOW_FILTER=0 + # Caching (embeddings and search results) # MAX_EMBED_CACHE=16384 # HYBRID_RESULTS_CACHE=128 diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 0e62d3d6..fe1f1575 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -523,6 +523,7 @@ Useful for Kubernetes deployments where a shared filesystem is not reliable. 
| CODEBASE_STATE_REDIS_LOCK_WAIT_MS | Redis lock wait in ms | 2000 | | CODEBASE_STATE_REDIS_SOCKET_TIMEOUT | Redis socket timeout in seconds | 2 | | CODEBASE_STATE_REDIS_CONNECT_TIMEOUT | Redis connect timeout in seconds | 2 | +| CODEBASE_STATE_REDIS_MAX_CONNECTIONS | Redis connection pool size limit | 10 | ### Semantic Expansion diff --git a/scripts/ast_analyzer.py b/scripts/ast_analyzer.py index 0b87e443..4b16c5f7 100644 --- a/scripts/ast_analyzer.py +++ b/scripts/ast_analyzer.py @@ -26,6 +26,19 @@ logger = logging.getLogger("ast_analyzer") +# --------------------------------------------------------------------------- +# Language Mappings Integration +# --------------------------------------------------------------------------- +# Context-Engine's unified concept-based extraction supporting 32 languages. +# Uses declarative tree-sitter queries organized by semantic concept type: +# DEFINITION, BLOCK, COMMENT, IMPORT, STRUCTURE +_LANGUAGE_MAPPINGS_AVAILABLE = False +try: + from scripts.ingest.language_mappings import get_mapping, supported_languages as lm_supported_languages, ConceptType + _LANGUAGE_MAPPINGS_AVAILABLE = True +except ImportError: + pass + # Optional tree-sitter support - tree-sitter 0.25+ API _TS_LANGUAGES: Dict[str, Any] = {} _TS_AVAILABLE = False @@ -131,6 +144,27 @@ class CodeSymbol: parent: Optional[str] = None # Parent class/module complexity: int = 0 # Cyclomatic complexity estimate content_hash: Optional[str] = None + concept: Optional[str] = None # Universal concept type (definition, block, comment, etc.) + + +@dataclass +class ConceptUnit: + """A semantic code unit with universal concept classification. 
+ + Context-Engine's 5 universal concepts for language-agnostic analysis: + - DEFINITION: functions, classes, types, constants + - BLOCK: control flow, scoped regions + - COMMENT: comments, docstrings + - IMPORT: import/include statements + - STRUCTURE: file-level organization + """ + concept: str # definition, block, comment, import, structure + name: str + content: str + start_line: int + end_line: int + kind: str = "" # More specific: function, class, if, for, etc. + metadata: Dict[str, Any] = field(default_factory=dict) @dataclass @@ -223,7 +257,13 @@ def analyze_file( logger.error(f"Failed to read {file_path}: {e}") return self._empty_analysis() - # Route to appropriate analyzer + # Use language mappings (32 languages, declarative queries) + if _LANGUAGE_MAPPINGS_AVAILABLE and self.use_tree_sitter: + result = self._analyze_with_mapping(content, file_path, language) + if result and (result.get("symbols") or result.get("imports")): + return result + + # Fallback to legacy per-language analyzers if language == "python": return self._analyze_python(content, file_path) elif language in ("javascript", "typescript") and self.use_tree_sitter: @@ -396,6 +436,497 @@ def extract_dependencies( "local": list(set(local)) } + # ---- Language Mappings Analysis (unified, concept-based) ---- + + def _analyze_with_mapping(self, content: str, file_path: str, language: str) -> Dict[str, Any]: + """Analyze code using language mappings (concept-based extraction). + + This uses the declarative tree-sitter queries from language_mappings + to extract symbols, imports, and calls. Supports 34 languages. 
+ """ + if not _LANGUAGE_MAPPINGS_AVAILABLE: + return self._empty_analysis() + + try: + mapping = get_mapping(language) + except (TypeError, Exception) as e: + logger.debug(f"Mapping instantiation failed for {language}: {e}") + return self._empty_analysis() + + if not mapping: + return self._empty_analysis() + + # Get parser for this language + parser = self._get_ts_parser(language) + if not parser: + return self._empty_analysis() + + try: + tree = parser.parse(content.encode("utf-8")) + root = tree.root_node + except Exception as e: + logger.debug(f"Tree-sitter parse failed for {language}: {e}") + return self._empty_analysis() + + content_bytes = content.encode("utf-8") + symbols: List[CodeSymbol] = [] + imports: List[ImportReference] = [] + calls: List[CallReference] = [] + + # Get tree-sitter language object for queries + ts_lang = _TS_LANGUAGES.get(language) or _TS_LANGUAGES.get(self._normalize_lang(language)) + if not ts_lang: + return self._empty_analysis() + + try: + from tree_sitter import Query, QueryCursor + except ImportError: + return self._empty_analysis() + + # Extract DEFINITION concepts -> symbols + def_query_str = mapping.get_query_for_concept(ConceptType.DEFINITION) + if def_query_str: + try: + query = Query(ts_lang, def_query_str) + cursor = QueryCursor(query) + seen_ranges: Set[Tuple[int, int]] = set() + + for match in cursor.matches(root): + _, captures_dict = match + main_node = None + name_node = None + + for capture_name, nodes in captures_dict.items(): + if not nodes: + continue + node = nodes[0] + if capture_name in ("definition", "function_def", "class_def", + "method_def", "type_def", "const_def"): + main_node = node + elif capture_name in ("name", "function_name", "class_name", + "method_name", "type_name", "const_name"): + name_node = node + elif main_node is None: + main_node = node + + if main_node is None: + continue + + range_key = (main_node.start_byte, main_node.end_byte) + if range_key in seen_ranges: + continue + 
seen_ranges.add(range_key) + + # Extract name + if name_node: + name = content_bytes[name_node.start_byte:name_node.end_byte].decode("utf-8", errors="replace") + else: + name = self._extract_name_from_ts_node(main_node, content_bytes) + + # Infer kind from node type + kind = self._node_type_to_kind(main_node.type) + + # Extract docstring if available + docstring = self._extract_ts_docstring(main_node, content_bytes) + + # Extract signature + signature = self._extract_ts_signature(main_node, content_bytes, name, kind) + + # Extract decorators (for Python, etc.) + decorators = self._extract_ts_decorators(main_node, content_bytes) + + # Determine parent + parent = self._find_ts_parent_name(main_node, content_bytes) + + symbols.append(CodeSymbol( + name=name, + kind=kind, + start_line=main_node.start_point[0] + 1, + end_line=main_node.end_point[0] + 1, + path=f"{parent}.{name}" if parent else name, + docstring=docstring, + signature=signature, + decorators=decorators, + parent=parent, + )) + except Exception as e: + logger.debug(f"DEFINITION query failed for {language}: {e}") + + # Extract IMPORT concepts -> imports + import_query_str = mapping.get_query_for_concept(ConceptType.IMPORT) + if import_query_str: + try: + query = Query(ts_lang, import_query_str) + cursor = QueryCursor(query) + seen_ranges: Set[Tuple[int, int]] = set() + + for match in cursor.matches(root): + _, captures_dict = match + main_node = None + path_node = None + + for capture_name, nodes in captures_dict.items(): + if not nodes: + continue + node = nodes[0] + # Look for import path specifically + if capture_name in ("import_path", "path", "module", "source"): + path_node = node + # Look for import statement container + elif capture_name in ("import", "import_from", "import_statement", + "import_spec", "import_declaration", + "include", "require", "use", "definition"): + if main_node is None or node.start_byte < main_node.start_byte: + main_node = node + + # Use path_node if available for cleaner 
import text + import_node = path_node or main_node + if import_node is None: + continue + + range_key = (import_node.start_byte, import_node.end_byte) + if range_key in seen_ranges: + continue + seen_ranges.add(range_key) + + import_text = content_bytes[import_node.start_byte:import_node.end_byte].decode("utf-8", errors="replace") + module, names, is_from = self._parse_import_text(import_text, language) + + # If path_node was used directly, the text might be just the path + if not module and path_node: + module = import_text.strip().strip('"\'') + + if module: + imports.append(ImportReference( + module=module, + names=names, + line=import_node.start_point[0] + 1, + is_from=is_from, + )) + except Exception as e: + logger.debug(f"IMPORT query failed for {language}: {e}") + + # Extract calls by walking the tree for call expressions + calls = self._extract_calls_from_tree(root, content_bytes, symbols, language) + + # Extract all concepts for comprehensive analysis + concepts: List[ConceptUnit] = [] + for concept_type in ConceptType: + query_str = mapping.get_query_for_concept(concept_type) + if not query_str: + continue + try: + query = Query(ts_lang, query_str) + cursor = QueryCursor(query) + seen: Set[Tuple[int, int]] = set() + + for match in cursor.matches(root): + _, captures_dict = match + main_node = None + name_node = None + + for cname, nodes in captures_dict.items(): + if not nodes: + continue + node = nodes[0] + if cname in ("definition", "block", "import", "comment", "structure"): + main_node = node + elif cname == "name" or cname.endswith("_name"): + name_node = node + elif main_node is None: + main_node = node + + if main_node is None: + continue + + rkey = (main_node.start_byte, main_node.end_byte) + if rkey in seen: + continue + seen.add(rkey) + + if name_node: + name = content_bytes[name_node.start_byte:name_node.end_byte].decode("utf-8", errors="replace") + else: + name = self._extract_name_from_ts_node(main_node, content_bytes) + + unit_content = 
content_bytes[main_node.start_byte:main_node.end_byte].decode("utf-8", errors="replace") + + concepts.append(ConceptUnit( + concept=concept_type.value, + name=name, + content=unit_content, + start_line=main_node.start_point[0] + 1, + end_line=main_node.end_point[0] + 1, + kind=self._node_type_to_kind(main_node.type), + )) + except Exception as e: + logger.debug(f"{concept_type.value} query failed for {language}: {e}") + + return { + "symbols": symbols, + "imports": imports, + "calls": calls, + "concepts": concepts, # All semantic units by concept type + "language": language, + } + + def _normalize_lang(self, language: str) -> str: + """Normalize language name to tree-sitter key.""" + lang = language.lower().strip() + aliases = { + "js": "javascript", "jsx": "javascript", + "ts": "typescript", "tsx": "typescript", + "c++": "cpp", "cxx": "cpp", + "c#": "csharp", "cs": "csharp", + "shell": "bash", "sh": "bash", + } + return aliases.get(lang, lang) + + def _extract_name_from_ts_node(self, node, content_bytes: bytes) -> str: + """Extract name from tree-sitter node.""" + # Try field 'name' first + if hasattr(node, 'child_by_field_name'): + name_node = node.child_by_field_name('name') + if name_node: + return content_bytes[name_node.start_byte:name_node.end_byte].decode("utf-8", errors="replace") + + # Look for identifier child + for i in range(node.child_count): + child = node.child(i) + if child and child.type in ("identifier", "name", "type_identifier"): + return content_bytes[child.start_byte:child.end_byte].decode("utf-8", errors="replace") + + return f"anonymous_{node.start_point[0] + 1}" + + def _node_type_to_kind(self, node_type: str) -> str: + """Map tree-sitter node type to symbol kind.""" + mapping = { + # Functions + "function_definition": "function", + "async_function_definition": "function", + "function_declaration": "function", + "arrow_function": "function", + "function_item": "function", + "generator_function_declaration": "function", + # Methods + 
"method_definition": "method", + "method_declaration": "method", + # Classes + "class_definition": "class", + "class_declaration": "class", + "class_specifier": "class", + # Structs (Go, Rust, C/C++) + "struct_item": "struct", + "struct_specifier": "struct", + "type_declaration": "struct", # Go uses this for struct/interface + "type_spec": "struct", + # Interfaces + "interface_declaration": "interface", + "interface_type": "interface", + # Types + "type_alias_declaration": "type", + "type_item": "type", + # Enums + "enum_declaration": "enum", + "enum_item": "enum", + # Rust-specific + "impl_item": "impl", + "trait_item": "trait", + "mod_item": "module", + # Constants/Variables + "const_item": "constant", + "const_declaration": "constant", + "variable_declaration": "variable", + "lexical_declaration": "variable", + # Imports + "import_statement": "import", + "import_declaration": "import", + "import_spec": "import", + # Comments + "comment": "comment", + "block_comment": "comment", + "line_comment": "comment", + # Control flow (for BLOCK concepts) + "if_statement": "if", + "for_statement": "for", + "while_statement": "while", + "try_statement": "try", + "switch_statement": "switch", + "match_expression": "match", + } + return mapping.get(node_type, "symbol") + + def _extract_ts_docstring(self, node, content_bytes: bytes) -> Optional[str]: + """Extract docstring from node body.""" + body = node.child_by_field_name('body') if hasattr(node, 'child_by_field_name') else None + if not body: + return None + + for i in range(min(2, body.child_count)): + child = body.child(i) + if child and child.type == "expression_statement": + for j in range(child.child_count): + expr = child.child(j) + if expr and expr.type == "string": + text = content_bytes[expr.start_byte:expr.end_byte].decode("utf-8", errors="replace") + # Strip quotes + if text.startswith('"""') or text.startswith("'''"): + return text[3:-3].strip() + elif text.startswith('"') or text.startswith("'"): + return 
text[1:-1].strip() + return None + + def _extract_ts_signature(self, node, content_bytes: bytes, name: str, kind: str) -> str: + """Build signature from node.""" + if kind in ("function", "method"): + params_node = node.child_by_field_name('parameters') if hasattr(node, 'child_by_field_name') else None + if params_node: + params_text = content_bytes[params_node.start_byte:params_node.end_byte].decode("utf-8", errors="replace") + return f"def {name}{params_text}" + return f"def {name}()" + elif kind == "class": + return f"class {name}" + return name + + def _extract_ts_decorators(self, node, content_bytes: bytes) -> List[str]: + """Extract decorators from preceding siblings.""" + decorators = [] + prev = node.prev_sibling + while prev and prev.type == "decorator": + dec_text = content_bytes[prev.start_byte:prev.end_byte].decode("utf-8", errors="replace") + dec_name = dec_text.lstrip("@").split("(")[0] + decorators.insert(0, dec_name) + prev = prev.prev_sibling + return decorators + + def _find_ts_parent_name(self, node, content_bytes: bytes) -> Optional[str]: + """Find parent class/module name.""" + parent = node.parent + while parent: + if parent.type in ("class_definition", "class_declaration", "class_specifier", + "impl_item", "module"): + name_node = parent.child_by_field_name('name') if hasattr(parent, 'child_by_field_name') else None + if name_node: + return content_bytes[name_node.start_byte:name_node.end_byte].decode("utf-8", errors="replace") + parent = parent.parent + return None + + def _parse_import_text(self, text: str, language: str) -> Tuple[str, List[str], bool]: + """Parse import statement text to extract module and names.""" + text = text.strip() + + # Python: from X import Y or import X + if language == "python": + if text.startswith("from "): + match = re.match(r"from\s+([\w.]+)\s+import\s+(.+)", text) + if match: + module = match.group(1) + names_str = match.group(2) + names = [n.strip().split(" as ")[0] for n in names_str.split(",")] + return 
module, names, True + elif text.startswith("import "): + match = re.match(r"import\s+([\w.]+)", text) + if match: + return match.group(1), [], False + + # JavaScript/TypeScript: import X from 'Y' or require('Y') + elif language in ("javascript", "typescript", "jsx", "tsx"): + if "from" in text: + match = re.search(r"from\s+['\"]([^'\"]+)['\"]", text) + if match: + return match.group(1), [], True + elif "require" in text: + match = re.search(r"require\s*\(\s*['\"]([^'\"]+)['\"]", text) + if match: + return match.group(1), [], False + + # Go: import "path" + elif language == "go": + match = re.search(r'"([^"]+)"', text) + if match: + return match.group(1), [], False + + # Rust: use path::to::module + elif language == "rust": + match = re.match(r"use\s+([\w:]+)", text) + if match: + return match.group(1), [], False + + # Java/Kotlin: import package.Class; + elif language in ("java", "kotlin"): + match = re.match(r"import\s+([\w.]+);?", text) + if match: + return match.group(1), [], False + + # C/C++: #include
or #include "header" + elif language in ("c", "cpp"): + match = re.search(r'#include\s*[<"]([^>"]+)[>"]', text) + if match: + return match.group(1), [], False + + # C#: using Namespace; + elif language == "csharp": + match = re.match(r"using\s+([\w.]+);?", text) + if match: + return match.group(1), [], False + + # Generic: try to find quoted string + match = re.search(r"['\"]([^'\"]+)['\"]", text) + if match: + return match.group(1), [], False + + return "", [], False + + def _extract_calls_from_tree(self, root, content_bytes: bytes, symbols: List[CodeSymbol], language: str) -> List[CallReference]: + """Walk tree to extract function calls.""" + calls: List[CallReference] = [] + symbol_ranges = [(s.start_line, s.end_line, s.path or s.name) for s in symbols] + + def find_enclosing_symbol(line: int) -> str: + for start, end, path in symbol_ranges: + if start <= line <= end: + return path + return "" + + def walk(node): + node_type = node.type + + # Call expressions + if node_type in ("call", "call_expression", "function_call", "method_call"): + func_node = node.child_by_field_name('function') if hasattr(node, 'child_by_field_name') else None + if not func_node: + # Try first child + for i in range(node.child_count): + child = node.child(i) + if child and child.type in ("identifier", "member_expression", "attribute"): + func_node = child + break + + if func_node: + callee = content_bytes[func_node.start_byte:func_node.end_byte].decode("utf-8", errors="replace") + # Clean up callee (get last part of attribute access) + if "." 
in callee: + callee = callee.split(".")[-1] + + line = node.start_point[0] + 1 + caller = find_enclosing_symbol(line) + + calls.append(CallReference( + caller=caller, + callee=callee, + line=line, + context="call", + )) + + # Recurse + for i in range(node.child_count): + child = node.child(i) + if child: + walk(child) + + walk(root) + return calls + # ---- Python-specific analysis (using ast module) ---- def _analyze_python(self, content: str, file_path: str) -> Dict[str, Any]: diff --git a/scripts/hybrid/qdrant.py b/scripts/hybrid/qdrant.py index 33925c47..20a77ae8 100644 --- a/scripts/hybrid/qdrant.py +++ b/scripts/hybrid/qdrant.py @@ -844,30 +844,105 @@ def multi_granular_query( # Module exports # --------------------------------------------------------------------------- +def find_similar_chunks( + client, + chunk_id: str, + collection: str, + vec_name: str, + limit: int = 20, + threshold: float | None = None, + path_filter: str | None = None, +) -> List[Dict[str, Any]]: + """Find chunks similar to a given chunk by retrieving its vector and searching. + + Used for multi-hop search expansion - given a high-scoring chunk, + find its nearest neighbors in the vector space. 
+ + Args: + client: QdrantClient instance + chunk_id: ID of the chunk to find similar chunks for + collection: Collection name + vec_name: Vector name to use for similarity + limit: Maximum number of similar chunks to return + threshold: Optional minimum similarity score + path_filter: Optional path prefix to filter results + + Returns: + List of similar chunks with score, content, path, and full payload + """ + try: + points = client.retrieve( + collection_name=collection, + ids=[chunk_id], + with_vectors=[vec_name], + ) + except Exception: + points = client.retrieve( + collection_name=collection, + ids=[chunk_id], + with_vectors=True, + ) + + if not points: + return [] + + point = points[0] + vector = point.vector + if isinstance(vector, dict): + vector = vector.get(vec_name) + if not vector: + return [] + + must_not = [models.HasIdCondition(has_id=[chunk_id])] + must = [] + + if path_filter: + must.append(models.FieldCondition( + key="metadata.path", + match=models.MatchText(text=path_filter), + )) + + flt = models.Filter(must=must, must_not=must_not) if must or must_not else None + + try: + results = client.search( + collection_name=collection, + query_vector=(vec_name, vector), + query_filter=flt, + limit=limit, + score_threshold=threshold, + with_payload=True, + ) + except TypeError: + results = client.search( + collection_name=collection, + query_vector=vector, + query_filter=flt, + limit=limit, + score_threshold=threshold, + with_payload=True, + ) + + output = [] + for r in results: + md = (r.payload or {}).get("metadata", {}) + output.append({ + "chunk_id": str(r.id), + "score": r.score, + "similarity": r.score, + "content": md.get("text", ""), + "path": md.get("path", ""), + "start_line": md.get("start_line"), + "end_line": md.get("end_line"), + "symbol": md.get("symbol"), + "kind": md.get("kind"), + "payload": r.payload, + }) + + return output + + __all__ = [ - # Pool availability flag - "_POOL_AVAILABLE", - # Connection pooling - "get_qdrant_client", - 
"return_qdrant_client", - "pooled_qdrant_client", - # Thread executor - "_QUERY_EXECUTOR", - "_EXECUTOR_LOCK", - "_get_query_executor", - # Point coercion - "_coerce_points", - # Legacy search - "_legacy_vector_search", - # Collection caching - "_ENSURED_COLLECTIONS", - "_get_client_endpoint", - "_ensure_collection", - "clear_ensured_collections", - # Collection name resolution - "_collection", - # Filter sanitization - "_sanitize_filter_obj", # Lexical vector functions "_split_ident_lex", "lex_hash_vector", @@ -877,6 +952,7 @@ def multi_granular_query( "sparse_lex_query", "dense_query", "multi_granular_query", + "find_similar_chunks", # Multi-granular config "MULTI_GRANULAR_VECTORS", "ENTITY_DENSE_NAME", diff --git a/scripts/hybrid/termination.py b/scripts/hybrid/termination.py index 6dcc85e8..1907fa79 100644 --- a/scripts/hybrid/termination.py +++ b/scripts/hybrid/termination.py @@ -58,17 +58,14 @@ def check( """ self.iteration += 1 - # 1. Time limit if self.elapsed() >= self.config.time_limit: logger.debug(f"Termination: time limit {self.config.time_limit}s reached") return True, "time_limit" - # 2. Result limit if len(results) >= self.config.result_limit: logger.debug(f"Termination: result limit {self.config.result_limit} reached") return True, "result_limit" - # 3. Insufficient high-scoring candidates high_scoring = [r for r in results if r.get(score_key, 0) > 0] if len(high_scoring) < self.config.min_candidates_for_expansion: logger.debug( @@ -77,11 +74,9 @@ def check( ) return True, "insufficient_candidates" - # Sort by score descending sorted_results = sorted(results, key=lambda x: -x.get(score_key, 0)) top_n = sorted_results[:self.config.top_n_to_track] - # 4. 
Score degradation - track specific chunks across iterations if self.tracked_chunk_scores: max_drop = 0.0 for chunk_id, prev_score in self.tracked_chunk_scores.items(): @@ -99,14 +94,11 @@ def check( ) return True, "score_degradation" - # Update tracked scores for next iteration self.tracked_chunk_scores.clear() for r in top_n: chunk_id = r.get(id_key) if chunk_id: self.tracked_chunk_scores[chunk_id] = r.get(score_key, 0) - - # 5. Minimum relevance - stop if top-N min score too low if top_n: min_score = min(r.get(score_key, 0) for r in top_n) if min_score < self.config.min_relevance_score: diff --git a/scripts/hybrid_search.py b/scripts/hybrid_search.py index 56805341..d9ff3c53 100644 --- a/scripts/hybrid_search.py +++ b/scripts/hybrid_search.py @@ -245,6 +245,14 @@ _IMPL_INTENT_PATTERNS, ) +# --------------------------------------------------------------------------- +# Elbow detection for adaptive filtering +# --------------------------------------------------------------------------- +from scripts.hybrid.elbow_detection import filter_by_elbow + +# Environment variable for elbow filtering (opt-in) +ELBOW_FILTER_ENABLED = _env_truthy(os.environ.get("HYBRID_ELBOW_FILTER"), False) + # --------------------------------------------------------------------------- # Re-exports from hybrid_expand # --------------------------------------------------------------------------- @@ -3011,6 +3019,24 @@ def _resolve(seg: str) -> list[str]: if why is not None: item["why"] = why items.append(item) + + # Apply elbow detection filter if enabled (adaptive threshold based on score distribution) + if ELBOW_FILTER_ENABLED and items: + original_count = len(items) + # Use rerank_score if available, otherwise use score + items = filter_by_elbow( + items, + score_key="rerank_score", + fallback_score_key="score", + min_results=max(1, limit // 2), # Keep at least half the requested limit + ) + if os.environ.get("DEBUG_HYBRID_SEARCH"): + logger.debug( + f"Elbow filter: {original_count} -> 
{len(items)} results " + f"(threshold based on Kneedle algorithm)" + ) + _dt("elbow_filter") + if _USE_CACHE and cache_key is not None: if UNIFIED_CACHE_AVAILABLE: _RESULTS_CACHE.set(cache_key, items) diff --git a/scripts/ingest/language_mappings/go.py b/scripts/ingest/language_mappings/go.py index 514952b6..c67cda7f 100644 --- a/scripts/ingest/language_mappings/go.py +++ b/scripts/ingest/language_mappings/go.py @@ -146,11 +146,9 @@ def get_query_for_concept(self, concept: ConceptType) -> str | None: elif concept == ConceptType.IMPORT: return """ - (import_declaration - (import_spec - path: (interpreted_string_literal) @import_path - ) @import_spec - ) @definition + (import_spec + path: (interpreted_string_literal) @import_path + ) @import (package_clause (package_identifier) @package_name diff --git a/scripts/ingest/language_mappings/javascript.py b/scripts/ingest/language_mappings/javascript.py index 882eca55..06297a4e 100644 --- a/scripts/ingest/language_mappings/javascript.py +++ b/scripts/ingest/language_mappings/javascript.py @@ -34,7 +34,7 @@ TSNode = None -class JavaScriptMapping(BaseMapping, JSFamilyExtraction): +class JavaScriptMapping(JSFamilyExtraction, BaseMapping): """JavaScript language mapping for tree-sitter parsing. Provides JavaScript-specific queries and extraction methods for: diff --git a/scripts/ingest/language_mappings/jsx.py b/scripts/ingest/language_mappings/jsx.py index bc96f155..bc0fe94e 100644 --- a/scripts/ingest/language_mappings/jsx.py +++ b/scripts/ingest/language_mappings/jsx.py @@ -46,7 +46,7 @@ def __init__(self): """Initialize JSX mapping.""" # Initialize with JSX language instead of JavaScript super().__init__() - self.language = Language.JSX + self.language = "jsx" def get_function_query(self) -> str: """Get tree-sitter query for JSX function definitions including React components. 
diff --git a/scripts/ingest/language_mappings/svelte.py b/scripts/ingest/language_mappings/svelte.py index dc59f63b..8ad0ead8 100644 --- a/scripts/ingest/language_mappings/svelte.py +++ b/scripts/ingest/language_mappings/svelte.py @@ -36,7 +36,7 @@ class SvelteMapping(TypeScriptMapping): def __init__(self) -> None: """Initialize Svelte mapping (delegates to TypeScript for script parsing).""" super().__init__() - self.language = Language.SVELTE # Override to SVELTE + self.language = "svelte" # Override to SVELTE # Section extraction patterns SCRIPT_PATTERN = re.compile( diff --git a/scripts/ingest/language_mappings/tsx.py b/scripts/ingest/language_mappings/tsx.py index 7bc1b4a5..d0fe11e7 100644 --- a/scripts/ingest/language_mappings/tsx.py +++ b/scripts/ingest/language_mappings/tsx.py @@ -43,7 +43,7 @@ class TSXMapping(TypeScriptMapping): def __init__(self): """Initialize TSX mapping.""" # Initialize with TSX language instead of TypeScript - BaseMapping.__init__(self, Language.TSX) + BaseMapping.__init__(self, "tsx") def get_function_query(self) -> str: """Get tree-sitter query for TSX function definitions including typed React components. diff --git a/scripts/ingest/language_mappings/typescript.py b/scripts/ingest/language_mappings/typescript.py index f17ffddf..7acd9920 100644 --- a/scripts/ingest/language_mappings/typescript.py +++ b/scripts/ingest/language_mappings/typescript.py @@ -35,7 +35,7 @@ # TSNode is already defined in TYPE_CHECKING block -class TypeScriptMapping(BaseMapping, JSFamilyExtraction): +class TypeScriptMapping(JSFamilyExtraction, BaseMapping): """TypeScript language mapping for tree-sitter parsing. 
This mapping handles TypeScript-specific AST patterns including: @@ -172,6 +172,12 @@ def get_query_for_concept(self, concept: "ConceptType") -> str | None: # type: return """ (comment) @definition """ + elif concept == ConceptType.IMPORT: + return """ + (import_statement + source: (string) @import_path + ) @import + """ return None # extract_name / extract_metadata / extract_content are inherited diff --git a/scripts/ingest/language_mappings/vue.py b/scripts/ingest/language_mappings/vue.py index 99c672ee..96085a54 100644 --- a/scripts/ingest/language_mappings/vue.py +++ b/scripts/ingest/language_mappings/vue.py @@ -52,7 +52,7 @@ class VueMapping(TypeScriptMapping): def __init__(self) -> None: """Initialize Vue mapping (delegates to TypeScript for script parsing).""" super().__init__() - self.language = Language.VUE # Override to VUE + self.language = "vue" # Override to VUE # Section extraction patterns SCRIPT_PATTERN = re.compile( diff --git a/scripts/upload_service.py b/scripts/upload_service.py index de774366..3c718ffd 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -467,14 +467,28 @@ def validate_bundle_format(bundle_path: Path) -> Dict[str, Any]: if not any(req_file in member for member in members): raise ValueError(f"Missing required file: {req_file}") - # Extract and validate manifest + # Extract and validate manifest - look for root-level manifest.json only + # The bundle structure is {bundle_id}/manifest.json at the root manifest_member = None + manifest_candidates = [m for m in members if m.endswith("manifest.json")] + logger.debug(f"[upload_service] Bundle members: {members[:20]}...") + logger.debug(f"[upload_service] Manifest candidates: {manifest_candidates}") + + # Prefer root-level manifest (exactly one path component before manifest.json) for member in members: - if member.endswith("manifest.json"): + if member.endswith("/manifest.json") and member.count("/") == 1: manifest_member = member break + + # Fallback: if no 
root-level manifest, try any manifest.json (but NOT in files/ subdirs) + if not manifest_member: + for member in members: + if member.endswith("manifest.json") and "/files/" not in member: + manifest_member = member + break if not manifest_member: + logger.error(f"[upload_service] No valid manifest.json found. Candidates were: {manifest_candidates}") raise ValueError("manifest.json not found in bundle") manifest_file = tar.extractfile(manifest_member) @@ -482,11 +496,13 @@ def validate_bundle_format(bundle_path: Path) -> Dict[str, Any]: raise ValueError("Cannot extract manifest.json") manifest = json.loads(manifest_file.read().decode('utf-8')) + logger.debug(f"[upload_service] Parsed manifest keys: {list(manifest.keys())}") # Validate manifest structure required_fields = ["version", "bundle_id", "workspace_path", "created_at", "sequence_number"] for field in required_fields: if field not in manifest: + logger.error(f"[upload_service] Manifest missing field '{field}'. Got keys: {list(manifest.keys())}") raise ValueError(f"Missing required field in manifest: {field}") return manifest diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index aec35330..99eae638 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -79,6 +79,7 @@ def _get_redis_client(): return _REDIS_CLIENT try: import redis # type: ignore + from redis.connection import ConnectionPool except Exception as e: logger.warning(f"Redis backend enabled but redis package not available: {e}") return None @@ -86,21 +87,26 @@ def _get_redis_client(): try: socket_timeout = float(os.environ.get("CODEBASE_STATE_REDIS_SOCKET_TIMEOUT", "2") or 2) connect_timeout = float(os.environ.get("CODEBASE_STATE_REDIS_CONNECT_TIMEOUT", "2") or 2) + max_connections = int(os.environ.get("CODEBASE_STATE_REDIS_MAX_CONNECTIONS", "10") or 10) except Exception: socket_timeout = 2.0 connect_timeout = 2.0 + max_connections = 10 try: client = redis.Redis.from_url( url, decode_responses=True, 
socket_timeout=socket_timeout, socket_connect_timeout=connect_timeout, + max_connections=max_connections, + retry_on_timeout=True, ) try: client.ping() except Exception as e: logger.warning(f"Redis backend enabled but ping failed: {e}") return None + logger.info(f"Redis client initialized (max_connections={max_connections})") _REDIS_CLIENT = client return _REDIS_CLIENT except Exception as e: @@ -108,13 +114,31 @@ def _get_redis_client(): return None +def _redis_retry(fn, retries: int = 2, delay: float = 0.1): + """Retry a Redis operation on transient failures.""" + last_err = None + for attempt in range(retries + 1): + try: + return fn() + except Exception as e: + last_err = e + err_str = str(e).lower() + # Retry on timeout/connection errors, not on logic errors + if any(x in err_str for x in ("timeout", "connection", "reset", "broken pipe")): + if attempt < retries: + time.sleep(delay * (attempt + 1)) + continue + raise + raise last_err # type: ignore + + def _redis_get_json(kind: str, path: Path) -> Optional[Dict[str, Any]]: client = _get_redis_client() if client is None: return None key = _redis_key_for_path(kind, path) try: - raw = client.get(key) + raw = _redis_retry(lambda: client.get(key)) except Exception as e: logger.debug(f"Redis get failed for {key}: {e}") return None @@ -141,7 +165,7 @@ def _redis_set_json(kind: str, path: Path, obj: Dict[str, Any]) -> bool: logger.debug(f"Failed to JSON serialize redis payload for {key}: {e}") return False try: - client.set(key, payload) + _redis_retry(lambda: client.set(key, payload)) return True except Exception as e: logger.debug(f"Redis set failed for {key}: {e}") @@ -154,7 +178,7 @@ def _redis_exists(kind: str, path: Path) -> bool: return False key = _redis_key_for_path(kind, path) try: - return bool(client.exists(key)) + return bool(_redis_retry(lambda: client.exists(key))) except Exception as e: logger.debug(f"Redis exists failed for {key}: {e}") return False @@ -179,7 +203,7 @@ def _redis_get_json_by_key(key: 
str) -> Optional[Dict[str, Any]]: if client is None: return None try: - raw = client.get(key) + raw = _redis_retry(lambda: client.get(key)) except Exception as e: logger.debug(f"Redis get failed for {key}: {e}") return None @@ -201,7 +225,7 @@ def _redis_delete(kind: str, path: Path) -> bool: return False key = _redis_key_for_path(kind, path) try: - client.delete(key) + _redis_retry(lambda: client.delete(key)) return True except Exception as e: logger.debug(f"Redis delete failed for {key}: {e}") @@ -226,19 +250,22 @@ def _redis_lock(kind: str, path: Path): wait_ms = 2000 deadline = time.time() + (wait_ms / 1000.0) acquired = False + attempts = 0 while time.time() < deadline: + attempts += 1 try: if client.set(lock_key, token, nx=True, px=ttl_ms): acquired = True break except Exception as e: - logger.debug(f"Redis lock set failed for {lock_key}: {e}") + logger.warning(f"Redis lock set failed for {lock_key}: {e}") break time.sleep(0.05) if not acquired: - logger.debug(f"Redis lock not acquired for {lock_key}, proceeding without lock") + logger.info(f"Redis lock not acquired for {lock_key} after {attempts} attempts, proceeding without lock") yield return + logger.info(f"Redis lock acquired for {lock_key} (attempts={attempts}, ttl={ttl_ms}ms)") try: yield finally: @@ -249,8 +276,9 @@ def _redis_lock(kind: str, path: Path): lock_key, token, ) + logger.debug(f"Redis lock released for {lock_key}") except Exception as e: - logger.debug(f"Redis lock release failed for {lock_key}: {e}") + logger.warning(f"Redis lock release failed for {lock_key}: {e}") def is_staging_enabled() -> bool: diff --git a/tests/test_ast_analyzer_mappings.py b/tests/test_ast_analyzer_mappings.py new file mode 100644 index 00000000..88017ec2 --- /dev/null +++ b/tests/test_ast_analyzer_mappings.py @@ -0,0 +1,533 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for ast_analyzer language mappings integration. + +Tests that: +1. All 32 language mappings can be instantiated +2. 
class TestLanguageMappingsComplete:
    """Verify all 32 language mappings can be instantiated."""

    def test_all_mappings_instantiate(self):
        """Every registered mapping class should instantiate without error."""
        failed = []
        passed = []

        for lang, mapping_class in _MAPPINGS.items():
            try:
                instance = mapping_class()
                assert instance is not None
                assert hasattr(instance, 'get_query_for_concept')
            except Exception as e:
                failed.append((lang, str(e)))
            else:
                passed.append(lang)

        assert len(failed) == 0, f"Failed mappings: {failed}"
        assert len(passed) == 32, f"Expected 32 mappings, got {len(passed)}"

    def test_all_mappings_have_definition_query(self):
        """All mappings should provide a DEFINITION query."""
        missing = []
        for lang, mapping_class in _MAPPINGS.items():
            try:
                if mapping_class().get_query_for_concept(ConceptType.DEFINITION) is None:
                    missing.append(lang)
            except Exception:
                pass  # Instantiation failures are covered by the test above.

        # Some mappings (text, markdown) may not have DEFINITION queries
        assert len(missing) <= 5, f"Too many missing DEFINITION queries: {missing}"
class TestPythonAnalysis:
    """Test Python code analysis via mappings."""

    @pytest.fixture
    def analyzer(self):
        # Fresh analyzer per test so cached parser state cannot leak between cases.
        return get_ast_analyzer(reset=True)

    def test_python_function_extraction(self, analyzer):
        """Extract Python functions."""
        code = '''
def hello(name: str) -> str:
    """Say hello."""
    return f"Hello {name}"

async def async_hello():
    pass
'''
        analysis = analyzer.analyze_file('/test.py', 'python', code)
        extracted_names = {sym.name for sym in analysis.get('symbols', [])}

        assert 'hello' in extracted_names
        assert 'async_hello' in extracted_names

    def test_python_class_extraction(self, analyzer):
        """Extract Python classes and methods."""
        code = '''
class MyClass:
    """A test class."""

    def __init__(self, value):
        self.value = value

    def get_value(self):
        return self.value
'''
        analysis = analyzer.analyze_file('/test.py', 'python', code)
        extracted = analysis.get('symbols', [])

        assert 'MyClass' in [sym.name for sym in extracted]

        kind_by_name = {sym.name: sym.kind for sym in extracted}
        assert kind_by_name.get('MyClass') == 'class'

    def test_python_imports(self, analyzer):
        """Extract Python imports."""
        code = '''
import os
import sys
from pathlib import Path
from typing import List, Dict
'''
        analysis = analyzer.analyze_file('/test.py', 'python', code)
        found_modules = [imp.module for imp in analysis.get('imports', [])]

        for expected in ('os', 'sys', 'pathlib', 'typing'):
            assert expected in found_modules

    def test_python_calls(self, analyzer):
        """Extract Python function calls."""
        code = '''
def main():
    print("Hello")
    os.path.join("a", "b")
    helper()

def helper():
    pass
'''
        analysis = analyzer.analyze_file('/test.py', 'python', code)
        callees = [call.callee for call in analysis.get('calls', [])]

        assert 'print' in callees
============================================================================= +# Test: JavaScript/TypeScript Analysis +# ============================================================================= + +class TestJavaScriptAnalysis: + """Test JavaScript/TypeScript analysis via mappings.""" + + @pytest.fixture + def analyzer(self): + return get_ast_analyzer(reset=True) + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_javascript_functions(self, analyzer): + """Extract JavaScript functions.""" + code = ''' +function greet(name) { + console.log("Hello " + name); +} + +const arrow = () => { + return 42; +}; +''' + result = analyzer.analyze_file('/test.js', 'javascript', code) + symbols = result.get('symbols', []) + + names = [s.name for s in symbols] + assert 'greet' in names + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_typescript_imports(self, analyzer): + """Extract TypeScript imports.""" + code = ''' +import { useState, useEffect } from "react"; +import axios from "axios"; +import * as fs from "fs"; +''' + result = analyzer.analyze_file('/test.ts', 'typescript', code) + imports = result.get('imports', []) + + modules = [i.module for i in imports] + assert 'react' in modules + assert 'axios' in modules + assert 'fs' in modules + + +# ============================================================================= +# Test: Go Analysis +# ============================================================================= + +class TestGoAnalysis: + """Test Go analysis via mappings.""" + + @pytest.fixture + def analyzer(self): + return get_ast_analyzer(reset=True) + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_go_functions(self, analyzer): + """Extract Go functions.""" + code = ''' +package main + +func main() { + fmt.Println("Hello") +} + +func helper(x int) int { + return x * 2 +} +''' + result = analyzer.analyze_file('/test.go', 'go', code) + symbols = 
result.get('symbols', []) + + names = [s.name for s in symbols] + assert 'main' in names + assert 'helper' in names + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_go_imports(self, analyzer): + """Extract Go imports.""" + code = ''' +package main + +import ( + "fmt" + "os" + "strings" +) + +func main() {} +''' + result = analyzer.analyze_file('/test.go', 'go', code) + imports = result.get('imports', []) + + modules = [i.module for i in imports] + assert 'fmt' in modules + assert 'os' in modules + assert 'strings' in modules + + +# ============================================================================= +# Test: Rust Analysis +# ============================================================================= + +class TestRustAnalysis: + """Test Rust analysis via mappings.""" + + @pytest.fixture + def analyzer(self): + return get_ast_analyzer(reset=True) + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_rust_functions(self, analyzer): + """Extract Rust functions.""" + code = ''' +fn main() { + println!("Hello"); +} + +pub fn helper(x: i32) -> i32 { + x * 2 +} +''' + result = analyzer.analyze_file('/test.rs', 'rust', code) + symbols = result.get('symbols', []) + + names = [s.name for s in symbols] + assert 'main' in names + assert 'helper' in names + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_rust_imports(self, analyzer): + """Extract Rust use statements.""" + code = ''' +use std::io; +use std::collections::HashMap; + +fn main() {} +''' + result = analyzer.analyze_file('/test.rs', 'rust', code) + imports = result.get('imports', []) + + modules = [i.module for i in imports] + assert any('std' in m for m in modules) + + +# ============================================================================= +# Test: Java Analysis +# ============================================================================= + +class TestJavaAnalysis: + """Test Java 
analysis via mappings.""" + + @pytest.fixture + def analyzer(self): + return get_ast_analyzer(reset=True) + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_java_class(self, analyzer): + """Extract Java class and methods.""" + code = ''' +public class Hello { + public static void main(String[] args) { + System.out.println("Hello"); + } + + private int helper(int x) { + return x * 2; + } +} +''' + result = analyzer.analyze_file('/Hello.java', 'java', code) + symbols = result.get('symbols', []) + + names = [s.name for s in symbols] + assert 'Hello' in names + assert 'main' in names + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_java_imports(self, analyzer): + """Extract Java imports.""" + code = ''' +import java.util.List; +import java.util.ArrayList; +import java.io.*; + +public class Test {} +''' + result = analyzer.analyze_file('/Test.java', 'java', code) + imports = result.get('imports', []) + + modules = [i.module for i in imports] + assert 'java.util.List' in modules + assert 'java.util.ArrayList' in modules + + +# ============================================================================= +# Test: C/C++ Analysis +# ============================================================================= + +class TestCppAnalysis: + """Test C/C++ analysis via mappings.""" + + @pytest.fixture + def analyzer(self): + return get_ast_analyzer(reset=True) + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_cpp_functions(self, analyzer): + """Extract C++ functions.""" + code = ''' +#include + +int main() { + std::cout << "Hello" << std::endl; + return 0; +} + +int helper(int x) { + return x * 2; +} +''' + result = analyzer.analyze_file('/test.cpp', 'cpp', code) + symbols = result.get('symbols', []) + + names = [s.name for s in symbols] + assert 'main' in names + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def 
test_cpp_includes(self, analyzer): + """Extract C++ includes.""" + code = ''' +#include +#include +#include "myheader.h" + +int main() { return 0; } +''' + result = analyzer.analyze_file('/test.cpp', 'cpp', code) + imports = result.get('imports', []) + + modules = [i.module for i in imports] + assert 'iostream' in modules + assert 'vector' in modules + + +# ============================================================================= +# Test: Multi-Language Consistency +# ============================================================================= + +class TestMultiLanguageConsistency: + """Test that analysis is consistent across languages.""" + + @pytest.fixture + def analyzer(self): + return get_ast_analyzer(reset=True) + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_all_return_correct_types(self, analyzer): + """All analyses should return correct types.""" + test_cases = [ + ('python', 'def foo(): pass'), + ('javascript', 'function foo() {}'), + ('go', 'package main\nfunc foo() {}'), + ('rust', 'fn foo() {}'), + ('java', 'public class Foo {}'), + ('cpp', 'int foo() { return 0; }'), + ] + + for lang, code in test_cases: + result = analyzer.analyze_file(f'/test.{lang}', lang, code) + + assert isinstance(result, dict), f"{lang}: result should be dict" + assert 'symbols' in result, f"{lang}: should have symbols" + assert 'imports' in result, f"{lang}: should have imports" + assert 'calls' in result, f"{lang}: should have calls" + + for sym in result.get('symbols', []): + assert isinstance(sym, CodeSymbol), f"{lang}: symbols should be CodeSymbol" + for imp in result.get('imports', []): + assert isinstance(imp, ImportReference), f"{lang}: imports should be ImportReference" + for call in result.get('calls', []): + assert isinstance(call, CallReference), f"{lang}: calls should be CallReference" + + @pytest.mark.skipif(not _TS_AVAILABLE, reason="tree-sitter not available") + def test_empty_file_handling(self, analyzer): + 
"""Empty files should not crash.""" + for lang in ['python', 'javascript', 'go', 'rust', 'java']: + result = analyzer.analyze_file(f'/empty.{lang}', lang, '') + assert isinstance(result, dict) + + result = analyzer.analyze_file(f'/whitespace.{lang}', lang, ' \n\n ') + assert isinstance(result, dict) + + +# ============================================================================= +# Test: Fallback Behavior +# ============================================================================= + +class TestFallbackBehavior: + """Test fallback to legacy analyzers.""" + + @pytest.fixture + def analyzer(self): + return get_ast_analyzer(reset=True) + + def test_unsupported_language_fallback(self, analyzer): + """Unsupported languages should fall back gracefully.""" + code = 'some unknown code here' + result = analyzer.analyze_file('/test.xyz', 'unknown_language', code) + + # Should return empty analysis, not crash + assert isinstance(result, dict) + assert 'symbols' in result + assert 'imports' in result + + def test_syntax_error_handling(self, analyzer): + """Syntax errors should be handled gracefully.""" + # Malformed Python + code = 'def foo(\n broken syntax here' + result = analyzer.analyze_file('/test.py', 'python', code) + + # Should not crash + assert isinstance(result, dict) + + +# ============================================================================= +# Test: Symbol Metadata +# ============================================================================= + +class TestSymbolMetadata: + """Test that symbol metadata is extracted correctly.""" + + @pytest.fixture + def analyzer(self): + return get_ast_analyzer(reset=True) + + def test_python_symbol_metadata(self, analyzer): + """Python symbols should have rich metadata.""" + code = ''' +@decorator +def my_function(a: int, b: str) -> bool: + """This is the docstring.""" + return True +''' + result = analyzer.analyze_file('/test.py', 'python', code) + symbols = result.get('symbols', []) + + func = next((s for s in 
symbols if s.name == 'my_function'), None) + assert func is not None + assert func.kind == 'function' + assert func.start_line > 0 + assert func.end_line >= func.start_line + + def test_symbol_line_numbers(self, analyzer): + """Symbol line numbers should be accurate.""" + code = '''# Line 1 +# Line 2 +def foo(): # Line 3 + pass # Line 4 +# Line 5 +def bar(): # Line 6 + pass # Line 7 +''' + result = analyzer.analyze_file('/test.py', 'python', code) + symbols = result.get('symbols', []) + + foo = next((s for s in symbols if s.name == 'foo'), None) + bar = next((s for s in symbols if s.name == 'bar'), None) + + assert foo is not None + assert bar is not None + assert foo.start_line == 3 + assert bar.start_line == 6 + + +# ============================================================================= +# Run tests +# ============================================================================= + +if __name__ == '__main__': + pytest.main([__file__, '-v']) From 636bd522b8dcd90a6368e3f88d7f88ed5f69a7d6 Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 08:55:25 -0500 Subject: [PATCH 05/29] Add postinstall script to set execute permission Added a postinstall script in package.json to ensure bin/ctxce.js is executable after installation. This helps prevent permission issues when running the start script. 
--- ctx-mcp-bridge/package.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ctx-mcp-bridge/package.json b/ctx-mcp-bridge/package.json index 04504c10..a1c4f2fc 100644 --- a/ctx-mcp-bridge/package.json +++ b/ctx-mcp-bridge/package.json @@ -8,7 +8,8 @@ }, "type": "module", "scripts": { - "start": "node bin/ctxce.js" + "start": "node bin/ctxce.js", + "postinstall": "chmod +x bin/ctxce.js 2>/dev/null || true" }, "dependencies": { "@modelcontextprotocol/sdk": "^1.24.3", @@ -20,4 +21,4 @@ "engines": { "node": ">=18.0.0" } -} +} \ No newline at end of file From 4512de238b5e28d6b55c9f6d83861374ac1a29cb Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 09:15:51 -0500 Subject: [PATCH 06/29] Expand __all__ exports in qdrant.py and update shim Added internal and utility symbols to the __all__ list in scripts/hybrid/qdrant.py for more explicit exports. Updated scripts/hybrid_qdrant.py shim to import __all__ for improved backward compatibility. --- scripts/hybrid/qdrant.py | 24 ++++++++++++++++++++++++ scripts/hybrid_qdrant.py | 1 + 2 files changed, 25 insertions(+) diff --git a/scripts/hybrid/qdrant.py b/scripts/hybrid/qdrant.py index 20a77ae8..f14ea549 100644 --- a/scripts/hybrid/qdrant.py +++ b/scripts/hybrid/qdrant.py @@ -943,6 +943,29 @@ def find_similar_chunks( __all__ = [ + # Pool availability flag + "_POOL_AVAILABLE", + # Connection pooling + "get_qdrant_client", + "return_qdrant_client", + "pooled_qdrant_client", + # Thread executor + "_QUERY_EXECUTOR", + "_EXECUTOR_LOCK", + "_get_query_executor", + # Point coercion + "_coerce_points", + # Legacy search + "_legacy_vector_search", + # Collection caching + "_ENSURED_COLLECTIONS", + "_get_client_endpoint", + "_ensure_collection", + "clear_ensured_collections", + # Collection resolution + "_collection", + # Filter sanitization + "_sanitize_filter_obj", # Lexical vector functions "_split_ident_lex", "lex_hash_vector", @@ -966,3 +989,4 @@ def find_similar_chunks( "LEX_SPARSE_MODE", 
"EF_SEARCH", ] + diff --git a/scripts/hybrid_qdrant.py b/scripts/hybrid_qdrant.py index 2498ef8e..789700b9 100644 --- a/scripts/hybrid_qdrant.py +++ b/scripts/hybrid_qdrant.py @@ -1,3 +1,4 @@ #!/usr/bin/env python3 """Shim for backward compatibility. See scripts/hybrid/qdrant.py""" from scripts.hybrid.qdrant import * +from scripts.hybrid.qdrant import __all__ From bbf0a1d02cf0e0e5f51cb6c0d10ae6c253110c0b Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 09:18:12 -0500 Subject: [PATCH 07/29] Fix score handling and concept type casing issues Refactored score extraction in elbow_detection.py to handle missing keys more robustly. Updated CONCEPT_SPECIFICITY in chunk_deduplication.py to use lowercase keys for CAST+ concept types, ensuring consistency with get_chunk_specificity(). --- scripts/hybrid/elbow_detection.py | 7 ++++++- scripts/ingest/chunk_deduplication.py | 12 ++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/hybrid/elbow_detection.py b/scripts/hybrid/elbow_detection.py index 52fda978..31615f61 100644 --- a/scripts/hybrid/elbow_detection.py +++ b/scripts/hybrid/elbow_detection.py @@ -239,9 +239,14 @@ def filter_by_elbow( # Ensure minimum results if len(filtered) < min_results and len(results) >= min_results: # Return top min_results by score + def _get_score(x): + score = x.get(score_key) + if score is None: + score = x.get(fallback_score_key, 0.0) + return float(score) sorted_results = sorted( results, - key=lambda x: float(x.get(score_key) or x.get(fallback_score_key, 0.0)), + key=_get_score, reverse=True ) return sorted_results[:min_results] diff --git a/scripts/ingest/chunk_deduplication.py b/scripts/ingest/chunk_deduplication.py index 4971ae49..0798261d 100644 --- a/scripts/ingest/chunk_deduplication.py +++ b/scripts/ingest/chunk_deduplication.py @@ -34,12 +34,12 @@ "block": 1, "array": 1, "structure": 0, - # CAST+ concept types (from concept_extractor) - "DEFINITION": 4, - "IMPORT": 3, - "COMMENT": 2, 
- "BLOCK": 1, - "STRUCTURE": 0, + # CAST+ concept types (from concept_extractor) - lowercase to match get_chunk_specificity() + "definition": 4, + "import": 3, + "comment": 2, + "block": 1, + # Note: "structure" already defined above } From fa78782e597c7ee39c7f457b700ed37a41f31947 Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 09:30:23 -0500 Subject: [PATCH 08/29] Add tests for chunking, deduplication, elbow, and termination Introduce comprehensive test suites for CAST+ chunker, chunk deduplication core, elbow detection, and smart termination logic. These tests cover configuration defaults, algorithm correctness, edge cases, and expected behaviors for each module. --- tests/test_cast_chunker.py | 182 +++++++++++++++++++++++ tests/test_chunk_deduplication_core.py | 197 +++++++++++++++++++++++++ tests/test_elbow_detection.py | 170 +++++++++++++++++++++ tests/test_termination.py | 187 +++++++++++++++++++++++ 4 files changed, 736 insertions(+) create mode 100644 tests/test_cast_chunker.py create mode 100644 tests/test_chunk_deduplication_core.py create mode 100644 tests/test_elbow_detection.py create mode 100644 tests/test_termination.py diff --git a/tests/test_cast_chunker.py b/tests/test_cast_chunker.py new file mode 100644 index 00000000..cfebccb2 --- /dev/null +++ b/tests/test_cast_chunker.py @@ -0,0 +1,182 @@ +"""Tests for scripts/ingest/cast_chunker.py - CAST+ Hybrid Chunker.""" + +import pytest +from scripts.ingest.cast_chunker import ( + CASTPlusConfig, + CASTPlusChunker, + ConceptType, + SemanticChunk, + ChunkResult, + COMPATIBLE_PAIRS, + chunk_cast_plus, + get_cast_chunker, +) + + +class TestCASTPlusConfig: + """Tests for CASTPlusConfig dataclass.""" + + def test_default_values(self): + """Test default configuration values.""" + config = CASTPlusConfig() + assert config.max_chunk_size == 1200 + assert config.min_chunk_size == 50 + assert config.safe_token_limit == 6000 + assert config.merge_threshold == 0.8 + assert config.deduplicate is 
True + + def test_custom_values(self): + """Test custom configuration values.""" + config = CASTPlusConfig( + max_chunk_size=2000, + min_chunk_size=100, + deduplicate=False, + ) + assert config.max_chunk_size == 2000 + assert config.min_chunk_size == 100 + assert config.deduplicate is False + + +class TestConceptType: + """Tests for ConceptType enum.""" + + def test_concept_values(self): + """Test concept type values.""" + assert ConceptType.DEFINITION.value == "definition" + assert ConceptType.BLOCK.value == "block" + assert ConceptType.COMMENT.value == "comment" + assert ConceptType.IMPORT.value == "import" + assert ConceptType.STRUCTURE.value == "structure" + + +class TestCompatiblePairs: + """Tests for compatible concept pairs.""" + + def test_comment_definition_compatible(self): + """Test that COMMENT and DEFINITION are compatible.""" + assert (ConceptType.COMMENT, ConceptType.DEFINITION) in COMPATIBLE_PAIRS + assert (ConceptType.DEFINITION, ConceptType.COMMENT) in COMPATIBLE_PAIRS + + def test_block_definition_not_compatible(self): + """Test that BLOCK and DEFINITION are NOT compatible.""" + assert (ConceptType.BLOCK, ConceptType.DEFINITION) not in COMPATIBLE_PAIRS + + +class TestSemanticChunk: + """Tests for SemanticChunk dataclass.""" + + def test_post_init_computes_metrics(self): + """Test that __post_init__ computes metrics.""" + chunk = SemanticChunk( + concept=ConceptType.DEFINITION, + name="foo", + content="def foo(): pass", + start_line=1, + end_line=1, + ) + assert chunk.non_whitespace_chars > 0 + assert chunk.estimated_tokens > 0 + assert 0.0 <= chunk.density_score <= 1.0 + + def test_empty_content_density(self): + """Test density calculation with empty content.""" + chunk = SemanticChunk( + concept=ConceptType.DEFINITION, + name="empty", + content="", + start_line=1, + end_line=1, + ) + assert chunk.density_score == 0.0 + + +class TestCASTPlusChunker: + """Tests for CASTPlusChunker class.""" + + def test_initialization(self): + """Test chunker 
initialization.""" + chunker = CASTPlusChunker() + assert chunker.config is not None + assert isinstance(chunker.config, CASTPlusConfig) + + def test_custom_config(self): + """Test chunker with custom config.""" + config = CASTPlusConfig(max_chunk_size=500) + chunker = CASTPlusChunker(config) + assert chunker.config.max_chunk_size == 500 + + def test_chunk_simple_function(self): + """Test chunking a simple function.""" + chunker = CASTPlusChunker() + content = '''def hello(): + """Say hello.""" + print("Hello, World!") +''' + results = chunker.chunk(content, "python") + assert len(results) >= 1 + assert all(isinstance(r, ChunkResult) for r in results) + + def test_chunk_to_dicts(self): + """Test chunk_to_dicts returns dictionaries.""" + chunker = CASTPlusChunker() + content = "def foo(): pass" + results = chunker.chunk_to_dicts(content, "python") + assert all(isinstance(r, dict) for r in results) + if results: + assert "text" in results[0] + # Uses 'start' and 'end' keys, not 'start_line' + assert "start" in results[0] or "start_line" in results[0] + + def test_deduplication_enabled(self): + """Test that deduplication removes duplicates.""" + config = CASTPlusConfig(deduplicate=True) + chunker = CASTPlusChunker(config) + # Content with duplicate blocks + content = '''x = 1 +x = 1 +''' + results = chunker.chunk(content, "python") + # Should have fewer chunks due to dedup + assert len(results) >= 1 + + def test_deduplication_disabled(self): + """Test that deduplication can be disabled.""" + config = CASTPlusConfig(deduplicate=False) + chunker = CASTPlusChunker(config) + content = "x = 1" + results = chunker.chunk(content, "python") + assert len(results) >= 1 + + +class TestChunkCastPlus: + """Tests for chunk_cast_plus convenience function.""" + + def test_basic_usage(self): + """Test basic usage of chunk_cast_plus.""" + content = "def foo(): pass" + results = chunk_cast_plus(content, "python") + assert isinstance(results, list) + assert all(isinstance(r, dict) for r 
in results) + + def test_with_custom_config(self): + """Test with custom config.""" + config = CASTPlusConfig(max_chunk_size=500) + content = "def foo(): pass" + results = chunk_cast_plus(content, "python", config=config) + assert isinstance(results, list) + + +class TestGetCastChunker: + """Tests for get_cast_chunker factory function.""" + + def test_returns_chunker(self): + """Test that get_cast_chunker returns a chunker.""" + chunker = get_cast_chunker() + assert isinstance(chunker, CASTPlusChunker) + + def test_with_custom_config(self): + """Test with custom config returns new instance.""" + config = CASTPlusConfig(max_chunk_size=999) + chunker = get_cast_chunker(config) + assert chunker.config.max_chunk_size == 999 + diff --git a/tests/test_chunk_deduplication_core.py b/tests/test_chunk_deduplication_core.py new file mode 100644 index 00000000..a3a8c777 --- /dev/null +++ b/tests/test_chunk_deduplication_core.py @@ -0,0 +1,197 @@ +"""Tests for scripts/ingest/chunk_deduplication.py - O(n log n) deduplication.""" + +import pytest +from scripts.ingest.chunk_deduplication import ( + normalize_content, + get_chunk_specificity, + deduplicate_chunks, + deduplicate_semantic_chunks, + CONCEPT_SPECIFICITY, +) + + +class TestNormalizeContent: + """Tests for content normalization.""" + + def test_strips_whitespace(self): + """Test that leading/trailing whitespace is stripped.""" + assert normalize_content(" hello ") == "hello" + assert normalize_content("\n\nhello\n\n") == "hello" + + def test_normalizes_line_endings(self): + """Test that different line endings are normalized.""" + assert normalize_content("a\r\nb") == "a\nb" + assert normalize_content("a\rb") == "a\nb" + assert normalize_content("a\r\n\rb") == "a\n\nb" + + def test_empty_string(self): + """Test empty string handling.""" + assert normalize_content("") == "" + assert normalize_content(" ") == "" + + +class TestGetChunkSpecificity: + """Tests for chunk specificity ranking.""" + + def 
test_function_has_high_specificity(self): + """Test that function chunks have high specificity.""" + chunk = {"chunk_type": "function"} + assert get_chunk_specificity(chunk) == 4 + + def test_block_has_low_specificity(self): + """Test that block chunks have low specificity.""" + chunk = {"chunk_type": "block"} + assert get_chunk_specificity(chunk) == 1 + + def test_definition_concept_type(self): + """Test DEFINITION concept type (from CAST+).""" + chunk = {"chunk_type": "DEFINITION"} + assert get_chunk_specificity(chunk) == 4 + + def test_unknown_type_returns_negative(self): + """Test unknown type returns -1.""" + chunk = {"chunk_type": "unknown_type"} + assert get_chunk_specificity(chunk) == -1 + + def test_concept_key_fallback(self): + """Test fallback to 'concept' key.""" + chunk = {"concept": "function"} + assert get_chunk_specificity(chunk) == 4 + + def test_type_key_fallback(self): + """Test fallback to 'type' key.""" + chunk = {"type": "class"} + assert get_chunk_specificity(chunk) == 4 + + def test_enum_value_handling(self): + """Test handling of enum-like objects with .value.""" + from enum import Enum + class MockConcept(Enum): + DEFINITION = "definition" + chunk = {"chunk_type": MockConcept.DEFINITION} + assert get_chunk_specificity(chunk) == 4 + + +class TestDeduplicateChunks: + """Tests for deduplicate_chunks function.""" + + def test_empty_input(self): + """Test empty input returns empty list.""" + assert deduplicate_chunks([]) == [] + + def test_no_duplicates(self): + """Test chunks without duplicates are preserved.""" + chunks = [ + {"code": "def foo(): pass", "chunk_type": "function"}, + {"code": "def bar(): pass", "chunk_type": "function"}, + ] + result = deduplicate_chunks(chunks) + assert len(result) == 2 + + def test_exact_duplicates_removed(self): + """Test exact duplicate content is removed.""" + chunks = [ + {"code": "def foo(): pass", "chunk_type": "function"}, + {"code": "def foo(): pass", "chunk_type": "function"}, + ] + result = 
deduplicate_chunks(chunks) + assert len(result) == 1 + + def test_keeps_higher_specificity(self): + """Test that higher specificity chunk is kept on duplicate.""" + chunks = [ + {"code": "x = 1", "chunk_type": "block"}, # specificity 1 + {"code": "x = 1", "chunk_type": "function"}, # specificity 4 + ] + result = deduplicate_chunks(chunks) + assert len(result) == 1 + assert result[0]["chunk_type"] == "function" + + def test_vue_language_exemption(self): + """Test Vue language is exempt from deduplication.""" + chunks = [ + {"code": "same content", "chunk_type": "block"}, + {"code": "same content", "chunk_type": "block"}, + ] + result = deduplicate_chunks(chunks, language="vue") + assert len(result) == 2 + + def test_haskell_language_exemption(self): + """Test Haskell language is exempt from deduplication.""" + chunks = [ + {"code": "same content", "chunk_type": "block"}, + {"code": "same content", "chunk_type": "block"}, + ] + result = deduplicate_chunks(chunks, language="haskell") + assert len(result) == 2 + + def test_substring_removal(self): + """Test that block substrings of definitions are removed.""" + chunks = [ + { + "code": "def foo():\n x = 1\n return x", + "chunk_type": "function", + "start_line": 1, + "end_line": 3, + }, + { + "code": "x = 1", + "chunk_type": "block", + "start_line": 2, + "end_line": 2, + }, + ] + result = deduplicate_chunks(chunks) + # Block should be removed as it's a substring of the function + assert len(result) == 1 + assert result[0]["chunk_type"] == "function" + + def test_custom_content_key(self): + """Test custom content key.""" + chunks = [ + {"text": "same", "chunk_type": "block"}, + {"text": "same", "chunk_type": "block"}, + ] + result = deduplicate_chunks(chunks, content_key="text") + assert len(result) == 1 + + def test_whitespace_normalization_in_dedup(self): + """Test that whitespace differences don't prevent dedup.""" + chunks = [ + {"code": "def foo(): pass", "chunk_type": "function"}, + {"code": "def foo(): pass ", 
"chunk_type": "function"}, # trailing space + ] + result = deduplicate_chunks(chunks) + assert len(result) == 1 + + +class TestDeduplicateSemanticChunks: + """Tests for deduplicate_semantic_chunks function.""" + + def test_empty_input(self): + """Test empty input returns empty list.""" + assert deduplicate_semantic_chunks([]) == [] + + def test_preserves_original_objects(self): + """Test that original objects are returned, not copies.""" + from dataclasses import dataclass + from enum import Enum + + class ConceptType(Enum): + DEFINITION = "definition" + + @dataclass + class MockChunk: + content: str + start_line: int + end_line: int + concept: ConceptType + + chunk1 = MockChunk("def foo(): pass", 1, 1, ConceptType.DEFINITION) + chunk2 = MockChunk("def bar(): pass", 2, 2, ConceptType.DEFINITION) + + result = deduplicate_semantic_chunks([chunk1, chunk2]) + assert len(result) == 2 + assert chunk1 in result + assert chunk2 in result + diff --git a/tests/test_elbow_detection.py b/tests/test_elbow_detection.py new file mode 100644 index 00000000..2296853b --- /dev/null +++ b/tests/test_elbow_detection.py @@ -0,0 +1,170 @@ +"""Tests for scripts/hybrid/elbow_detection.py - Kneedle algorithm and adaptive thresholds.""" + +import pytest +from scripts.hybrid.elbow_detection import ( + find_elbow_kneedle, + compute_elbow_threshold, + filter_by_elbow, +) + + +class TestFindElbowKneedle: + """Tests for the Kneedle algorithm implementation.""" + + def test_clear_elbow_detected(self): + """Test detection of a clear elbow point.""" + # Clear drop after index 2 + scores = [0.95, 0.92, 0.88, 0.45, 0.42, 0.40] + elbow_idx = find_elbow_kneedle(scores) + assert elbow_idx is not None + # Elbow should be around the drop point + assert 1 <= elbow_idx <= 3 + + def test_too_few_points_returns_none(self): + """Test that fewer than 3 points returns None.""" + assert find_elbow_kneedle([0.9]) is None + assert find_elbow_kneedle([0.9, 0.8]) is None + assert find_elbow_kneedle([]) is None + + 
def test_identical_scores_returns_none(self): + """Test that identical scores return None (no elbow).""" + scores = [0.5, 0.5, 0.5, 0.5, 0.5] + assert find_elbow_kneedle(scores) is None + + def test_linear_decrease_minimal_elbow(self): + """Test linear decrease - may or may not detect elbow.""" + scores = [1.0, 0.8, 0.6, 0.4, 0.2] + # Linear decrease has no clear elbow + result = find_elbow_kneedle(scores) + # Should return None or a middle index + assert result is None or 0 <= result < len(scores) + + def test_sharp_drop_at_end(self): + """Test sharp drop at the end of the curve.""" + scores = [0.95, 0.94, 0.93, 0.92, 0.10] + elbow_idx = find_elbow_kneedle(scores) + assert elbow_idx is not None + # Elbow should be near the drop + assert elbow_idx >= 2 + + def test_gradual_then_sharp_drop(self): + """Test gradual decrease followed by sharp drop.""" + scores = [0.99, 0.98, 0.97, 0.96, 0.30, 0.29, 0.28] + elbow_idx = find_elbow_kneedle(scores) + assert elbow_idx is not None + # Elbow should be around index 3-4 + assert 2 <= elbow_idx <= 5 + + +class TestComputeElbowThreshold: + """Tests for compute_elbow_threshold function.""" + + def test_empty_input_returns_default(self): + """Test empty input returns default threshold.""" + assert compute_elbow_threshold([]) == 0.5 + # Single dict with no score extracts 0.0, which is a valid score + result = compute_elbow_threshold([{}]) + assert 0.0 <= result <= 0.5 + + def test_with_raw_scores(self): + """Test with raw float scores.""" + scores = [0.95, 0.88, 0.45, 0.42] + threshold = compute_elbow_threshold(scores) + assert 0.0 <= threshold <= 1.0 + # Threshold should be around the elbow + assert threshold >= 0.40 + + def test_with_dict_chunks(self): + """Test with dict chunks containing score key.""" + chunks = [ + {"score": 0.95}, + {"score": 0.88}, + {"score": 0.45}, + {"score": 0.42}, + ] + threshold = compute_elbow_threshold(chunks) + assert 0.0 <= threshold <= 1.0 + + def test_with_fallback_score_key(self): + """Test 
fallback to rerank_score when score is missing.""" + chunks = [ + {"rerank_score": 0.95}, + {"rerank_score": 0.45}, + {"rerank_score": 0.20}, + ] + threshold = compute_elbow_threshold(chunks, score_key="score") + assert 0.0 <= threshold <= 1.0 + + def test_zero_scores_handled_correctly(self): + """Test that 0.0 scores are handled correctly (not treated as missing).""" + chunks = [ + {"score": 0.95}, + {"score": 0.0}, # Real zero score + {"score": 0.0}, + ] + threshold = compute_elbow_threshold(chunks) + # Should not crash and should return valid threshold + assert 0.0 <= threshold <= 1.0 + + def test_custom_score_key(self): + """Test with custom score key.""" + chunks = [ + {"my_score": 0.9}, + {"my_score": 0.5}, + {"my_score": 0.1}, + ] + threshold = compute_elbow_threshold(chunks, score_key="my_score") + assert 0.0 <= threshold <= 1.0 + + +class TestFilterByElbow: + """Tests for filter_by_elbow function.""" + + def test_empty_results(self): + """Test empty input returns empty list.""" + assert filter_by_elbow([]) == [] + + def test_filters_below_threshold(self): + """Test that results below threshold are filtered.""" + results = [ + {"id": 1, "score": 0.95}, + {"id": 2, "score": 0.90}, + {"id": 3, "score": 0.30}, # Below elbow + {"id": 4, "score": 0.25}, # Below elbow + ] + filtered = filter_by_elbow(results) + # Should keep high-scoring results + assert len(filtered) >= 1 + assert all(r["score"] >= 0.25 for r in filtered) + + def test_min_results_guaranteed(self): + """Test that min_results are always returned.""" + results = [ + {"id": 1, "score": 0.95}, + {"id": 2, "score": 0.10}, + {"id": 3, "score": 0.05}, + ] + filtered = filter_by_elbow(results, min_results=2) + assert len(filtered) >= 2 + + def test_zero_score_not_treated_as_missing(self): + """Test that 0.0 score is not treated as missing.""" + results = [ + {"id": 1, "score": 0.9}, + {"id": 2, "score": 0.0}, # Real zero, not missing + {"id": 3, "score": 0.0}, + ] + # Should not crash + filtered = 
filter_by_elbow(results) + assert isinstance(filtered, list) + + def test_fallback_score_key_used(self): + """Test that fallback score key is used when primary is missing.""" + results = [ + {"id": 1, "rerank_score": 0.95}, + {"id": 2, "rerank_score": 0.50}, + {"id": 3, "rerank_score": 0.10}, + ] + filtered = filter_by_elbow(results, score_key="score", fallback_score_key="rerank_score") + assert len(filtered) >= 1 + diff --git a/tests/test_termination.py b/tests/test_termination.py new file mode 100644 index 00000000..93f38903 --- /dev/null +++ b/tests/test_termination.py @@ -0,0 +1,187 @@ +"""Tests for scripts/hybrid/termination.py - Smart termination conditions.""" + +import time +import pytest +from scripts.hybrid.termination import TerminationConfig, TerminationChecker + + +class TestTerminationConfig: + """Tests for TerminationConfig dataclass.""" + + def test_default_values(self): + """Test default configuration values.""" + config = TerminationConfig() + assert config.time_limit == 5.0 + assert config.result_limit == 500 + assert config.min_candidates_for_expansion == 5 + assert config.score_degradation_threshold == 0.15 + assert config.min_relevance_score == 0.3 + assert config.top_n_to_track == 5 + + def test_custom_values(self): + """Test custom configuration values.""" + config = TerminationConfig( + time_limit=10.0, + result_limit=1000, + min_candidates_for_expansion=10, + ) + assert config.time_limit == 10.0 + assert config.result_limit == 1000 + assert config.min_candidates_for_expansion == 10 + + +class TestTerminationChecker: + """Tests for TerminationChecker class.""" + + def test_initialization(self): + """Test checker initialization.""" + checker = TerminationChecker() + assert checker.iteration == 0 + assert checker.tracked_chunk_scores == {} + assert checker.elapsed() >= 0 + + def test_reset(self): + """Test reset clears state.""" + checker = TerminationChecker() + checker.iteration = 5 + checker.tracked_chunk_scores = {"a": 0.9} + 
checker.reset() + assert checker.iteration == 0 + assert checker.tracked_chunk_scores == {} + + def test_time_limit_termination(self): + """Test termination on time limit.""" + config = TerminationConfig(time_limit=0.01) # 10ms + checker = TerminationChecker(config) + + # Wait for time limit + time.sleep(0.02) + + results = [{"chunk_id": "a", "score": 0.9} for _ in range(10)] + should_terminate, reason = checker.check(results) + + assert should_terminate is True + assert reason == "time_limit" + + def test_result_limit_termination(self): + """Test termination on result limit.""" + config = TerminationConfig(result_limit=5) + checker = TerminationChecker(config) + + results = [{"chunk_id": f"c{i}", "score": 0.9} for i in range(10)] + should_terminate, reason = checker.check(results) + + assert should_terminate is True + assert reason == "result_limit" + + def test_insufficient_candidates_termination(self): + """Test termination when not enough high-scoring candidates.""" + config = TerminationConfig(min_candidates_for_expansion=5) + checker = TerminationChecker(config) + + # Only 3 results with positive scores + results = [ + {"chunk_id": "a", "score": 0.9}, + {"chunk_id": "b", "score": 0.8}, + {"chunk_id": "c", "score": 0.7}, + ] + should_terminate, reason = checker.check(results) + + assert should_terminate is True + assert reason == "insufficient_candidates" + + def test_score_degradation_termination(self): + """Test termination on score degradation.""" + config = TerminationConfig( + score_degradation_threshold=0.1, + top_n_to_track=3, + min_candidates_for_expansion=1, + min_relevance_score=0.0, + ) + checker = TerminationChecker(config) + + # First iteration - establish baseline + results1 = [ + {"chunk_id": "a", "score": 0.9}, + {"chunk_id": "b", "score": 0.8}, + {"chunk_id": "c", "score": 0.7}, + ] + should_terminate, reason = checker.check(results1) + assert should_terminate is False + + # Second iteration - scores dropped significantly + results2 = [ + 
{"chunk_id": "a", "score": 0.7}, # Dropped 0.2 + {"chunk_id": "b", "score": 0.6}, + {"chunk_id": "c", "score": 0.5}, + ] + should_terminate, reason = checker.check(results2) + + assert should_terminate is True + assert reason == "score_degradation" + + def test_min_relevance_termination(self): + """Test termination when min relevance score is too low.""" + config = TerminationConfig( + min_relevance_score=0.5, + top_n_to_track=3, + min_candidates_for_expansion=1, + ) + checker = TerminationChecker(config) + + # Results with low minimum score in top-N + results = [ + {"chunk_id": "a", "score": 0.9}, + {"chunk_id": "b", "score": 0.6}, + {"chunk_id": "c", "score": 0.3}, # Below min_relevance_score + ] + should_terminate, reason = checker.check(results) + + assert should_terminate is True + assert reason == "min_relevance" + + def test_no_termination_when_conditions_not_met(self): + """Test that checker continues when no conditions are met.""" + config = TerminationConfig( + time_limit=60.0, + result_limit=1000, + min_candidates_for_expansion=3, + min_relevance_score=0.3, + ) + checker = TerminationChecker(config) + + results = [ + {"chunk_id": "a", "score": 0.9}, + {"chunk_id": "b", "score": 0.8}, + {"chunk_id": "c", "score": 0.7}, + {"chunk_id": "d", "score": 0.6}, + {"chunk_id": "e", "score": 0.5}, + ] + should_terminate, reason = checker.check(results) + + assert should_terminate is False + assert reason == "" + + def test_get_stats(self): + """Test get_stats returns correct information.""" + checker = TerminationChecker() + results = [{"chunk_id": "a", "score": 0.9} for _ in range(10)] + checker.check(results) + checker.check(results) + + stats = checker.get_stats() + assert stats["iterations"] == 2 + assert "elapsed_seconds" in stats + assert stats["elapsed_seconds"] >= 0 + + def test_iteration_counter_increments(self): + """Test that iteration counter increments on each check.""" + checker = TerminationChecker() + results = [{"chunk_id": f"c{i}", "score": 0.9} 
for i in range(10)] + + checker.check(results) + assert checker.iteration == 1 + + checker.check(results) + assert checker.iteration == 2 + From 558b08d5ff8cd11823f7a9004c3839d1e609b405 Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 09:35:29 -0500 Subject: [PATCH 09/29] Update ci.yml --- .github/workflows/ci.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 47b85ae8..a3b785ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -65,14 +65,18 @@ jobs: python -c "from fastembed import TextEmbedding; m = TextEmbedding(model_name='BAAI/bge-base-en-v1.5'); list(m.embed(['test']))" - name: Run tests - run: pytest -q - + run: pytest -q --junitxml=test-results.xml + - name: Upload test results uses: actions/upload-artifact@v4 if: always() with: name: test-results - path: | - .pytest_cache/ - test-results.xml + path: test-results.xml retention-days: 7 + + - name: Test Summary + uses: test-summary/action@v2 + if: always() + with: + paths: test-results.xml From ca0f533d4831215da1e018c94ce982570e7522d6 Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 09:42:26 -0500 Subject: [PATCH 10/29] Handle TOON-formatted results in search command Added logic to decode TOON-formatted result strings in the search command. If decoding fails, an error message is shown with a hint to install the required package or disable TOON. This improves compatibility with different result formats returned by the server. 
--- scripts/ctx_cli/commands/search.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/scripts/ctx_cli/commands/search.py b/scripts/ctx_cli/commands/search.py index 38eafd04..0f63975a 100755 --- a/scripts/ctx_cli/commands/search.py +++ b/scripts/ctx_cli/commands/search.py @@ -299,9 +299,27 @@ def search_command( print(json.dumps(data, indent=2)) return 0 - # Extract results + # Extract results - handle TOON format if present results = data.get("results", []) - total = data.get("total", len(results)) + + # If results is a TOON string, try to decode it or use results_json fallback + if isinstance(results, str): + # First try results_json (preserved by server for internal callers) + if "results_json" in data and isinstance(data["results_json"], list): + results = data["results_json"] + else: + # Try to decode TOON string + try: + from toon import decode as toon_decode + decoded = toon_decode(results) + results = decoded.get("results", []) + except Exception: + # If TOON decode fails, return error + print("Error: Received TOON-formatted results but could not decode", file=sys.stderr) + print("Hint: Install toon package or set TOON_ENABLED=0", file=sys.stderr) + return 1 + + total = data.get("total", len(results) if isinstance(results, list) else 0) # Handle no results if not results: From d4e36857217befde4b9422a955208449534682d0 Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 09:55:47 -0500 Subject: [PATCH 11/29] Make symbol graph edges always enabled and update elbow detection Symbol graph edges (Qdrant flat graph) are now always enabled and no longer configurable via the INDEX_GRAPH_EDGES env var; related config, comments, and tests updated to reflect unconditional activation. Elbow detection utilities have been refactored to use curvature-based, changepoint, and Kneedle methods for adaptive thresholding, with improved statistical termination logic in iterative search. 
Specificity scoring for chunk deduplication now uses a weighted formula for more granular ranking. --- docker-compose.yml | 8 +- scripts/ctx_cli/commands/init.py | 3 - scripts/hybrid/elbow_detection.py | 431 ++++++++++++++++---------- scripts/hybrid/termination.py | 260 +++++++++++++--- scripts/indexing_admin.py | 2 +- scripts/ingest/chunk_deduplication.py | 109 +++++-- scripts/ingest/pipeline.py | 336 ++++++++++---------- scripts/workspace_state.py | 2 +- tests/test_workspace_state.py | 47 ++- 9 files changed, 762 insertions(+), 436 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index ccb219b5..e15f099b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -453,8 +453,7 @@ services: - LEX_SPARSE_NAME=${LEX_SPARSE_NAME:-} # Pattern vectors for structural code similarity - PATTERN_VECTORS=${PATTERN_VECTORS:-} - # Graph edges for symbol relationships - - INDEX_GRAPH_EDGES=${INDEX_GRAPH_EDGES:-1} + # Graph edges for symbol relationships (always on) - INDEX_GRAPH_EDGES_MODE=${INDEX_GRAPH_EDGES_MODE:-symbol} volumes: - workspace_pvc:/work:rw @@ -514,11 +513,10 @@ services: - LEX_SPARSE_NAME=${LEX_SPARSE_NAME:-} # Pattern vectors for structural code similarity - PATTERN_VECTORS=${PATTERN_VECTORS:-} - # Graph edges for symbol relationships - - INDEX_GRAPH_EDGES=${INDEX_GRAPH_EDGES:-1} + # Graph edges for symbol relationships (always on - Qdrant flat graph) - INDEX_GRAPH_EDGES_MODE=${INDEX_GRAPH_EDGES_MODE:-symbol} - GRAPH_BACKFILL_ENABLED=${GRAPH_BACKFILL_ENABLED:-1} - # Neo4j graph backend (when set, edges go to Neo4j instead of Qdrant _graph collection) + # Neo4j graph backend (optional - takes precedence over Qdrant flat graph) - NEO4J_GRAPH=${NEO4J_GRAPH:-} volumes: - workspace_pvc:/work:rw diff --git a/scripts/ctx_cli/commands/init.py b/scripts/ctx_cli/commands/init.py index 473f2c2a..c3d7d7fa 100644 --- a/scripts/ctx_cli/commands/init.py +++ b/scripts/ctx_cli/commands/init.py @@ -651,9 +651,6 @@ def configure_env_file(skip_if_exists: bool = 
False) -> bool: NEO4J_URI=bolt://neo4j:7687 NEO4J_USER=neo4j NEO4J_PASSWORD=contextengine - -# Symbol graph -SYMBOL_GRAPH_ENABLED=1 """ # Add API keys if configured diff --git a/scripts/hybrid/elbow_detection.py b/scripts/hybrid/elbow_detection.py index 31615f61..cadc8d84 100644 --- a/scripts/hybrid/elbow_detection.py +++ b/scripts/hybrid/elbow_detection.py @@ -1,118 +1,221 @@ -"""Elbow detection utilities for adaptive threshold computation. +"""Elbow detection for adaptive threshold computation. -Implements the Kneedle algorithm (Satopaa et al. 2011) for finding elbow points -in score curves. Used for adaptive threshold computation in hybrid search. +Mathematical approaches: +1. Curvature-based detection - finds point of maximum bending (2nd derivative) +2. Multi-changepoint detection - finds multiple quality tiers via recursive segmentation +3. Kneedle fallback - perpendicular distance method for edge cases -Ported from ChunkHound to Context-Engine. - -Usage: - from scripts.hybrid.elbow_detection import compute_elbow_threshold, find_elbow_kneedle - - # With raw scores - scores = [0.95, 0.88, 0.45, 0.42, 0.40] - threshold = compute_elbow_threshold(scores) - - # With search results (dicts with 'score' or 'rerank_score' keys) - results = [{"score": 0.95}, {"score": 0.88}, {"score": 0.45}] - threshold = compute_elbow_threshold(results) - - # Filter results by elbow threshold - filtered = [r for r in results if r.get("score", 0) >= threshold] +Curvature formula: κ(i) = |f''(i)| / (1 + f'(i)²)^(3/2) +where f'(i) and f''(i) are discrete derivatives using central differences. """ from __future__ import annotations import logging -from typing import Sequence, Union +from typing import Sequence, Union, List, Tuple import numpy as np logger = logging.getLogger(__name__) -def find_elbow_kneedle(sorted_scores: Sequence[float]) -> int | None: - """Find elbow point in score curve using simplified Kneedle algorithm. 
+def _discrete_curvature(y: np.ndarray) -> np.ndarray: + """Compute discrete curvature using central differences. + + κ(i) = |y''(i)| / (1 + y'(i)²)^(3/2) + + First derivative: y'(i) = (y[i+1] - y[i-1]) / 2 + Second derivative: y''(i) = y[i+1] - 2*y[i] + y[i-1] + """ + n = len(y) + if n < 3: + return np.zeros(n) + + curvature = np.zeros(n) + + for i in range(1, n - 1): + y_prime = (y[i + 1] - y[i - 1]) / 2.0 + y_double_prime = y[i + 1] - 2.0 * y[i] + y[i - 1] + + denominator = (1.0 + y_prime ** 2) ** 1.5 + if denominator > 1e-10: + curvature[i] = abs(y_double_prime) / denominator + + return curvature - Implementation based on Kneedle algorithm (Satopaa et al. 2011): - 1. Normalize scores to [0,1] - 2. Draw line from first to last point - 3. Find point with maximum perpendicular distance to line - 4. That's the elbow/knee point - Args: - sorted_scores: Scores sorted DESCENDING (highest to lowest) +def _segment_cost(y: np.ndarray) -> float: + """Compute segment cost as negative log-likelihood under Gaussian model. + + Cost = n * log(variance) where variance = Σ(y - mean)² / n + Lower cost = more homogeneous segment. + """ + if len(y) < 2: + return 0.0 + variance = np.var(y) + if variance < 1e-10: + return 0.0 + return len(y) * np.log(variance) - Returns: - Index of elbow point (0-based array index), or None if no clear elbow detected. - Return value can be used to threshold: scores[:elbow_idx+1] are above elbow. - Examples: - >>> scores = [0.95, 0.92, 0.88, 0.45, 0.42, 0.40] # Clear drop at index 2 - >>> find_elbow_kneedle(scores) - 2 # Select first 3 items (indices 0, 1, 2) +def find_elbow_curvature(sorted_scores: Sequence[float]) -> int | None: + """Find elbow using maximum curvature (2nd derivative method). 
+ + More mathematically rigorous than perpendicular distance: + - Curvature measures local bending intensity + - Invariant to linear transformation of axes + - Maximum curvature = point of diminishing returns + + Args: + sorted_scores: Scores sorted DESCENDING + + Returns: + Index of elbow point, or None if no significant elbow + """ + if len(sorted_scores) < 4: + return None + + scores = np.array(sorted_scores, dtype=np.float64) + + min_s, max_s = scores.min(), scores.max() + if max_s - min_s < 1e-10: + return None + + normalized = (scores - min_s) / (max_s - min_s) + + x = np.linspace(0, 1, len(normalized)) + + curvature = _discrete_curvature(normalized) + + search_start = 1 + search_end = len(curvature) - 1 + if search_end <= search_start: + return None + + max_idx = search_start + int(np.argmax(curvature[search_start:search_end])) + max_curvature = curvature[max_idx] + + if max_curvature < 0.1: + logger.debug(f"Curvature: No significant elbow (max_κ={max_curvature:.4f} < 0.1)") + return None + + logger.debug( + f"Curvature: Found elbow at index {max_idx} " + f"(κ={max_curvature:.4f}, score={sorted_scores[max_idx]:.3f})" + ) + return max_idx - >>> scores = [0.5, 0.5, 0.5, 0.5] # All identical - >>> find_elbow_kneedle(scores) - None # No elbow - >>> scores = [0.9, 0.8] # Too few points - >>> find_elbow_kneedle(scores) - None # Need at least 3 points +def find_changepoints( + sorted_scores: Sequence[float], + max_changepoints: int = 3, + min_segment_size: int = 2, +) -> List[int]: + """Find multiple changepoints using recursive binary segmentation. + + Uses BIC penalty: β = log(n) to prevent overfitting. 
+ + Args: + sorted_scores: Scores sorted DESCENDING + max_changepoints: Maximum number of changepoints to find + min_segment_size: Minimum segment size + + Returns: + List of changepoint indices (sorted), empty if none found """ - if len(sorted_scores) < 3: - logger.debug("Kneedle: Too few points (<3), cannot detect elbow") - return None # Need at least 3 points for elbow + if len(sorted_scores) < 2 * min_segment_size: + return [] + + scores = np.array(sorted_scores, dtype=np.float64) + n = len(scores) + + penalty = np.log(n) + + def find_best_split(start: int, end: int) -> Tuple[int, float]: + """Find best split point in segment [start, end).""" + if end - start < 2 * min_segment_size: + return -1, 0.0 + + segment = scores[start:end] + base_cost = _segment_cost(segment) + + best_idx = -1 + best_gain = 0.0 + + for split in range(start + min_segment_size, end - min_segment_size + 1): + left_cost = _segment_cost(scores[start:split]) + right_cost = _segment_cost(scores[split:end]) + + gain = base_cost - (left_cost + right_cost) - penalty + + if gain > best_gain: + best_gain = gain + best_idx = split + + return best_idx, best_gain + + changepoints = [] + segments = [(0, n)] + + while len(changepoints) < max_changepoints and segments: + best_segment_idx = -1 + best_split = -1 + best_gain = 0.0 + + for seg_idx, (start, end) in enumerate(segments): + split, gain = find_best_split(start, end) + if gain > best_gain: + best_gain = gain + best_split = split + best_segment_idx = seg_idx + + if best_split == -1: + break + + changepoints.append(best_split) + + start, end = segments.pop(best_segment_idx) + segments.append((start, best_split)) + segments.append((best_split, end)) + + return sorted(changepoints) - # Extract scores as numpy array - scores = np.array(sorted_scores) - # Normalize scores to [0, 1] - min_score = scores.min() - max_score = scores.max() - if max_score == min_score: - logger.debug("Kneedle: All scores identical, no elbow") - return None # All scores 
identical, no elbow +def find_elbow_kneedle(sorted_scores: Sequence[float]) -> int | None: + """Find elbow using perpendicular distance (Kneedle algorithm). + + Fallback method when curvature-based detection fails. + """ + if len(sorted_scores) < 3: + return None - normalized_scores = (scores - min_score) / (max_score - min_score) + scores = np.array(sorted_scores, dtype=np.float64) + + min_score, max_score = scores.min(), scores.max() + if max_score - min_score < 1e-10: + return None - # X-axis: normalized positions [0, 1] - x = np.linspace(0, 1, len(normalized_scores)) + normalized = (scores - min_score) / (max_score - min_score) + x = np.linspace(0, 1, len(normalized)) - # Draw line from first point to last point - # Line equation: y = mx + b - x1, y1 = x[0], normalized_scores[0] - x2, y2 = x[-1], normalized_scores[-1] + x1, y1 = x[0], normalized[0] + x2, y2 = x[-1], normalized[-1] - # Handle vertical line case (shouldn't happen with normalized x) - if x2 == x1: - logger.debug("Kneedle: Vertical line case, no elbow") + if abs(x2 - x1) < 1e-10: return None m = (y2 - y1) / (x2 - x1) b = y1 - m * x1 - # Compute perpendicular distance from each point to line - # Formula: |mx - y + b| / sqrt(m^2 + 1) - numerator = np.abs(m * x - normalized_scores + b) - denominator = np.sqrt(m**2 + 1) + numerator = np.abs(m * x - normalized + b) + denominator = np.sqrt(m ** 2 + 1) distances = numerator / denominator - # Find point with maximum distance (that's the elbow) elbow_idx = int(np.argmax(distances)) - # Validate elbow is significant (distance > 1% of normalized range) if distances[elbow_idx] < 0.01: - logger.debug( - f"Kneedle: Elbow not significant (distance={distances[elbow_idx]:.4f} < 0.01)" - ) - return None # Elbow not significant enough - - logger.debug( - f"Kneedle: Found elbow at index {elbow_idx} " - f"(distance={distances[elbow_idx]:.4f}, score={sorted_scores[elbow_idx]:.3f})" - ) + return None - # Return 0-based index (for array slicing: scores[:elbow_idx+1]) 
return elbow_idx @@ -120,79 +223,103 @@ def compute_elbow_threshold( chunks_or_scores: Union[Sequence[dict], Sequence[float]], score_key: str = "score", fallback_score_key: str = "rerank_score", + method: str = "curvature", ) -> float: - """Compute elbow threshold from chunks or scores using Kneedle algorithm. + """Compute elbow threshold using specified method. + + Args: + chunks_or_scores: List of chunks (dicts) or raw scores + score_key: Primary score key for dicts + fallback_score_key: Fallback score key + method: "curvature" (default), "kneedle", or "changepoint" + + Returns: + Threshold value at elbow point + """ + if not chunks_or_scores: + return 0.5 - Uses the Kneedle algorithm (Satopaa et al. 2011) to detect the elbow point - in the score distribution. Falls back to median if Kneedle fails to find - a significant elbow. + if isinstance(chunks_or_scores[0], dict): + chunk_list: Sequence[dict] = chunks_or_scores # type: ignore + scores = [] + for c in chunk_list: + score = c.get(score_key) + if score is None: + score = c.get(fallback_score_key, 0.0) + scores.append(float(score)) + else: + scores = [float(s) for s in chunks_or_scores] - Args: - chunks_or_scores: Either: - - List of chunks (dicts with score_key) - - List of raw float scores - score_key: Primary key to extract scores from dicts (default: "score") - fallback_score_key: Fallback key if primary not found (default: "rerank_score") + if not scores: + return 0.5 - Returns: - Threshold value (score at elbow point, or median if no elbow) + sorted_scores = sorted(scores, reverse=True) + + elbow_idx = None + + if method == "curvature": + elbow_idx = find_elbow_curvature(sorted_scores) + if elbow_idx is None: + elbow_idx = find_elbow_kneedle(sorted_scores) + elif method == "changepoint": + changepoints = find_changepoints(sorted_scores, max_changepoints=1) + if changepoints: + elbow_idx = changepoints[0] + else: + elbow_idx = find_elbow_kneedle(sorted_scores) + + if elbow_idx is not None and 0 <= 
elbow_idx < len(sorted_scores): + return float(sorted_scores[elbow_idx]) + + median_idx = len(sorted_scores) // 2 + return float(sorted_scores[median_idx]) - Examples: - >>> chunks = [{'score': 0.95}, {'score': 0.88}] - >>> compute_elbow_threshold(chunks) - 0.88 - >>> scores = [0.95, 0.88, 0.45, 0.42] - >>> compute_elbow_threshold(scores) - 0.45 +def compute_tier_thresholds( + chunks_or_scores: Union[Sequence[dict], Sequence[float]], + score_key: str = "score", + fallback_score_key: str = "rerank_score", + max_tiers: int = 3, +) -> List[float]: + """Compute multiple quality tier thresholds. + + Uses changepoint detection to find natural breaks in score distribution. + + Args: + chunks_or_scores: List of chunks or raw scores + score_key: Primary score key + fallback_score_key: Fallback score key + max_tiers: Maximum number of tiers (changepoints + 1) - >>> # With rerank scores - >>> chunks = [{'rerank_score': 0.95}, {'rerank_score': 0.45}] - >>> compute_elbow_threshold(chunks, score_key="rerank_score") - 0.45 + Returns: + List of threshold values (descending), one per tier boundary """ - # Handle empty input if not chunks_or_scores: - return 0.5 # Default threshold + return [] - # Extract scores from chunks or use raw scores if isinstance(chunks_or_scores[0], dict): - # Type narrowing: if first element is dict, all are dicts - chunk_list: Sequence[dict] = chunks_or_scores # type: ignore[assignment] + chunk_list: Sequence[dict] = chunks_or_scores # type: ignore scores = [] for c in chunk_list: - # Try primary key, then fallback, then 0.0 score = c.get(score_key) if score is None: score = c.get(fallback_score_key, 0.0) scores.append(float(score)) else: - # Type narrowing: if first element is not dict, all are floats scores = [float(s) for s in chunks_or_scores] if not scores: - return 0.5 + return [] sorted_scores = sorted(scores, reverse=True) - - # Try Kneedle algorithm first - elbow_idx = find_elbow_kneedle(sorted_scores) - if elbow_idx is not None and elbow_idx < 
len(sorted_scores): - threshold = float(sorted_scores[elbow_idx]) - logger.debug( - f"Elbow threshold: {threshold:.3f} (Kneedle at index {elbow_idx} " - f"of {len(scores)} scores)" - ) - return threshold - - # Fallback to median if Kneedle fails - median_idx = len(sorted_scores) // 2 - threshold = float(sorted_scores[median_idx]) - logger.debug( - f"Elbow threshold: {threshold:.3f} (median fallback, " - f"Kneedle found no significant elbow in {len(scores)} scores)" + + changepoints = find_changepoints( + sorted_scores, + max_changepoints=max_tiers - 1, + min_segment_size=max(2, len(sorted_scores) // 10) ) - return threshold + + return [float(sorted_scores[cp]) for cp in changepoints] def filter_by_elbow( @@ -200,55 +327,37 @@ def filter_by_elbow( score_key: str = "score", fallback_score_key: str = "rerank_score", min_results: int = 1, + method: str = "curvature", ) -> list[dict]: - """Filter results using elbow detection for adaptive thresholding. + """Filter results using elbow detection. Args: - results: List of result dicts with score fields - score_key: Primary key to extract scores (default: "score") - fallback_score_key: Fallback key if primary not found (default: "rerank_score") - min_results: Minimum number of results to return (default: 1) + results: List of result dicts + score_key: Primary score key + fallback_score_key: Fallback score key + min_results: Minimum results to return + method: Detection method ("curvature", "kneedle", "changepoint") Returns: - Filtered list of results above elbow threshold - - Example: - >>> results = [ - ... {"id": 1, "score": 0.95}, - ... {"id": 2, "score": 0.88}, - ... {"id": 3, "score": 0.45}, # <- elbow here - ... {"id": 4, "score": 0.42}, - ... 
] - >>> filtered = filter_by_elbow(results) - >>> len(filtered) - 3 # Only items above elbow threshold (0.45) + Filtered results above elbow threshold """ if not results: return [] - threshold = compute_elbow_threshold(results, score_key, fallback_score_key) + threshold = compute_elbow_threshold( + results, score_key, fallback_score_key, method + ) - filtered = [] - for r in results: + def get_score(r: dict) -> float: score = r.get(score_key) if score is None: score = r.get(fallback_score_key, 0.0) - if float(score) >= threshold: - filtered.append(r) + return float(score) + + filtered = [r for r in results if get_score(r) >= threshold] - # Ensure minimum results if len(filtered) < min_results and len(results) >= min_results: - # Return top min_results by score - def _get_score(x): - score = x.get(score_key) - if score is None: - score = x.get(fallback_score_key, 0.0) - return float(score) - sorted_results = sorted( - results, - key=_get_score, - reverse=True - ) + sorted_results = sorted(results, key=get_score, reverse=True) return sorted_results[:min_results] return filtered if filtered else results[:min_results] diff --git a/scripts/hybrid/termination.py b/scripts/hybrid/termination.py index 1907fa79..d1822c3f 100644 --- a/scripts/hybrid/termination.py +++ b/scripts/hybrid/termination.py @@ -1,11 +1,12 @@ -"""Smart termination conditions for iterative search operations. - -Implements 5 termination conditions from ChunkHound's multi-hop strategy: -1. Time limit (default 5 seconds) -2. Result limit (default 500 chunks) -3. Candidate quality (need N+ high-scoring for expansion) -4. Score degradation (stop if tracked scores drop by threshold) -5. Minimum relevance (stop if top-N min score below threshold) +"""Smart termination for iterative search operations. + +Mathematical foundations: +1. Welford's algorithm - O(1) online variance for adaptive thresholds +2. Page-Hinkley test - detects mean shift in streaming data +3. 
Statistical termination - uses 2-sigma rule instead of fixed thresholds + +Welford's update: δ = x - μ, μ' = μ + δ/n, M2' = M2 + δ(x - μ') +Page-Hinkley: cumsum of (x - μ - δ), detect when max deviation exceeds threshold """ from __future__ import annotations @@ -13,34 +14,124 @@ import logging import time from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Sequence +from typing import Dict, List, Tuple, Sequence, Optional +import math logger = logging.getLogger(__name__) +@dataclass +class WelfordState: + """Online variance computation using Welford's algorithm.""" + n: int = 0 + mean: float = 0.0 + m2: float = 0.0 + + def update(self, x: float) -> None: + """O(1) update with new value.""" + self.n += 1 + delta = x - self.mean + self.mean += delta / self.n + delta2 = x - self.mean + self.m2 += delta * delta2 + + @property + def variance(self) -> float: + return self.m2 / self.n if self.n > 1 else 0.0 + + @property + def std(self) -> float: + return math.sqrt(self.variance) + + def adaptive_threshold(self, sigma_multiplier: float = 2.0) -> float: + """Return threshold as mean - sigma_multiplier * std.""" + return self.mean - sigma_multiplier * self.std + + +@dataclass +class PageHinkleyState: + """Page-Hinkley test for mean shift detection. + + Detects when cumulative deviation from mean exceeds threshold. + Good for detecting gradual degradation, not just sudden drops. 
+ """ + delta: float = 0.005 + threshold: float = 15.0 + n: int = 0 + mean: float = 0.0 + cumsum: float = 0.0 + cumsum_min: float = 0.0 + + def update(self, x: float) -> bool: + """Update and return True if drift detected.""" + self.n += 1 + + if self.n == 1: + self.mean = x + return False + + self.mean = ((self.n - 1) * self.mean + x) / self.n + + self.cumsum += x - self.mean - self.delta + self.cumsum_min = min(self.cumsum_min, self.cumsum) + + if self.cumsum - self.cumsum_min > self.threshold: + return True + + return False + + def reset(self) -> None: + self.n = 0 + self.mean = 0.0 + self.cumsum = 0.0 + self.cumsum_min = 0.0 + + @dataclass class TerminationConfig: time_limit: float = 5.0 result_limit: int = 500 min_candidates_for_expansion: int = 5 - score_degradation_threshold: float = 0.15 + + use_adaptive_threshold: bool = True + sigma_multiplier: float = 2.0 + fixed_degradation_threshold: float = 0.15 + + use_page_hinkley: bool = True + page_hinkley_delta: float = 0.005 + page_hinkley_threshold: float = 15.0 + min_relevance_score: float = 0.3 top_n_to_track: int = 5 + + min_iterations_before_stop: int = 2 class TerminationChecker: - """Checks 5 termination conditions for iterative search operations.""" + """Statistically-grounded termination for iterative search.""" def __init__(self, config: TerminationConfig | None = None): self.config = config or TerminationConfig() self.start_time = time.perf_counter() - self.tracked_chunk_scores: Dict[str, float] = {} self.iteration = 0 + + self.tracked_chunk_scores: Dict[str, float] = {} + + self.score_stats = WelfordState() + self.page_hinkley = PageHinkleyState( + delta=self.config.page_hinkley_delta, + threshold=self.config.page_hinkley_threshold, + ) + + self.top_scores_history: List[float] = [] def reset(self) -> None: self.start_time = time.perf_counter() - self.tracked_chunk_scores.clear() self.iteration = 0 + self.tracked_chunk_scores.clear() + self.score_stats = WelfordState() + self.page_hinkley.reset() + 
self.top_scores_history.clear() def elapsed(self) -> float: return time.perf_counter() - self.start_time @@ -51,68 +142,155 @@ def check( score_key: str = "score", id_key: str = "chunk_id", ) -> Tuple[bool, str]: - """Check all termination conditions. + """Check termination conditions with statistical methods. Returns: - (should_terminate, reason) - reason is empty string if should continue + (should_terminate, reason) """ self.iteration += 1 if self.elapsed() >= self.config.time_limit: - logger.debug(f"Termination: time limit {self.config.time_limit}s reached") + logger.debug(f"Termination: time limit {self.config.time_limit}s") return True, "time_limit" if len(results) >= self.config.result_limit: - logger.debug(f"Termination: result limit {self.config.result_limit} reached") + logger.debug(f"Termination: result limit {self.config.result_limit}") return True, "result_limit" high_scoring = [r for r in results if r.get(score_key, 0) > 0] if len(high_scoring) < self.config.min_candidates_for_expansion: - logger.debug( - f"Termination: insufficient candidates " - f"({len(high_scoring)} < {self.config.min_candidates_for_expansion})" - ) + logger.debug(f"Termination: insufficient candidates ({len(high_scoring)})") return True, "insufficient_candidates" sorted_results = sorted(results, key=lambda x: -x.get(score_key, 0)) top_n = sorted_results[:self.config.top_n_to_track] - if self.tracked_chunk_scores: - max_drop = 0.0 - for chunk_id, prev_score in self.tracked_chunk_scores.items(): - current_score = next( - (r.get(score_key, 0) for r in results if r.get(id_key) == chunk_id), - 0.0 - ) - if current_score < prev_score: - max_drop = max(max_drop, prev_score - current_score) + if top_n: + top_score = top_n[0].get(score_key, 0) + self.score_stats.update(top_score) + self.top_scores_history.append(top_score) + + if self.iteration >= self.config.min_iterations_before_stop: - if max_drop >= self.config.score_degradation_threshold: - logger.debug( - f"Termination: score 
degradation {max_drop:.3f} >= " - f"{self.config.score_degradation_threshold}" - ) - return True, "score_degradation" + if self.config.use_page_hinkley and top_n: + top_score = top_n[0].get(score_key, 0) + if self.page_hinkley.update(top_score): + logger.debug("Termination: Page-Hinkley detected score drift") + return True, "score_drift_detected" + + if self.tracked_chunk_scores and self.iteration > 2: + if self.config.use_adaptive_threshold: + threshold = self.score_stats.adaptive_threshold( + self.config.sigma_multiplier + ) + if threshold <= 0: + threshold = self.config.fixed_degradation_threshold + else: + threshold = self.config.fixed_degradation_threshold + + max_drop = 0.0 + for chunk_id, prev_score in self.tracked_chunk_scores.items(): + current_score = next( + (r.get(score_key, 0) for r in results if r.get(id_key) == chunk_id), + 0.0 + ) + if current_score < prev_score: + max_drop = max(max_drop, prev_score - current_score) + + if max_drop >= threshold: + logger.debug( + f"Termination: score degradation {max_drop:.3f} >= " + f"threshold {threshold:.3f}" + ) + return True, "score_degradation" self.tracked_chunk_scores.clear() for r in top_n: chunk_id = r.get(id_key) if chunk_id: self.tracked_chunk_scores[chunk_id] = r.get(score_key, 0) + if top_n: min_score = min(r.get(score_key, 0) for r in top_n) if min_score < self.config.min_relevance_score: - logger.debug( - f"Termination: min relevance {min_score:.3f} < " - f"{self.config.min_relevance_score}" - ) + logger.debug(f"Termination: min relevance {min_score:.3f}") return True, "min_relevance" return False, "" - def get_stats(self) -> Dict[str, any]: + def get_stats(self) -> Dict[str, float]: return { "iterations": self.iteration, "elapsed_seconds": round(self.elapsed(), 3), "tracked_chunks": len(self.tracked_chunk_scores), + "score_mean": round(self.score_stats.mean, 4), + "score_std": round(self.score_stats.std, 4), + "adaptive_threshold": round( + 
self.score_stats.adaptive_threshold(self.config.sigma_multiplier), 4 + ), + "page_hinkley_cumsum": round(self.page_hinkley.cumsum, 4), } + + +def mann_whitney_u(x: Sequence[float], y: Sequence[float]) -> Tuple[float, float]: + """Mann-Whitney U test for comparing two score distributions. + + Returns (U statistic, approximate p-value using normal approximation). + Useful for comparing score quality across iterations. + """ + nx, ny = len(x), len(y) + if nx == 0 or ny == 0: + return 0.0, 1.0 + + combined = [(v, 0) for v in x] + [(v, 1) for v in y] + combined.sort(key=lambda t: t[0]) + + ranks = {} + i = 0 + while i < len(combined): + j = i + while j < len(combined) and combined[j][0] == combined[i][0]: + j += 1 + avg_rank = (i + j + 1) / 2.0 + for k in range(i, j): + val = combined[k][0] + if val not in ranks: + ranks[val] = [] + ranks[val].append(avg_rank) + i = j + + r1 = sum(ranks[v][0] if len(ranks[v]) == 1 else ranks[v].pop(0) for v in x) + + u1 = r1 - nx * (nx + 1) / 2 + u2 = nx * ny - u1 + u = min(u1, u2) + + mu = nx * ny / 2 + sigma = math.sqrt(nx * ny * (nx + ny + 1) / 12) + + if sigma == 0: + return u, 1.0 + + z = (u - mu) / sigma + + p = 2 * (1 - _normal_cdf(abs(z))) + + return u, p + + +def _normal_cdf(x: float) -> float: + """Standard normal CDF approximation (Abramowitz & Stegun).""" + a1 = 0.254829592 + a2 = -0.284496736 + a3 = 1.421413741 + a4 = -1.453152027 + a5 = 1.061405429 + p = 0.3275911 + + sign = 1 if x >= 0 else -1 + x = abs(x) + + t = 1.0 / (1.0 + p * x) + y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.exp(-x * x / 2) + + return 0.5 * (1.0 + sign * y) diff --git a/scripts/indexing_admin.py b/scripts/indexing_admin.py index fd2389fd..1e8abf5f 100644 --- a/scripts/indexing_admin.py +++ b/scripts/indexing_admin.py @@ -1078,7 +1078,7 @@ def recreate_collection_qdrant(*, qdrant_url: str, api_key: Optional[str], colle # Also delete the graph collection if it exists # Graph collections are tightly coupled to their main 
collection - # The decision to recreate happens during ingest (based on INDEX_GRAPH_EDGES) + # Graph edges are always indexed (Qdrant flat graph is always on) if get_graph_collection_name_t is not None: graph_name = get_graph_collection_name_t(name) try: diff --git a/scripts/ingest/chunk_deduplication.py b/scripts/ingest/chunk_deduplication.py index 0798261d..5f38d066 100644 --- a/scripts/ingest/chunk_deduplication.py +++ b/scripts/ingest/chunk_deduplication.py @@ -4,14 +4,21 @@ 1. Exact content matching via hash table (O(n)) 2. Substring detection via sorted interval scan (O(n log n)) -Ported from ChunkHound to Context-Engine. +Specificity scoring uses weighted formula: + score = w_type * type_weight + w_size * log(line_count) + w_name * has_name + +where: + - type_weight: structural importance (definition > block > comment) + - log(line_count): information content (more lines = more context) + - has_name: named symbols are more referenceable """ from __future__ import annotations import logging +import math from collections import defaultdict -from typing import Sequence, TypeVar +from typing import Sequence, TypeVar, Dict, Any import xxhash @@ -19,27 +26,28 @@ T = TypeVar("T", bound=dict) -# Specificity ranking (higher = more specific, keep over lower) -CONCEPT_SPECIFICITY = { - # Context-Engine chunk types - "function": 4, - "method": 4, - "class": 4, - "interface": 4, - "struct": 4, - "enum": 4, - "type_alias": 3, - "import": 3, - "comment": 2, - "block": 1, - "array": 1, - "structure": 0, - # CAST+ concept types (from concept_extractor) - lowercase to match get_chunk_specificity() - "definition": 4, - "import": 3, - "comment": 2, - "block": 1, - # Note: "structure" already defined above +TYPE_WEIGHTS: Dict[str, float] = { + "function": 1.0, + "method": 1.0, + "class": 1.0, + "interface": 1.0, + "struct": 1.0, + "enum": 1.0, + "definition": 1.0, + "type_alias": 0.8, + "type": 0.8, + "import": 0.6, + "comment": 0.4, + "docstring": 0.4, + "block": 0.3, + 
"array": 0.2, + "structure": 0.1, +} + +SPECIFICITY_WEIGHTS = { + "type": 0.5, + "size": 0.3, + "name": 0.2, } @@ -48,19 +56,58 @@ def normalize_content(content: str) -> str: return content.replace("\r\n", "\n").replace("\r", "\n").strip() -def get_chunk_specificity(chunk: dict) -> int: - """Get specificity ranking for chunk's type. Higher = more specific.""" +def _extract_type_name(chunk: dict) -> str: + """Extract normalized type name from chunk.""" chunk_type = chunk.get("chunk_type") or chunk.get("concept") or chunk.get("type", "") if isinstance(chunk_type, str): - type_name = chunk_type.lower() + return chunk_type.lower() elif hasattr(chunk_type, "value"): - type_name = str(chunk_type.value).lower() + return str(chunk_type.value).lower() elif hasattr(chunk_type, "name"): - type_name = chunk_type.name.lower() - else: - type_name = str(chunk_type).lower() if chunk_type else "" + return chunk_type.name.lower() + return str(chunk_type).lower() if chunk_type else "" + + +def compute_specificity_score(chunk: dict) -> float: + """Compute specificity score using weighted formula. + + score = w_type * type_weight + w_size * log(1 + line_count) + w_name * has_name + + Higher score = more specific, should be kept over lower-scoring duplicates. 
+ """ + type_name = _extract_type_name(chunk) + type_weight = TYPE_WEIGHTS.get(type_name, 0.0) + + start_line = chunk.get("start_line", 0) + end_line = chunk.get("end_line", 0) + line_count = max(1, end_line - start_line + 1) + size_score = math.log(1 + line_count) / math.log(1000) + + has_name = 1.0 if chunk.get("name") or chunk.get("symbol") else 0.0 + + score = ( + SPECIFICITY_WEIGHTS["type"] * type_weight + + SPECIFICITY_WEIGHTS["size"] * min(1.0, size_score) + + SPECIFICITY_WEIGHTS["name"] * has_name + ) + + return score + + +def get_chunk_specificity(chunk: dict) -> int: + """Get integer specificity ranking (legacy interface, 0-4 scale).""" + type_name = _extract_type_name(chunk) + weight = TYPE_WEIGHTS.get(type_name, 0.0) - return CONCEPT_SPECIFICITY.get(type_name, -1) + if weight >= 0.9: + return 4 + elif weight >= 0.7: + return 3 + elif weight >= 0.5: + return 2 + elif weight >= 0.3: + return 1 + return 0 def deduplicate_chunks( diff --git a/scripts/ingest/pipeline.py b/scripts/ingest/pipeline.py index 21d5ef3e..35fbfb55 100644 --- a/scripts/ingest/pipeline.py +++ b/scripts/ingest/pipeline.py @@ -1161,106 +1161,106 @@ def make_point( ] upsert_points(client, collection, points) - # Emit graph edges for symbol relationships + # Emit graph edges for symbol relationships (always on - Qdrant flat graph) + # Neo4j takes precedence when NEO4J_GRAPH=1 is set # Always try symbol-level edges first, fall back to file-level if no symbol_calls try: - if os.environ.get("INDEX_GRAPH_EDGES", "1").lower() in {"1", "true", "yes", "on"}: - graph_coll = ensure_graph_collection(client, collection) - # Delete old edges for this file before upserting new ones - delete_edges_by_path(client, graph_coll, str(file_path), repo=repo_tag) - - all_edges = [] - if symbol_calls: - # Symbol-level edges: use AST-extracted caller→callee relationships - for caller, callees in symbol_calls.items(): - if not caller or not callees: - continue - start_line = None - end_line = None - sym_info = 
symbol_meta_by_path.get(caller) or symbol_meta_by_name.get(caller) - if sym_info is not None: - try: - start_line = int(getattr(sym_info, "start_line", 0) or 0) - end_line = int(getattr(sym_info, "end_line", 0) or 0) - except Exception: - start_line = None - end_line = None - # Get caller_point_id from symbol_path mapping - caller_pid = symbol_path_to_point_id.get(caller) - all_edges.extend( - _extract_call_edges_compat( - symbol_path=caller, - calls=callees, - path=str(file_path), - repo=repo_tag, - start_line=start_line, - end_line=end_line, - language=language, - caller_point_id=caller_pid, - import_paths=import_map, - collection=collection, - qdrant_client=client, - ) - ) - if imports: - # For file-level imports, use first point ID if available - file_pid = next(iter(symbol_path_to_point_id.values()), None) if symbol_path_to_point_id else None - all_edges.extend( - _extract_import_edges_compat( - symbol_path=str(file_path), - imports=imports, - path=str(file_path), - repo=repo_tag, - language=language, - caller_point_id=file_pid, - collection=collection, - qdrant_client=client, - ) - ) - else: - # File-level fallback: emit file→symbol edges - source_file_path = str(file_path) - # Use first point ID for file-level edges - file_pid = next(iter(symbol_path_to_point_id.values()), None) if symbol_path_to_point_id else None - if calls: - all_edges.extend(_extract_call_edges_compat( - symbol_path=source_file_path, - calls=calls, - path=source_file_path, + graph_coll = ensure_graph_collection(client, collection) + # Delete old edges for this file before upserting new ones + delete_edges_by_path(client, graph_coll, str(file_path), repo=repo_tag) + + all_edges = [] + if symbol_calls: + # Symbol-level edges: use AST-extracted caller→callee relationships + for caller, callees in symbol_calls.items(): + if not caller or not callees: + continue + start_line = None + end_line = None + sym_info = symbol_meta_by_path.get(caller) or symbol_meta_by_name.get(caller) + if sym_info 
is not None: + try: + start_line = int(getattr(sym_info, "start_line", 0) or 0) + end_line = int(getattr(sym_info, "end_line", 0) or 0) + except Exception: + start_line = None + end_line = None + # Get caller_point_id from symbol_path mapping + caller_pid = symbol_path_to_point_id.get(caller) + all_edges.extend( + _extract_call_edges_compat( + symbol_path=caller, + calls=callees, + path=str(file_path), repo=repo_tag, - caller_point_id=file_pid, + start_line=start_line, + end_line=end_line, + language=language, + caller_point_id=caller_pid, import_paths=import_map, collection=collection, qdrant_client=client, - )) - if imports: - all_edges.extend(_extract_import_edges_compat( - symbol_path=source_file_path, + ) + ) + if imports: + # For file-level imports, use first point ID if available + file_pid = next(iter(symbol_path_to_point_id.values()), None) if symbol_path_to_point_id else None + all_edges.extend( + _extract_import_edges_compat( + symbol_path=str(file_path), imports=imports, - path=source_file_path, + path=str(file_path), repo=repo_tag, + language=language, caller_point_id=file_pid, collection=collection, qdrant_client=client, + ) + ) + else: + # File-level fallback: emit file→symbol edges + source_file_path = str(file_path) + # Use first point ID for file-level edges + file_pid = next(iter(symbol_path_to_point_id.values()), None) if symbol_path_to_point_id else None + if calls: + all_edges.extend(_extract_call_edges_compat( + symbol_path=source_file_path, + calls=calls, + path=source_file_path, + repo=repo_tag, + caller_point_id=file_pid, + import_paths=import_map, + collection=collection, + qdrant_client=client, + )) + if imports: + all_edges.extend(_extract_import_edges_compat( + symbol_path=source_file_path, + imports=imports, + path=source_file_path, + repo=repo_tag, + caller_point_id=file_pid, + collection=collection, + qdrant_client=client, + )) + + # Extract inheritance edges (INHERITS_FROM) for all classes + if inheritance_map: + for class_name, 
base_classes in inheritance_map.items(): + if class_name and base_classes: + all_edges.extend(extract_inheritance_edges( + class_name=class_name, + base_classes=base_classes, + path=str(file_path), + repo=repo_tag, + language=language, + import_paths=import_map, + collection=collection, + qdrant_client=client, )) - # Extract inheritance edges (INHERITS_FROM) for all classes - if inheritance_map: - for class_name, base_classes in inheritance_map.items(): - if class_name and base_classes: - all_edges.extend(extract_inheritance_edges( - class_name=class_name, - base_classes=base_classes, - path=str(file_path), - repo=repo_tag, - language=language, - import_paths=import_map, - collection=collection, - qdrant_client=client, - )) - - if all_edges: - upsert_edges(client, graph_coll, all_edges) + if all_edges: + upsert_edges(client, graph_coll, all_edges) except Exception as e: # Don't fail indexing if graph edges fail logger.warning(f"Failed to emit graph edges for {file_path}: {e}") @@ -2162,99 +2162,99 @@ def process_file_with_smart_reindexing( if all_points: _upsert_points_fn(client, current_collection, all_points) - # Emit graph edges for symbol relationships + # Emit graph edges for symbol relationships (always on - Qdrant flat graph) + # Neo4j takes precedence when NEO4J_GRAPH=1 is set # Always try symbol-level edges first, fall back to file-level if no symbol_calls try: - if os.environ.get("INDEX_GRAPH_EDGES", "1").lower() in {"1", "true", "yes", "on"}: - graph_coll = ensure_graph_collection(client, current_collection) - delete_edges_by_path(client, graph_coll, fp, repo=per_file_repo) - - all_edges = [] - if symbol_calls: - # Symbol-level edges: use AST-extracted caller→callee relationships - for caller, callees in symbol_calls.items(): - if not caller or not callees: - continue - start_line = None - sym_info = symbol_meta_by_path.get(caller) or symbol_meta_by_name.get(caller) - if sym_info is not None: - try: - start_line = int(getattr(sym_info, "start_line", 0) 
or 0) - except Exception: - start_line = None - # Get caller_point_id from symbol_path mapping - caller_pid = symbol_path_to_point_id_sr.get(caller) - all_edges.extend( - _extract_call_edges_compat( - symbol_path=caller, - calls=callees, - path=fp, - repo=per_file_repo, - start_line=start_line, - language=language, - caller_point_id=caller_pid, - import_paths=import_map, - ) - ) - if imports: - # For file-level imports, use first point ID if available - file_pid = next(iter(symbol_path_to_point_id_sr.values()), None) if symbol_path_to_point_id_sr else None - all_edges.extend( - _extract_import_edges_compat( - symbol_path=fp, - imports=imports, - path=fp, - repo=per_file_repo, - language=language, - caller_point_id=file_pid, - ) + graph_coll = ensure_graph_collection(client, current_collection) + delete_edges_by_path(client, graph_coll, fp, repo=per_file_repo) + + all_edges = [] + if symbol_calls: + # Symbol-level edges: use AST-extracted caller→callee relationships + for caller, callees in symbol_calls.items(): + if not caller or not callees: + continue + start_line = None + sym_info = symbol_meta_by_path.get(caller) or symbol_meta_by_name.get(caller) + if sym_info is not None: + try: + start_line = int(getattr(sym_info, "start_line", 0) or 0) + except Exception: + start_line = None + # Get caller_point_id from symbol_path mapping + caller_pid = symbol_path_to_point_id_sr.get(caller) + all_edges.extend( + _extract_call_edges_compat( + symbol_path=caller, + calls=callees, + path=fp, + repo=per_file_repo, + start_line=start_line, + language=language, + caller_point_id=caller_pid, + import_paths=import_map, ) - else: - # File-level fallback: emit file→symbol edges - meta0 = {} - try: - if all_points and hasattr(all_points[0], "payload"): - meta0 = all_points[0].payload.get("metadata", {}) or {} - except Exception: - meta0 = {} - file_calls = meta0.get("calls", []) or [] - file_imports = meta0.get("imports", []) or [] - file_import_map = meta0.get("import_map", {}) or 
{} - # Use first point ID for file-level edges + ) + if imports: + # For file-level imports, use first point ID if available file_pid = next(iter(symbol_path_to_point_id_sr.values()), None) if symbol_path_to_point_id_sr else None - if file_calls: - all_edges.extend(_extract_call_edges_compat( + all_edges.extend( + _extract_import_edges_compat( symbol_path=fp, - calls=file_calls, + imports=imports, path=fp, repo=per_file_repo, + language=language, caller_point_id=file_pid, - import_paths=file_import_map, - )) - if file_imports: - all_edges.extend(_extract_import_edges_compat( - symbol_path=fp, - imports=file_imports, + ) + ) + else: + # File-level fallback: emit file→symbol edges + meta0 = {} + try: + if all_points and hasattr(all_points[0], "payload"): + meta0 = all_points[0].payload.get("metadata", {}) or {} + except Exception: + meta0 = {} + file_calls = meta0.get("calls", []) or [] + file_imports = meta0.get("imports", []) or [] + file_import_map = meta0.get("import_map", {}) or {} + # Use first point ID for file-level edges + file_pid = next(iter(symbol_path_to_point_id_sr.values()), None) if symbol_path_to_point_id_sr else None + if file_calls: + all_edges.extend(_extract_call_edges_compat( + symbol_path=fp, + calls=file_calls, + path=fp, + repo=per_file_repo, + caller_point_id=file_pid, + import_paths=file_import_map, + )) + if file_imports: + all_edges.extend(_extract_import_edges_compat( + symbol_path=fp, + imports=file_imports, + path=fp, + repo=per_file_repo, + caller_point_id=file_pid, + )) + + # Extract inheritance edges (INHERITS_FROM) for all classes + if inheritance_map: + for class_name, base_classes in inheritance_map.items(): + if class_name and base_classes: + all_edges.extend(extract_inheritance_edges( + class_name=class_name, + base_classes=base_classes, path=fp, repo=per_file_repo, - caller_point_id=file_pid, + language=language, + import_paths=import_map, )) - # Extract inheritance edges (INHERITS_FROM) for all classes - if inheritance_map: - 
for class_name, base_classes in inheritance_map.items(): - if class_name and base_classes: - all_edges.extend(extract_inheritance_edges( - class_name=class_name, - base_classes=base_classes, - path=fp, - repo=per_file_repo, - language=language, - import_paths=import_map, - )) - - if all_edges: - upsert_edges(client, graph_coll, all_edges) + if all_edges: + upsert_edges(client, graph_coll, all_edges) except Exception as e: logger.warning(f"Failed to emit graph edges for {fp}: {e}") diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index 99eae638..ea63506a 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -2282,7 +2282,7 @@ def get_indexing_config_snapshot() -> Dict[str, Any]: "index_use_enhanced_ast": _env_truthy("INDEX_USE_ENHANCED_AST", False), "mini_vec_dim": _env_int("MINI_VEC_DIM"), "lex_sparse_mode": _env_truthy("LEX_SPARSE_MODE", False), - "index_graph_edges": _env_truthy("INDEX_GRAPH_EDGES", True), + "index_graph_edges": True, # Always on - Qdrant flat graph is unconditional } diff --git a/tests/test_workspace_state.py b/tests/test_workspace_state.py index 6d812726..d21991c1 100644 --- a/tests/test_workspace_state.py +++ b/tests/test_workspace_state.py @@ -442,29 +442,34 @@ class TestConfigDrift: """Tests for indexing config drift detection.""" def test_get_indexing_config_snapshot_includes_graph_edges(self, ws_module, monkeypatch): - """Verify index_graph_edges key exists in snapshot with default value True.""" - # Clear any existing env vars to test defaults - monkeypatch.delenv("INDEX_GRAPH_EDGES", raising=False) + """Verify index_graph_edges key exists in snapshot and is always True. + Symbol graph (Qdrant flat graph) is always on - this value is no longer + configurable via env var. Use NEO4J_GRAPH=1 to enable Neo4j backend instead. 
+ """ snapshot = ws_module.get_indexing_config_snapshot() assert "index_graph_edges" in snapshot, "index_graph_edges should be in config snapshot" - assert snapshot["index_graph_edges"] is True, "Default value for index_graph_edges should be True" + assert snapshot["index_graph_edges"] is True, "index_graph_edges should always be True (always on)" - def test_get_indexing_config_snapshot_respects_env_var(self, ws_module, monkeypatch): - """Verify INDEX_GRAPH_EDGES env var is respected in snapshot.""" - # Test with False + def test_get_indexing_config_snapshot_graph_edges_always_true(self, ws_module, monkeypatch): + """Verify index_graph_edges is always True regardless of env var (now unconditional).""" + # Even with env var set to 0, index_graph_edges should be True (always on) monkeypatch.setenv("INDEX_GRAPH_EDGES", "0") snapshot = ws_module.get_indexing_config_snapshot() - assert snapshot["index_graph_edges"] is False, "INDEX_GRAPH_EDGES=0 should set index_graph_edges to False" + assert snapshot["index_graph_edges"] is True, "index_graph_edges should always be True (env var ignored)" - # Test with True + # Same with env var set to 1 monkeypatch.setenv("INDEX_GRAPH_EDGES", "1") snapshot = ws_module.get_indexing_config_snapshot() - assert snapshot["index_graph_edges"] is True, "INDEX_GRAPH_EDGES=1 should set index_graph_edges to True" + assert snapshot["index_graph_edges"] is True, "index_graph_edges should always be True" def test_config_drift_classifies_graph_edges_as_recreate(self, ws_module): - """Verify that changing INDEX_GRAPH_EDGES triggers recreate drift.""" + """Verify that changing index_graph_edges triggers recreate drift. + + Note: index_graph_edges is now always True, but drift rules still exist + for backwards compatibility with existing indexes that may have False. 
+ """ from scripts import indexing_admin # Verify the drift rule exists and is classified as "recreate" @@ -473,23 +478,15 @@ def test_config_drift_classifies_graph_edges_as_recreate(self, ws_module): assert indexing_admin.CONFIG_DRIFT_RULES["index_graph_edges"] == "recreate", \ "index_graph_edges drift should be classified as 'recreate'" - def test_config_drift_graph_edges_true_to_false(self, ws_module): - """Verify drift from True->False is classified as recreate.""" - from scripts import indexing_admin - - old_config = {"index_graph_edges": True} - new_config = {"index_graph_edges": False} - - # The actual drift detection is more complex, but we can verify the rule - rule = indexing_admin.CONFIG_DRIFT_RULES.get("index_graph_edges") - assert rule == "recreate", "Changing index_graph_edges should require recreate" + def test_config_drift_graph_edges_legacy_false_to_true(self, ws_module): + """Verify drift from legacy False->True is classified as recreate. - def test_config_drift_graph_edges_false_to_true(self, ws_module): - """Verify drift from False->True is classified as recreate.""" + This handles migration from old indexes where graph edges were disabled. + """ from scripts import indexing_admin - old_config = {"index_graph_edges": False} - new_config = {"index_graph_edges": True} + old_config = {"index_graph_edges": False} # Legacy: was disabled + new_config = {"index_graph_edges": True} # Now: always on # The actual drift detection is more complex, but we can verify the rule rule = indexing_admin.CONFIG_DRIFT_RULES.get("index_graph_edges") From 56e752246d912ddbd63537463dcdd7ad0f9133ad Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 10:35:06 -0500 Subject: [PATCH 12/29] Refine Page-Hinkley test and update related tests Modified the Page-Hinkley test in termination.py to detect downward mean shifts, updated its threshold and logic, and clarified docstrings. 
Adjusted TerminationConfig defaults and improved test coverage in test_termination.py to reflect the new Page-Hinkley behavior. Also updated chunk specificity logic and tests in test_chunk_deduplication_core.py to return 0 for unknown types. --- scripts/ctx.py | 8 ++++++-- scripts/hybrid/termination.py | 27 +++++++++++++++----------- tests/test_chunk_deduplication_core.py | 8 ++++---- tests/test_termination.py | 26 +++++++++++++++++++------ 4 files changed, 46 insertions(+), 23 deletions(-) diff --git a/scripts/ctx.py b/scripts/ctx.py index 07ae7496..c3e03f5c 100755 --- a/scripts/ctx.py +++ b/scripts/ctx.py @@ -1070,7 +1070,9 @@ def fetch_context(query: str, **filters) -> Tuple[str, str]: sys.stderr.flush() return "", "Context retrieval returned no data." - hits = data.get("results") or [] + hits = data.get("results_json") or data.get("results") or [] + if isinstance(hits, str): + hits = [] relevance = _estimate_query_result_relevance(query, hits) sys.stderr.write(f"[DEBUG] repo_search returned {len(hits)} hits (relevance={relevance:.3f})\n") sys.stderr.flush() @@ -1120,7 +1122,9 @@ def fetch_context(query: str, **filters) -> Tuple[str, str]: if "error" not in memory_result: memory_data = parse_mcp_response(memory_result) if memory_data: - memory_hits = memory_data.get("results") or [] + memory_hits = memory_data.get("results_json") or memory_data.get("results") or [] + if isinstance(memory_hits, str): + memory_hits = [] if memory_hits: return format_search_results(memory_hits, include_snippets=with_snippets), "Using memories and design docs" return "", "No relevant context found for the prompt." diff --git a/scripts/hybrid/termination.py b/scripts/hybrid/termination.py index d1822c3f..ba588f83 100644 --- a/scripts/hybrid/termination.py +++ b/scripts/hybrid/termination.py @@ -50,20 +50,24 @@ def adaptive_threshold(self, sigma_multiplier: float = 2.0) -> float: @dataclass class PageHinkleyState: - """Page-Hinkley test for mean shift detection. 
+ """Page-Hinkley test for DOWNWARD mean shift detection (score degradation). - Detects when cumulative deviation from mean exceeds threshold. - Good for detecting gradual degradation, not just sudden drops. + Detects when scores drop significantly below the running mean. + Cumsum formula: cumsum += (mean - x + delta) + When x consistently falls below mean, cumsum grows and triggers detection. + + This is the inverse of the standard PH test (which detects upward drift). + Optimized for search relevance degradation detection. """ delta: float = 0.005 - threshold: float = 15.0 + threshold: float = 0.5 n: int = 0 mean: float = 0.0 cumsum: float = 0.0 - cumsum_min: float = 0.0 + cumsum_max: float = 0.0 def update(self, x: float) -> bool: - """Update and return True if drift detected.""" + """Update and return True if downward drift detected.""" self.n += 1 if self.n == 1: @@ -72,10 +76,11 @@ def update(self, x: float) -> bool: self.mean = ((self.n - 1) * self.mean + x) / self.n - self.cumsum += x - self.mean - self.delta - self.cumsum_min = min(self.cumsum_min, self.cumsum) + # cumsum += (mean - x + delta): grows when x < mean + self.cumsum += self.mean - x + self.delta + self.cumsum_max = max(self.cumsum_max, self.cumsum) - if self.cumsum - self.cumsum_min > self.threshold: + if self.cumsum > self.threshold: return True return False @@ -84,7 +89,7 @@ def reset(self) -> None: self.n = 0 self.mean = 0.0 self.cumsum = 0.0 - self.cumsum_min = 0.0 + self.cumsum_max = 0.0 @dataclass @@ -99,7 +104,7 @@ class TerminationConfig: use_page_hinkley: bool = True page_hinkley_delta: float = 0.005 - page_hinkley_threshold: float = 15.0 + page_hinkley_threshold: float = 0.5 min_relevance_score: float = 0.3 top_n_to_track: int = 5 diff --git a/tests/test_chunk_deduplication_core.py b/tests/test_chunk_deduplication_core.py index a3a8c777..7fd794cb 100644 --- a/tests/test_chunk_deduplication_core.py +++ b/tests/test_chunk_deduplication_core.py @@ -6,7 +6,7 @@ get_chunk_specificity, 
deduplicate_chunks, deduplicate_semantic_chunks, - CONCEPT_SPECIFICITY, + TYPE_WEIGHTS, ) @@ -48,10 +48,10 @@ def test_definition_concept_type(self): chunk = {"chunk_type": "DEFINITION"} assert get_chunk_specificity(chunk) == 4 - def test_unknown_type_returns_negative(self): - """Test unknown type returns -1.""" + def test_unknown_type_returns_zero(self): + """Test unknown type returns 0 (lowest specificity).""" chunk = {"chunk_type": "unknown_type"} - assert get_chunk_specificity(chunk) == -1 + assert get_chunk_specificity(chunk) == 0 def test_concept_key_fallback(self): """Test fallback to 'concept' key.""" diff --git a/tests/test_termination.py b/tests/test_termination.py index 93f38903..7e3c204b 100644 --- a/tests/test_termination.py +++ b/tests/test_termination.py @@ -14,9 +14,11 @@ def test_default_values(self): assert config.time_limit == 5.0 assert config.result_limit == 500 assert config.min_candidates_for_expansion == 5 - assert config.score_degradation_threshold == 0.15 + assert config.fixed_degradation_threshold == 0.15 assert config.min_relevance_score == 0.3 assert config.top_n_to_track == 5 + assert config.use_page_hinkley is True + assert config.page_hinkley_threshold == 0.5 def test_custom_values(self): """Test custom configuration values.""" @@ -91,12 +93,15 @@ def test_insufficient_candidates_termination(self): assert reason == "insufficient_candidates" def test_score_degradation_termination(self): - """Test termination on score degradation.""" + """Test termination on score degradation via Page-Hinkley.""" config = TerminationConfig( - score_degradation_threshold=0.1, + fixed_degradation_threshold=0.1, top_n_to_track=3, min_candidates_for_expansion=1, min_relevance_score=0.0, + use_page_hinkley=True, + page_hinkley_threshold=0.3, + min_iterations_before_stop=2, ) checker = TerminationChecker(config) @@ -109,16 +114,25 @@ def test_score_degradation_termination(self): should_terminate, reason = checker.check(results1) assert should_terminate is 
False - # Second iteration - scores dropped significantly + # Second iteration - scores start dropping results2 = [ - {"chunk_id": "a", "score": 0.7}, # Dropped 0.2 + {"chunk_id": "a", "score": 0.7}, {"chunk_id": "b", "score": 0.6}, {"chunk_id": "c", "score": 0.5}, ] should_terminate, reason = checker.check(results2) + assert should_terminate is False + + # Third iteration - continued drop triggers Page-Hinkley + results3 = [ + {"chunk_id": "a", "score": 0.4}, + {"chunk_id": "b", "score": 0.3}, + {"chunk_id": "c", "score": 0.2}, + ] + should_terminate, reason = checker.check(results3) assert should_terminate is True - assert reason == "score_degradation" + assert reason in ("score_drift_detected", "score_degradation") def test_min_relevance_termination(self): """Test termination when min relevance score is too low.""" From 1e580476d4d6f32fdf5427a493fd319bc3ad1c8a Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 10:43:50 -0500 Subject: [PATCH 13/29] Refactor graph backend selection and fallback logic Updated _get_graph_backend to support both Neo4j and Qdrant backends through a unified interface, defaulting to Qdrant when Neo4j is not enabled. Simplified fallback logic in _symbol_graph_impl to handle empty results or backend failures, ensuring callees and callers queries use appropriate legacy array field lookups when necessary. --- scripts/mcp_impl/symbol_graph.py | 36 ++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/scripts/mcp_impl/symbol_graph.py b/scripts/mcp_impl/symbol_graph.py index 8ad38a77..82436e06 100644 --- a/scripts/mcp_impl/symbol_graph.py +++ b/scripts/mcp_impl/symbol_graph.py @@ -91,11 +91,19 @@ def clear_graph_collection_cache() -> None: def _get_graph_backend(): - """Return Neo4j graph backend when enabled, otherwise None.""" + """Return graph backend (Neo4j or Qdrant). 
+ + Both backends are now supported through the unified GraphBackend interface: + - NEO4J_GRAPH=1: Uses Neo4j backend (takes precedence) + - Otherwise: Uses QdrantGraphBackend (default, always on) + + Returns None only on error, never for Qdrant-as-default case. + """ try: from scripts.graph_backends import get_graph_backend backend = get_graph_backend() - if backend.backend_type == "neo4j": + # Return any valid backend (neo4j or qdrant) + if backend is not None: return backend except Exception as e: logger.debug(f"Suppressed exception: {e} - graph backend lookup") @@ -1371,18 +1379,10 @@ async def graph_query_fn(**kwargs): results = [] used_graph = True - # Fallback for callees: use _query_callees which can use metadata.calls array - if query_type == "callees" and not results and not used_graph and not graph_backend: - results = await _query_callees( - client=client, - collection=coll, - symbol=symbol, - limit=limit, - language=language, - repo=repo, - ) # Fall back to legacy array field query if graph is unavailable or we opted to fallback on empty. - elif not results and not used_graph: + # Both Qdrant and Neo4j backends are now supported, so graph_backend should always be set. + # This fallback is for when graph returns empty or when graph backend fails to initialize. 
+ if not results and not used_graph: if query_type == "callers": # Find chunks where metadata.calls array contains the symbol (exact match) results = await _query_array_field( @@ -1407,6 +1407,16 @@ async def graph_query_fn(**kwargs): under=_norm_under(under), repo=repo, ) + elif query_type == "callees": + # Find callees using metadata.calls array lookup + results = await _query_callees( + client=client, + collection=coll, + symbol=symbol, + limit=limit, + language=language, + repo=repo, + ) elif query_type == "definition": results = await _query_definition( client=client, From 2df5afa223bf5804426d8468d96d4f8fb493fa5a Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 10:47:52 -0500 Subject: [PATCH 14/29] Add xxhash to project dependencies Included the xxhash library (version 3.0.0 or higher) in the main dependencies to support fast non-cryptographic hashing. --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2b57fdf3..abef2155 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dependencies = [ "rich>=13.0.0", "typer>=0.9.0", "requests>=2.28.0", + "xxhash>=3.0.0", ] [project.optional-dependencies] From 76902178159e43f0c8f297bad78ded401da6c7de Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 10:48:50 -0500 Subject: [PATCH 15/29] Fix min_results calculation for zero limit in hybrid search Adjusts the min_results parameter to be zero when the limit is zero, preventing unintended behavior when no results are requested. 
--- scripts/hybrid_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/hybrid_search.py b/scripts/hybrid_search.py index d9ff3c53..8e88253e 100644 --- a/scripts/hybrid_search.py +++ b/scripts/hybrid_search.py @@ -3028,7 +3028,7 @@ def _resolve(seg: str) -> list[str]: items, score_key="rerank_score", fallback_score_key="score", - min_results=max(1, limit // 2), # Keep at least half the requested limit + min_results=max(1, limit // 2) if limit > 0 else 0, # Keep at least half the requested limit ) if os.environ.get("DEBUG_HYBRID_SEARCH"): logger.debug( From e4143a54e0722bab8c9d570c637f5bb4a97ec910 Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 10:51:52 -0500 Subject: [PATCH 16/29] Lazy load elbow detection to avoid hard numpy dependency Refactors hybrid_search.py to lazily import filter_by_elbow only when elbow filtering is enabled, preventing unnecessary numpy dependency when the feature is disabled. Also updates the postinstall script in package.json to use a Node.js-based chmod for better cross-platform compatibility. 
--- ctx-mcp-bridge/package.json | 2 +- scripts/hybrid_search.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ctx-mcp-bridge/package.json b/ctx-mcp-bridge/package.json index a1c4f2fc..2bdbe8f6 100644 --- a/ctx-mcp-bridge/package.json +++ b/ctx-mcp-bridge/package.json @@ -9,7 +9,7 @@ "type": "module", "scripts": { "start": "node bin/ctxce.js", - "postinstall": "chmod +x bin/ctxce.js 2>/dev/null || true" + "postinstall": "node -e \"try{require('fs').chmodSync('bin/ctxce.js',0o755)}catch(e){}\"" }, "dependencies": { "@modelcontextprotocol/sdk": "^1.24.3", diff --git a/scripts/hybrid_search.py b/scripts/hybrid_search.py index 8e88253e..89f3a7b9 100644 --- a/scripts/hybrid_search.py +++ b/scripts/hybrid_search.py @@ -248,7 +248,16 @@ # --------------------------------------------------------------------------- # Elbow detection for adaptive filtering # --------------------------------------------------------------------------- -from scripts.hybrid.elbow_detection import filter_by_elbow +# Lazy import to avoid hard numpy dependency when feature is disabled +_filter_by_elbow = None + +def _get_filter_by_elbow(): + """Lazy load filter_by_elbow to avoid numpy import when disabled.""" + global _filter_by_elbow + if _filter_by_elbow is None: + from scripts.hybrid.elbow_detection import filter_by_elbow + _filter_by_elbow = filter_by_elbow + return _filter_by_elbow # Environment variable for elbow filtering (opt-in) ELBOW_FILTER_ENABLED = _env_truthy(os.environ.get("HYBRID_ELBOW_FILTER"), False) @@ -3024,7 +3033,7 @@ def _resolve(seg: str) -> list[str]: if ELBOW_FILTER_ENABLED and items: original_count = len(items) # Use rerank_score if available, otherwise use score - items = filter_by_elbow( + items = _get_filter_by_elbow()( items, score_key="rerank_score", fallback_score_key="score", From 9dd2c0d2e0ade5f9a4fdc8d88e3aa5f27cd909af Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 10:57:02 -0500 Subject: [PATCH 17/29] Enable 
and document deferred pseudo-tag generation Set PSEUDO_DEFER_TO_WORKER=1 by default in Kubernetes and Docker Compose to defer LLM-based pseudo-tag generation to a background worker, improving initial indexing speed. Updated documentation to explain the new default, how the deferred worker operates, and its benefits for production deployments. --- deploy/kubernetes/configmap.yaml | 1 + docker-compose.yml | 4 ++++ docs/CONFIGURATION.md | 18 ++++++++++++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/deploy/kubernetes/configmap.yaml b/deploy/kubernetes/configmap.yaml index 26c9e637..caf3e4c3 100644 --- a/deploy/kubernetes/configmap.yaml +++ b/deploy/kubernetes/configmap.yaml @@ -151,3 +151,4 @@ data: USE_GPU_DECODER: '0' USE_TREE_SITTER: '1' WATCH_DEBOUNCE_SECS: '4' + PSEUDO_DEFER_TO_WORKER: '1' diff --git a/docker-compose.yml b/docker-compose.yml index e15f099b..4e5b4a63 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -455,6 +455,8 @@ services: - PATTERN_VECTORS=${PATTERN_VECTORS:-} # Graph edges for symbol relationships (always on) - INDEX_GRAPH_EDGES_MODE=${INDEX_GRAPH_EDGES_MODE:-symbol} + # Defer pseudo-tag generation to watcher worker for faster initial indexing + - PSEUDO_DEFER_TO_WORKER=${PSEUDO_DEFER_TO_WORKER:-1} volumes: - workspace_pvc:/work:rw - codebase_pvc:/work/.codebase:rw @@ -518,6 +520,8 @@ services: - GRAPH_BACKFILL_ENABLED=${GRAPH_BACKFILL_ENABLED:-1} # Neo4j graph backend (optional - takes precedence over Qdrant flat graph) - NEO4J_GRAPH=${NEO4J_GRAPH:-} + # Defer pseudo-tag generation - watcher runs backfill worker thread + - PSEUDO_DEFER_TO_WORKER=${PSEUDO_DEFER_TO_WORKER:-1} volumes: - workspace_pvc:/work:rw - codebase_pvc:/work/.codebase:rw diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index fe1f1575..cbe83710 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -377,12 +377,26 @@ REFRAG_RUNTIME=glm # or openai, minimax, llamacpp ### Pseudo Backfill Worker -Deferred pseudo/tag 
generation runs asynchronously after initial indexing. +Deferred pseudo/tag generation runs asynchronously after initial indexing. This significantly speeds up initial indexing by skipping LLM-based pseudo-tag generation during the indexer run, deferring it to a background worker thread in the watcher service. | Name | Description | Default | |------|-------------|---------| | PSEUDO_BACKFILL_ENABLED | Enable async pseudo/tag backfill worker | 0 (disabled) | -| PSEUDO_DEFER_TO_WORKER | Skip inline pseudo, defer to backfill worker | 0 (disabled) | +| PSEUDO_DEFER_TO_WORKER | Skip inline pseudo, defer to backfill worker | 1 (enabled) | +| GRAPH_BACKFILL_ENABLED | Enable graph edge backfill in watcher worker | 1 (enabled) | + +**How it works:** +1. When `PSEUDO_DEFER_TO_WORKER=1`, the indexer generates only base chunks (no pseudo-tags) +2. The watcher service starts a `_start_pseudo_backfill_worker` daemon thread +3. This thread periodically calls `pseudo_backfill_tick()` to enrich chunks with LLM-generated tags +4. If `GRAPH_BACKFILL_ENABLED=1`, it also calls `graph_backfill_tick()` to populate symbol graph edges + +**Benefits:** +- Initial indexing is 2-5x faster (no LLM calls blocking indexer) +- Background enrichment happens continuously without blocking searches +- Failed LLM calls don't break indexing; worker retries automatically + +**Recommended for production:** Enable both for fastest initial indexing with eventual enrichment. ### Adaptive Span Sizing From d6b15440f18ade3c2a62dc61f789d3406250aa62 Mon Sep 17 00:00:00 2001 From: John Donalson Date: Sat, 24 Jan 2026 11:23:15 -0500 Subject: [PATCH 18/29] Add Cursor MCP config support to VSCode extension Introduces support for writing MCP config for Cursor (mcp.json) in the VSCode extension. Adds new settings, commands, and UI elements for Cursor integration, updates the dashboard and settings webviews to use a logo image, and bumps package versions. 
Also changes a warning print to logger.debug in the Qdrant ingest script. --- ctx-mcp-bridge/package.json | 2 +- scripts/ingest/qdrant.py | 4 +- .../context-engine-uploader/assets/logo.jpeg | Bin 0 -> 3793 bytes .../context-engine-uploader/commands.js | 8 ++ .../context-engine-uploader/dashboard.js | 16 ++-- .../context-engine-uploader/mcp_config.js | 70 +++++++++++++++++- .../context-engine-uploader/package.json | 21 +++++- .../settings-webview.js | 15 ++-- .../context-engine-uploader/sidebar.js | 20 ++++- 9 files changed, 127 insertions(+), 29 deletions(-) create mode 100644 vscode-extension/context-engine-uploader/assets/logo.jpeg diff --git a/ctx-mcp-bridge/package.json b/ctx-mcp-bridge/package.json index 2bdbe8f6..6c4e93cd 100644 --- a/ctx-mcp-bridge/package.json +++ b/ctx-mcp-bridge/package.json @@ -1,6 +1,6 @@ { "name": "@context-engine-bridge/context-engine-mcp-bridge", - "version": "0.0.15", + "version": "0.0.16", "description": "Context Engine MCP bridge (http/stdio proxy combining indexer + memory servers)", "bin": { "ctxce": "bin/ctxce.js", diff --git a/scripts/ingest/qdrant.py b/scripts/ingest/qdrant.py index f3267082..7711988e 100644 --- a/scripts/ingest/qdrant.py +++ b/scripts/ingest/qdrant.py @@ -466,8 +466,8 @@ def ensure_collection( ) print(f"[COLLECTION_SUCCESS] Successfully updated collection {name} with missing vectors") except Exception as update_e: - print( - f"[COLLECTION_WARNING] Cannot add missing vectors to {name} ({update_e}). " + logger.debug( + f"Cannot add missing vectors to {name} ({update_e}). " "Continuing without them for this run." 
) except Exception as e: diff --git a/vscode-extension/context-engine-uploader/assets/logo.jpeg b/vscode-extension/context-engine-uploader/assets/logo.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..28ecace889777331b46627eff323b61bb526e6f8 GIT binary patch literal 3793 zcmbVOc{r4P_rGWCW3PK;AG@(t#vn@!H8CX3lzoZOLsZs`eOH)dqNqob29Y&uNJTP) zl(FYY55~S^iA0)r^!%>(djI^r@42pXU*B_|``q9AbI$pm`@Z+(8)kLGw2xYzF$54ic zwa_RC4-XF?FP|tszbHypQWo`J3$q0f<_0=}PBw@f02PL?2}77~Ayfc>a6o9iW_E6VVQqcm+ve8y?H@Z3fbAbRtm{93{sWIN3lEf?osFGq z9}ff?!csP2b`Duh&I2ZRF86?ga;P|N&@}yVRV$CY7GV`m4D5r6D4-`4*Y+X(M)dy! z6#qXV`Zu6|<6(XR_}Cz;xpN7Vmd)T~c>bgacK&QVjail}6(?8q`?+ zsES-n!)+RE8c@xzOPx>=RXt?_?_Cq0yQL>60or~gqeH)yZeBVT@9$u@TM&#WE>oRL zy_?y`Z?A#{ZMd^}EgzKa<&HZ93= zS#MNCblCPKc>9Aq0u1GW0LtQzAz0mPH?i?rBaKd$Dpp#h$SjI&Yz<-?O5Re<}2 zR5F01X2l)^Yw;Kk@GKH&zn06B&@(K3rthuwq`xt(v1`SU$FKg=3WL%-0ombuH0v-( zN>DX;7*LY^$HIb$O5jV;XFu;#<}FLCx2~$LxcL*5Ik{$>KV z0*c2qvcEV9Z%>Z((1|$$#JfRDW=x=@L44CH`pdFDoDup?_#VP$cE>Jr_>9zUy;kn@ zXXP_dcKEbG3D2c0Ka5(FHP<{W7JZl@VBZ-SqdIZzA@*DODYZhE*3dsTxr!_14Lu4_ z@vb&TH?-Q}jbB$1uMR9;sqMDEl9wWsXg+w}_c&0t7h?MC6&aqZSP;Sw5>p94VLL1BWMn=F z5GFISV)^h`(LCr&-8Z+xq@|uu{i&<>6Zh+WC8|Sv@zXVgXTOHqoJY#}y#RY6;stS{ z@z2T8w?@)AUYw@I#)eEl=CWYRJqcgm<&@n{Nx{dhzlcR^pJ{Tn^<1Us#HeG4o+0Jy z%15`!BVJIe&KRMX3RUlp@24fcQ*c?D>PN)p7S8|Dp6b1!d(r&{!34f^zfxn3|Mh6y zZf8pi$`7%6IHp`(Yw&ST56wFO3m@_fzn3(d76t2}wbq|39y!h}D%Z#aq?}LE=%e;x zhYq)pxI`-m<0~^C@Aw6%pWGZ_c;<(MN#&mMA~9+t@7SyAJkmVlXzz=plK;B>``k5v zFh_P1EJzS#B?zd+JL(%zjA}^k$%!^%VcebgU+4$*WuC$Cd68Uxdig+D@pl3Jnku$+ zc%@&F!>6gGt!Ff|V2t3bebd^UoxI@_DWmnNJ*$HCm=|K&PyHq|T%(_txLebOW?+co9sfa_aye~=Eb9duTx=~z8HBXYdD((Nf{`7t=5tu$uQd3uhG8#xPWvk#uApNpSwYLmi#>fq`N`!+EOUHUD^K`N#J5S^^Vz70U$ldmp^n zG2FY7XFn(8|p{7UL?DmYAmr+@e6@1)zTcEF1XPEFdN|F;nZ+vLiHvGx# z11c#ohvzIK+9n;VnoXm--zi<3Rtt8=% zzM+ig=R%`uJ>za@ts8wwBl+lPOR7OKEk>VaahK~%AO*edwM|u^48%Cj zl^11%K7V&rfor!K{$b9_8XNSksQL(jcO>hw*rY$K`HDbvCWEa!TsNhzvrL^)m(u29 zaUFwb5e#y@#;I1sFLK8-h}egP(=qr{$aXeaDp?i7I&N5 
zE*H|4>&G4cifemp8l7Z3R269;_-K+>frhygGX3FOMX|+dBfn3$^vyzS!TdZdZbpc& za&LwSz@7(%_<2e1rc9++n=Wzs_-w=+a@FpAp%76c3CK~-pkgjeJEz}#s$m%}U-6(l z+)#!I6ow_y(Mr`7h9OXflSAihReC%VP#+e?UMmrjeg6>NaZ4K)D7Wq|RsY0#F}wGk z%b@SEj9-x@D!)M(}>fx%_A69eia}Al2(ohcu8A&OL%)2JtK)i53ER@f zZ=OX8`^_w(PKTcm{`mpXtvwPNk1)o(wBEVsv_89Idn8WdjrVt5sd3EfSj1j<*MT~v zbB+()WuVq3uEr61O|Kdnj$cfFu%W%sRPe=%UNPOP^X6?-Kzo>!t(0%O4k%DdATMf( zkdXxtEC};pE%KoE!1%0KwaKjCO`Ub0hy1OQGt)>a`N-K7iN)*Wr~?&qrDTv zoN3$~f(cYQ*xtN*g?w{h(a@5wKQww-_FYuo-@!6M*7L5;$-NUcIHEtWU=?xPv_7l* zNOt{dit#HUuZ;>Hn=p9Uxm&amGj+$P<{-(gs^XxpIc zTiVr;5hA_nh?wRCL{|e@P1k~dylv*v$e^KRu}>_mS1xul%?PrSyGg+mmf|f>Dy=B8 zc(pKn-c|EORf=0btB+C9oFdQ zTIl{4IQg6!O-BDEYOMlmSnZS0xD-n)$c-#yk#=NZJDUuZtOD#)1Qu(^gR+!^wO|=4 zk6|j_To#V3ko_<+z@Z^inCH%F7+ literal 0 HcmV?d00001 diff --git a/vscode-extension/context-engine-uploader/commands.js b/vscode-extension/context-engine-uploader/commands.js index 29f97859..197a0658 100644 --- a/vscode-extension/context-engine-uploader/commands.js +++ b/vscode-extension/context-engine-uploader/commands.js @@ -220,6 +220,14 @@ function registerExtensionCommands(deps) { } })); + disposables.push(vscode.commands.registerCommand('contextEngineUploader.writeMcpConfigCursor', () => { + try { + requireDep(writeMcpConfig, 'writeMcpConfig')({ targets: ['cursor'] }).catch(error => handleCatch(error, 'Failed to write Cursor MCP config')); + } catch (error) { + handleCatch(error, 'Failed to write Cursor MCP config'); + } + })); + // Onboarding/Stack commands disposables.push(vscode.commands.registerCommand('contextEngineUploader.cloneAndStartStack', async () => { try { diff --git a/vscode-extension/context-engine-uploader/dashboard.js b/vscode-extension/context-engine-uploader/dashboard.js index 69c6bc2c..3de35cb7 100644 --- a/vscode-extension/context-engine-uploader/dashboard.js +++ b/vscode-extension/context-engine-uploader/dashboard.js @@ -95,13 +95,14 @@ class DashboardViewProvider { _getHtmlContent(webview) { const state = this._getState(); const 
nonce = getNonce(); + const logoUri = webview.asWebviewUri(vscode.Uri.joinPath(this._extensionUri, 'assets', 'logo.jpeg')); return ` - + Context Engine Dashboard @@ -194,7 +196,7 @@ class SettingsWebviewProvider {