diff --git a/ERROR b/ERROR
new file mode 100644
index 0000000..e69de29
diff --git a/docker-compose.yml b/docker-compose.yml
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,8 +6,8 @@ services:
       NEO4J_dbms_security_procedures_unrestricted: apoc.*
       NEO4J_dbms_security_procedures_allowlist: apoc.*
     ports:
-      - "7474:7474"
-      - "7687:7687"
+      - "7475:7474" # Host 7475 maps to Container 7474
+      - "7688:7687" # Host 7688 maps to Container 7687
     volumes:
       - neo4j_data:/data
       - neo4j_logs:/logs
@@ -49,4 +49,4 @@ volumes:
   neo4j_data:
   neo4j_logs:
   chroma_data:
-  smp_data:
\ No newline at end of file
+  smp_data:
diff --git a/pyproject.toml b/pyproject.toml
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
     "tree-sitter-typescript>=0.23",
     "python-dotenv>=1.0",
     "structlog>=24.0",
+    "chromadb",
 ]
 
 [project.optional-dependencies]
diff --git a/smp/engine/embedding.py b/smp/engine/embedding.py
new file mode 100644
index 0000000..4d10940
--- /dev/null
+++ b/smp/engine/embedding.py
@@ -0,0 +1,124 @@
+"""Embedding service using NVIDIA NIM or OpenAI."""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+import httpx
+
+from smp.logging import get_logger
+
+log = get_logger(__name__)
+
+
+class EmbeddingService:
+    """Generate embeddings via NVIDIA NIM or OpenAI."""
+
+    def __init__(
+        self,
+        provider: str = "nvidia",
+        api_key: str | None = None,
+        model: str | None = None,
+        base_url: str | None = None,
+        dimension: int = 768,
+    ) -> None:
+        self._provider = provider
+        self._api_key = api_key or os.environ.get("NVIDIA_NIM_API_KEY") or os.environ.get("OPENAI_API_KEY", "")
+        self._model = model or os.environ.get("EMBEDDING_MODEL", "nvidia/nv-embed-qa-4")
+        self._base_url = base_url or os.environ.get(
+            "EMBEDDING_BASE_URL", "https://integrate.api.nvidia.com/v1"
+        )
+        self._dimension = dimension
+        self._client: httpx.AsyncClient | None = None
+
+    async def connect(self) -> None:
+        self._client = httpx.AsyncClient(
+            base_url=self._base_url,
+            headers={"Authorization": f"Bearer {self._api_key}"},
+            timeout=60.0,
+        )
+        log.info("embedding_service_connected", provider=self._provider, model=self._model)
+
+    async def close(self) -> None:
+        if self._client:
+            await self._client.aclose()
+            self._client = None
+
+    @property
+    def dimension(self) -> int:
+        return self._dimension
+
+    async def embed(self, text: str) -> list[float]:
+        """Generate embedding for a single text."""
+        if self._client is None:
+            raise RuntimeError("EmbeddingService not connected")
+
+        if self._provider == "nvidia":
+            return await self._embed_nvidia(text)
+        elif self._provider == "openai":
+            return await self._embed_openai(text)
+        else:
+            raise ValueError(f"Unknown provider: {self._provider}")
+
+    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
+        """Generate embeddings for multiple texts."""
+        if self._client is None:
+            raise RuntimeError("EmbeddingService not connected")
+
+        if self._provider == "nvidia":
+            return await self._embed_batch_nvidia(texts)
+        elif self._provider == "openai":
+            return await self._embed_batch_openai(texts)
+        else:
+            raise ValueError(f"Unknown provider: {self._provider}")
+
+    async def _embed_nvidia(self, text: str) -> list[float]:
+        payload = {
+            "input": text,
+            "model": self._model,
+        }
+        response = await self._client.post("/embeddings", json=payload)
+        response.raise_for_status()
+        data = response.json()
+        return data["data"][0]["embedding"]
+
+    async def _embed_batch_nvidia(self, texts: list[str]) -> list[list[float]]:
+        payload = {
+            "input": texts,
+            "model": self._model,
+        }
+        response = await self._client.post("/embeddings", json=payload)
+        response.raise_for_status()
+        data = response.json()
+        return [item["embedding"] for item in data["data"]]
+
+    async def _embed_openai(self, text: str) -> list[float]:
+        payload = {
+            "input": text,
+            "model": self._model,
+        }
+        response = await self._client.post("/embeddings", json=payload)
+        response.raise_for_status()
+        data = response.json()
+        return data["data"][0]["embedding"]
+
+    async def _embed_batch_openai(self, texts: list[str]) -> list[list[float]]:
+        payload = {
+            "input": texts,
+            "model": self._model,
+        }
+        response = await self._client.post("/embeddings", json=payload)
+        response.raise_for_status()
+        data = response.json()
+        return [item["embedding"] for item in data["data"]]
+
+
+def create_embedding_service() -> EmbeddingService:
+    """Create embedding service from environment variables."""
+    provider = os.getenv("EMBEDDING_PROVIDER", "nvidia")
+    api_key = os.getenv("NVIDIA_NIM_API_KEY") or os.getenv("OPENAI_API_KEY")
+    model = os.getenv("EMBEDDING_MODEL")
+    base_url = os.getenv("EMBEDDING_BASE_URL")
+    dimension = int(os.getenv("EMBEDDING_DIMENSION", "768"))
+    return EmbeddingService(provider=provider, api_key=api_key, model=model, base_url=base_url, dimension=dimension)
\ No newline at end of file
diff --git a/smp/engine/enricher.py b/smp/engine/enricher.py
--- a/smp/engine/enricher.py
+++ b/smp/engine/enricher.py
@@ -1,19 +1,18 @@
-"""Static semantic enricher — AST-based extraction.
-
-Extracts docstrings, inline comments, decorators, type annotations,
-and computes source hashes purely from the AST. No LLM or embedding
-generation.
-"""
+"""Static semantic enricher with optional LLM-based embedding."""
 
 from __future__ import annotations
 
 import hashlib
 from datetime import UTC, datetime
+from typing import TYPE_CHECKING
 
 from smp.core.models import GraphNode
 from smp.engine.interfaces import SemanticEnricher as SemanticEnricherInterface
 from smp.logging import get_logger
 
+if TYPE_CHECKING:
+    from smp.engine.embedding import EmbeddingService
+
 log = get_logger(__name__)
 
 
@@ -24,15 +23,19 @@ def _compute_source_hash(name: str, file_path: str, start: int, end: int, signat
 
 
 class StaticSemanticEnricher(SemanticEnricherInterface):
-    """Static AST-based semantic enricher. No LLM, no embeddings."""
+    """Static AST-based semantic enricher with optional embedding support."""
 
-    def __init__(self) -> None:
+    def __init__(self, embedding_service: EmbeddingService | None = None) -> None:
         self._enrichment_counts: dict[str, int] = {
             "enriched": 0,
             "skipped": 0,
             "no_metadata": 0,
             "failed": 0,
         }
+        self._embedding_service = embedding_service
+
+    def set_embedding_service(self, service: EmbeddingService) -> None:
+        self._embedding_service = service
 
     async def enrich_node(
         self,
@@ -83,9 +86,22 @@ async def enrich_batch(
             enriched.append(result)
         return enriched
 
+    @property
+    def has_llm(self) -> bool:
+        """Check if LLM-based embedding is available."""
+        return self._embedding_service is not None
+
     async def embed(self, text: str) -> list[float]:
-        """No-op embedding — static enricher does not use vectors."""
-        return []
+        """Generate embedding using the embedding service if available."""
+        if self._embedding_service is None:
+            return []
+        return await self._embedding_service.embed(text)
+
+    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
+        """Generate embeddings for multiple texts."""
+        if self._embedding_service is None:
+            return [[] for _ in texts]
+        return await self._embedding_service.embed_batch(texts)
 
     def get_counts(self) -> dict[str, int]:
         """Return enrichment statistics."""
diff --git a/smp/engine/interfaces.py b/smp/engine/interfaces.py
--- a/smp/engine/interfaces.py
+++ b/smp/engine/interfaces.py
@@ -55,6 +55,10 @@ async def enrich_batch(self, nodes: list[GraphNode], force: bool = False) -> lis
     async def embed(self, text: str) -> list[float]:
         """No-op for static enricher."""
 
+    @abc.abstractmethod
+    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
+        """Generate embeddings for multiple texts."""
+
 
 class QueryEngine(abc.ABC):
     """High-level query interface over the memory store."""
diff --git a/smp/engine/seed_walk.py b/smp/engine/seed_walk.py
--- a/smp/engine/seed_walk.py
+++ b/smp/engine/seed_walk.py
@@ -99,6 +99,7 @@ def __init__(
         beta: float = BETA,
         gamma: float = GAMMA,
         route_threshold: float = ROUTE_CONFIDENCE_THRESHOLD,
+        delegate: QueryEngineInterface | None = None,
     ) -> None:
         self._graph = graph_store
         self._vector = vector_store
@@ -107,6 +108,7 @@ def __init__(
         self._beta = beta
         self._gamma = gamma
         self._route_threshold = route_threshold
+        self._delegate = delegate
 
     async def _route_to_community(self, query: str) -> tuple[str | None, float]:
         if self._vector is None:
@@ -373,22 +375,32 @@ async def locate(
         return [msgspec.structs.asdict(result)]
 
     async def navigate(self, query: str, include_relationships: bool = True) -> dict[str, Any]:
+        if self._delegate:
+            return await self._delegate.navigate(query, include_relationships)
         return {}
 
     async def trace(
         self, start: str, relationship: str = "CALLS", depth: int = 3, direction: str = "outgoing"
     ) -> list[dict[str, Any]]:
+        if self._delegate:
+            return await self._delegate.trace(start, relationship, depth, direction)
         return []
 
     async def get_context(self, file_path: str, scope: str = "edit", depth: int = 2) -> dict[str, Any]:
+        if self._delegate:
+            return await self._delegate.get_context(file_path, scope, depth)
         return {}
 
     async def assess_impact(self, entity: str, change_type: str = "delete") -> dict[str, Any]:
+        if self._delegate:
+            return await self._delegate.assess_impact(entity, change_type)
         return {}
 
     async def search(
         self, query: str, match: str = "any", filters: dict[str, Any] | None = None, top_k: int = 5
     ) -> dict[str, Any]:
+        if self._delegate:
+            return await self._delegate.search(query, match, filters, top_k)
         return {}
 
     async def conflict(
@@ -397,6 +409,8 @@ async def conflict(
         proposed_change: str = "",
         context: dict[str, Any] | None = None,
     ) -> dict[str, Any]:
+        if self._delegate:
+            return await self._delegate.conflict(entity, proposed_change, context)
         return {"conflicts": []}
 
     async def diff(
@@ -405,6 +419,8 @@ async def diff(
         to_snapshot: str,
         scope: str = "full",
     ) -> dict[str, Any]:
+        if self._delegate:
+            return await self._delegate.diff(from_snapshot, to_snapshot, scope)
         return {"diff": {}}
 
     async def plan(
@@ -414,6 +430,8 @@ async def plan(
         change_type: str = "refactor",
         scope: str = "full",
     ) -> dict[str, Any]:
+        if self._delegate:
+            return await self._delegate.plan(change_description, target_file, change_type, scope)
         return {"steps": []}
 
     async def why(
@@ -422,9 +440,13 @@ async def why(
         relationship: str = "",
         depth: int = 3,
     ) -> dict[str, Any]:
+        if self._delegate:
+            return await self._delegate.why(entity, relationship, depth)
         return {"reasoning": []}
 
     async def find_flow(self, start: str, end: str, flow_type: str = "data") -> dict[str, Any]:
+        if self._delegate:
+            return await self._delegate.find_flow(start, end, flow_type)
         return {}
 
 
diff --git a/smp/protocol/server.py b/smp/protocol/server.py
index 40dd426..2e382a8 100644
--- a/smp/protocol/server.py
+++ b/smp/protocol/server.py
@@ -6,8 +6,10 @@
 from __future__ import annotations
 
 try:
-    import pysqlite3
     import sys
+
+    import pysqlite3
+
     sys.modules["sqlite3"] = pysqlite3
 except ImportError:
     pass
@@ -19,16 +21,18 @@
 from fastapi import FastAPI, Request
 from fastapi.responses import Response
 
+from smp.core.merkle import MerkleIndex, MerkleTree
+from smp.engine.community import CommunityDetector
+from smp.engine.embedding import create_embedding_service
 from smp.engine.enricher import StaticSemanticEnricher
 from smp.engine.graph_builder import DefaultGraphBuilder
 from smp.engine.seed_walk import SeedWalkEngine
-from smp.engine.community import CommunityDetector
-from smp.core.merkle import MerkleIndex
+from smp.engine.query import DefaultQueryEngine
 from smp.logging import get_logger
 from smp.parser.registry import ParserRegistry
 from smp.protocol.dispatcher import handle_rpc
-from smp.store.graph.neo4j_store import Neo4jGraphStore
 from smp.store.chroma_store import ChromaVectorStore
+from smp.store.graph.neo4j_store import Neo4jGraphStore
 
 log = get_logger(__name__)
 
@@ -53,12 +57,16 @@ async def lifespan(app: FastAPI):  # type: ignore[no-untyped-def]  # noqa: ANN20
     vector = ChromaVectorStore()
     await vector.connect()
 
-    enricher = StaticSemanticEnricher()
+    embedding_service = create_embedding_service()
+    await embedding_service.connect()
+
+    enricher = StaticSemanticEnricher(embedding_service=embedding_service)
     community_detector = CommunityDetector(graph_store=graph, vector_store=vector)
-    engine = SeedWalkEngine(graph_store=graph, vector_store=vector, enricher=enricher)
+    default_engine = DefaultQueryEngine(graph_store=graph, enricher=enricher)
+    engine = SeedWalkEngine(graph_store=graph, vector_store=vector, enricher=enricher, delegate=default_engine)
     builder = DefaultGraphBuilder(graph)
     registry = ParserRegistry()
-    merkle_index = MerkleIndex()
+    merkle_index = MerkleIndex(MerkleTree())
 
     safety: dict[str, Any] | None = None
     if safety_enabled: