Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added ERROR
Empty file.
11 changes: 10 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,13 @@ services:
NEO4J_dbms_security_procedures_unrestricted: apoc.*
NEO4J_dbms_security_procedures_allowlist: apoc.*
ports:
      - "7475:7474" # Host 7475 maps to Container 7474
      - "7688:7687" # Host 7688 maps to Container 7687
Comment on lines +9 to +15
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

Merge conflict markers have been committed to the repository. This results in invalid YAML syntax and will cause the docker-compose command to fail. Please resolve the conflicts and remove the markers.

volumes:
- neo4j_data:/data
- neo4j_logs:/logs
Expand Down Expand Up @@ -49,4 +54,8 @@ volumes:
neo4j_data:
neo4j_logs:
chroma_data:
  smp_data:
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ dependencies = [
"tree-sitter-typescript>=0.23",
"python-dotenv>=1.0",
"structlog>=24.0",
    "chromadb",
Comment on lines +24 to +27
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

Merge conflict markers found in the dependencies section. This will prevent the project from being installed or built correctly. Please resolve the conflict.

]

[project.optional-dependencies]
Expand Down
124 changes: 124 additions & 0 deletions smp/engine/embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""Embedding service using NVIDIA NIM or OpenAI."""

from __future__ import annotations

import os
from typing import Any

import httpx

from smp.logging import get_logger

log = get_logger(__name__)


class EmbeddingService:
    """Generate text embeddings via an OpenAI-compatible ``/embeddings`` endpoint.

    Supports two providers, ``"nvidia"`` (NIM) and ``"openai"``. Both expose
    the same request/response schema, so a single HTTP helper serves both.
    Call :meth:`connect` before embedding and :meth:`close` when done.
    """

    def __init__(
        self,
        provider: str = "nvidia",
        api_key: str | None = None,
        model: str | None = None,
        base_url: str | None = None,
        dimension: int = 768,
    ) -> None:
        """Configure the service.

        Args:
            provider: ``"nvidia"`` or ``"openai"``.
            api_key: Explicit API key. If falsy, read from the environment
                variable matching *provider* (``NVIDIA_NIM_API_KEY`` or
                ``OPENAI_API_KEY``).
            model: Embedding model name; falls back to ``EMBEDDING_MODEL``
                or a provider-specific default.
            base_url: API root; falls back to ``EMBEDDING_BASE_URL`` or the
                provider's public endpoint.
            dimension: Dimensionality reported by :attr:`dimension`.
        """
        self._provider = provider
        # Select defaults per provider instead of always preferring the
        # NVIDIA settings: an "openai" service must not pick up a NIM key
        # or the NIM base URL.
        if api_key:
            self._api_key = api_key
        else:
            env_key = "NVIDIA_NIM_API_KEY" if provider == "nvidia" else "OPENAI_API_KEY"
            self._api_key = os.environ.get(env_key, "")
        default_model = (
            "nvidia/nv-embed-qa-4" if provider == "nvidia" else "text-embedding-3-small"
        )
        self._model = model or os.environ.get("EMBEDDING_MODEL", default_model)
        default_url = (
            "https://integrate.api.nvidia.com/v1"
            if provider == "nvidia"
            else "https://api.openai.com/v1"
        )
        self._base_url = base_url or os.environ.get("EMBEDDING_BASE_URL", default_url)
        self._dimension = dimension
        # Lazily created in connect(); None means "not connected".
        self._client: httpx.AsyncClient | None = None

    async def connect(self) -> None:
        """Open the shared async HTTP client. Must precede embed()/embed_batch()."""
        self._client = httpx.AsyncClient(
            base_url=self._base_url,
            headers={"Authorization": f"Bearer {self._api_key}"},
            timeout=60.0,
        )
        log.info("embedding_service_connected", provider=self._provider, model=self._model)

    async def close(self) -> None:
        """Close the HTTP client if it is open (safe to call repeatedly)."""
        if self._client:
            await self._client.aclose()
            self._client = None

    @property
    def dimension(self) -> int:
        """Dimensionality of the embedding vectors this service is configured for."""
        return self._dimension

    async def embed(self, text: str) -> list[float]:
        """Generate an embedding for a single text.

        Raises:
            RuntimeError: If :meth:`connect` has not been called.
            ValueError: If the configured provider is unknown.
        """
        data = await self._fetch_embeddings(text)
        return data[0]["embedding"]

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for multiple texts in one request.

        Raises:
            RuntimeError: If :meth:`connect` has not been called.
            ValueError: If the configured provider is unknown.
        """
        data = await self._fetch_embeddings(texts)
        return [item["embedding"] for item in data]

    async def _fetch_embeddings(self, input_data: str | list[str]) -> list[dict[str, Any]]:
        """POST *input_data* to ``/embeddings`` and return the ``data`` list.

        Both providers speak the OpenAI embeddings schema, so the four
        previously duplicated per-provider methods collapse into this one
        helper.
        """
        if self._client is None:
            raise RuntimeError("EmbeddingService not connected")
        if self._provider not in ("nvidia", "openai"):
            raise ValueError(f"Unknown provider: {self._provider}")
        payload = {"input": input_data, "model": self._model}
        response = await self._client.post("/embeddings", json=payload)
        response.raise_for_status()
        return response.json()["data"]



def create_embedding_service() -> EmbeddingService:
    """Create an :class:`EmbeddingService` configured from environment variables.

    Reads ``EMBEDDING_PROVIDER`` (default ``"nvidia"``), ``EMBEDDING_MODEL``,
    ``EMBEDDING_BASE_URL`` and ``EMBEDDING_DIMENSION`` (default ``768``).
    """
    provider = os.getenv("EMBEDDING_PROVIDER", "nvidia")
    # Prefer the API key that matches the selected provider; fall back to
    # the other one so existing single-key setups keep working. The old code
    # always preferred NVIDIA_NIM_API_KEY, even for provider="openai".
    if provider == "openai":
        api_key = os.getenv("OPENAI_API_KEY") or os.getenv("NVIDIA_NIM_API_KEY")
    else:
        api_key = os.getenv("NVIDIA_NIM_API_KEY") or os.getenv("OPENAI_API_KEY")
    model = os.getenv("EMBEDDING_MODEL")
    base_url = os.getenv("EMBEDDING_BASE_URL")
    dimension = int(os.getenv("EMBEDDING_DIMENSION", "768"))
    return EmbeddingService(
        provider=provider,
        api_key=api_key,
        model=model,
        base_url=base_url,
        dimension=dimension,
    )
50 changes: 50 additions & 0 deletions smp/engine/enricher.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,33 @@
"""Static semantic enricher with optional LLM-based embedding."""
Comment on lines +1 to +10
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

Merge conflict markers detected in the module docstring. This is invalid Python code and will cause a SyntaxError at runtime.


from __future__ import annotations

import hashlib
from datetime import UTC, datetime
from typing import TYPE_CHECKING

from smp.core.models import GraphNode
from smp.engine.interfaces import SemanticEnricher as SemanticEnricherInterface
from smp.logging import get_logger

if TYPE_CHECKING:
    from smp.engine.embedding import EmbeddingService

log = get_logger(__name__)


Expand All @@ -24,15 +38,28 @@ def _compute_source_hash(name: str, file_path: str, start: int, end: int, signat


class StaticSemanticEnricher(SemanticEnricherInterface):
    """Static AST-based semantic enricher with optional embedding support."""

    def __init__(self, embedding_service: EmbeddingService | None = None) -> None:
self._enrichment_counts: dict[str, int] = {
"enriched": 0,
"skipped": 0,
"no_metadata": 0,
"failed": 0,
}
        self._embedding_service = embedding_service

    def set_embedding_service(self, service: EmbeddingService) -> None:
        self._embedding_service = service

async def enrich_node(
self,
Expand Down Expand Up @@ -83,9 +110,28 @@ async def enrich_batch(
enriched.append(result)
return enriched

    @property
    def has_llm(self) -> bool:
        """Check if LLM-based embedding is available."""
        return self._embedding_service is not None

    async def embed(self, text: str) -> list[float]:
        """Generate embedding using the embedding service if available."""
        if self._embedding_service is None:
            return []
        return await self._embedding_service.embed(text)

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for multiple texts."""
        if self._embedding_service is None:
            return [[] for _ in texts]
        return await self._embedding_service.embed_batch(texts)

def get_counts(self) -> dict[str, int]:
"""Return enrichment statistics."""
Expand All @@ -94,4 +140,8 @@ def get_counts(self) -> dict[str, int]:
def reset_counts(self) -> None:
"""Reset enrichment counters."""
for key in self._enrichment_counts:
            self._enrichment_counts[key] = 0
7 changes: 7 additions & 0 deletions smp/engine/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,13 @@ async def enrich_batch(self, nodes: list[GraphNode], force: bool = False) -> lis
async def embed(self, text: str) -> list[float]:
"""No-op for static enricher."""

    @abc.abstractmethod
    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for multiple texts."""

Comment on lines +58 to +64
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

Merge conflict markers found in the SemanticEnricher abstract class. This will break the engine's interface definitions.


class QueryEngine(abc.ABC):
"""High-level query interface over the memory store."""
Expand Down
Loading
Loading