[api][integration] Enrich interface for vector store.

wenjin272 · wenjin272 · commit dcac4155563a · 2025-12-09T19:00:27.000+08:00
diff --git a/python/flink_agents/api/vector_stores/vector_store.py b/python/flink_agents/api/vector_stores/vector_store.py
@@ -59,11 +59,17 @@ class VectorStoreQuery(BaseModel):
         namespaces, or other search configurations.
     """
 
-    mode: VectorStoreQueryMode = Field(default=VectorStoreQueryMode.SEMANTIC, description="The type of query "
-                                                                                          "operation to perform.")
-    query_text: str = Field(description="Text query to be converted to embedding for semantic search.")
+    mode: VectorStoreQueryMode = Field(
+        default=VectorStoreQueryMode.SEMANTIC,
+        description="The type of query operation to perform.",
+    )
+    query_text: str = Field(
+        description="Text query to be converted to embedding for semantic search."
+    )
     limit: int = Field(default=10, description="Maximum number of results to return.")
-    extra_args: Dict[str, Any] = Field(default_factory=dict, description="Vector store-specific parameters.")
+    extra_args: Dict[str, Any] = Field(
+        default_factory=dict, description="Vector store-specific parameters."
+    )
 
     def __str__(self) -> str:
         return f"{self.mode.value} query: '{self.query_text}' (limit={self.limit})"
@@ -85,11 +91,18 @@ class Document(BaseModel):
     """
 
     content: str = Field(description="The actual text content of the document.")
-    metadata: Dict[str, Any] = Field(default_factory=dict, description="Document metadata such as source, author, timestamp, etc.")
-    id: str | None = Field(default=None, description="Unique identifier of the document.")
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Document metadata such as source, author, timestamp, etc.",
+    )
+    id: str | None = Field(
+        default=None, description="Unique identifier of the document."
+    )
 
     def __str__(self) -> str:
-        content_preview = self.content[:50] + "..." if len(self.content) > 50 else self.content
+        content_preview = (
+            self.content[:50] + "..." if len(self.content) > 50 else self.content
+        )
         return f"Document: {content_preview}"
 
 
@@ -104,7 +117,9 @@ class VectorStoreQueryResult(BaseModel):
         List of documents retrieved from the vector store.
     """
 
-    documents: List[Document] = Field(description="List of documents retrieved from the vector store.")
+    documents: List[Document] = Field(
+        description="List of documents retrieved from the vector store."
+    )
 
     def __str__(self) -> str:
         return f"QueryResult: {len(self.documents)} documents"
@@ -118,7 +133,9 @@ class BaseVectorStore(Resource, ABC):
     embedding generation internally.
     """
 
-    embedding_model: str = Field(description="Name of the embedding model resource to use.")
+    embedding_model: str = Field(
+        description="Name of the embedding model resource to use."
+    )
 
     @classmethod
     @override
@@ -135,6 +152,40 @@ def store_kwargs(self) -> Dict[str, Any]:
         when performing vector search operations.
         """
 
+    def add(
+        self, documents: Document | List[Document], collection_name: str | None = None, **kwargs: Any
+    ) -> List[str]:
+        """Add documents to the vector store.
+
+        Converts document content to embeddings and stores them in the vector store.
+        The implementation may generate IDs for documents that don't have one.
+
+        Args:
+            documents: Single Document or list of Documents to add.
+            collection_name: The collection name of the documents to add to. Optional.
+            **kwargs: Vector store specific parameters.
+
+        Returns:
+            List of document IDs that were added to the vector store
+        """
+        # Normalize to list
+        documents = maybe_cast_to_list(documents)
+
+        # Generate embeddings for all documents
+        embedding_model = self.get_resource(
+            self.embedding_model, ResourceType.EMBEDDING_MODEL
+        )
+
+        # Generate embeddings for each document
+        embeddings = [embedding_model.embed(doc.content) for doc in documents]
+
+        # Merge setup kwargs with add-specific args
+        merged_kwargs = self.store_kwargs.copy()
+        merged_kwargs.update(kwargs)
+
+        # Perform add operation using the abstract method
+        return self.add_embedding(documents=documents, embeddings=embeddings, collection_name=collection_name, **merged_kwargs)
+
     def query(self, query: VectorStoreQuery) -> VectorStoreQueryResult:
         """Perform vector search using structured query object.
 
@@ -160,12 +211,35 @@ def query(self, query: VectorStoreQuery) -> VectorStoreQueryResult:
         documents = self.query_embedding(query_embedding, query.limit, **merged_kwargs)
 
         # Return structured result
-        return VectorStoreQueryResult(
-            documents=documents
-        )
+        return VectorStoreQueryResult(documents=documents)
 
     @abstractmethod
-    def query_embedding(self, embedding: List[float], limit: int = 10, **kwargs: Any) -> List[Document]:
+    def get(self, ids: str | List[str] | None = None, collection_name: str | None = None, **kwargs: Any) -> List[Document]:
+        """Retrieve a document from the vector store by its ID.
+
+        Args:
+            ids: Unique identifier of the documents to retrieve
+            collection_name: The collection name of the documents to retrieve. Optional.
+            **kwargs: Vector store specific parameters (offset, limit, filter etc.)
+
+        Returns:
+            Document object if found, None otherwise
+        """
+
+    @abstractmethod
+    def delete(self, ids: str | List[str] | None = None, collection_name: str | None = None, **kwargs: Any) -> None:
+        """Delete documents in the vector store by its IDs.
+
+        Args:
+            ids: Unique identifier of the documents to delete
+            collection_name: The collection name of the documents belong to. Optional.
+            **kwargs: Vector store specific parameters (filter etc.)
+        """
+
+    @abstractmethod
+    def query_embedding(
+        self, embedding: List[float], limit: int = 10, **kwargs: Any
+    ) -> List[Document]:
         """Perform vector search using pre-computed embedding.
 
         Args:
@@ -176,3 +250,74 @@ def query_embedding(self, embedding: List[float], limit: int = 10, **kwargs: Any
         Returns:
             List of documents matching the search criteria
         """
+
+    @abstractmethod
+    def add_embedding(
+        self,
+        *,
+        documents: List[Document],
+        embeddings: List[List[float]],
+        collection_name: str | None = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Add documents with pre-computed embeddings to the vector store.
+
+        Args:
+            documents: Documents to add to the vector store
+            embeddings: Pre-computed embedding vector for each document
+            collection_name: The collection name of the documents to add. Optional.
+            **kwargs: Vector store-specific parameters (collection, namespace, etc.)
+
+        Returns:
+            List of document IDs that were added to the vector store
+        """
+
+
+class Collection(BaseModel):
+    """Represents a collection of documents."""
+    name: str
+    size: int
+    metadata: Dict[str, Any] | None = None
+
+
+class CollectionManageableVectorStore(BaseVectorStore, ABC):
+    """Base abstract class for vector store which support collection management."""
+
+    @abstractmethod
+    def get_or_create_collection(
+        self, name: str, metadata: Dict[str, Any]
+    ) -> Collection:
+        """Get a collection, or create it if it doesn't exist.
+
+        Args:
+            name: Name of the collection
+            metadata: Metadata of the collection
+        Returns:
+            The retrieved or created collection
+        """
+
+    @abstractmethod
+    def get_collection(self, name: str) -> Collection:
+        """Get a collection, raise an exception if it doesn't exist.
+
+        Args:
+            name: Name of the collection
+        Returns:
+            The retrieved collection
+        """
+
+    @abstractmethod
+    def delete_collection(self, name: str) -> Collection:
+        """Delete a collection.
+
+        Args:
+            name: Name of the collection
+        Returns:
+            The deleted collection
+        """
+
+def maybe_cast_to_list(value: Any | List[Any]) -> List[Any] | None:
+    """Cast T to List[T] if T is not None."""
+    if value is None:
+        return None
+    return [value] if not isinstance(value, list) else value
diff --git a/python/flink_agents/integrations/vector_stores/chroma/chroma_vector_store.py b/python/flink_agents/integrations/vector_stores/chroma/chroma_vector_store.py
@@ -15,23 +15,26 @@
 #  See the License for the specific language governing permissions and
 # limitations under the License.
 ################################################################################
+import uuid
 from typing import Any, Dict, List
 
 import chromadb
 from chromadb import ClientAPI as ChromaClient
 from chromadb import CloudClient
 from chromadb.config import Settings
 from pydantic import Field
+from typing_extensions import override
 
 from flink_agents.api.vector_stores.vector_store import (
-    BaseVectorStore,
+    Collection,
+    CollectionManageableVectorStore,
     Document,
 )
 
 DEFAULT_COLLECTION = "flink_agents_chroma_collection"
 
 
-class ChromaVectorStore(BaseVectorStore):
+class ChromaVectorStore(CollectionManageableVectorStore):
     """ChromaDB vector store that handles connection and semantic search.
 
     Visit https://docs.trychroma.com/ for ChromaDB documentation.
@@ -194,7 +197,100 @@ def store_kwargs(self) -> Dict[str, Any]:
             "create_collection_if_not_exists": self.create_collection_if_not_exists,
         }
 
-    def query_embedding(self, embedding: List[float], limit: int = 10, **kwargs: Any) -> List[Document]:
+    @override
+    def get_or_create_collection(
+        self, name: str, metadata: Dict[str, Any] | None = None
+    ) -> Collection:
+        collection = self.client.get_or_create_collection(name=name, metadata=metadata)
+        return Collection(
+            name=collection.name, size=collection.count(), metadata=collection.metadata
+        )
+
+    @override
+    def get_collection(self, name: str) -> Collection:
+        collection = self.client.get_collection(name=name)
+        return Collection(
+            name=collection.name, size=collection.count(), metadata=collection.metadata
+        )
+
+    @override
+    def delete_collection(self, name: str) -> Collection:
+        collection = self.get_collection(name=name)
+        self.client.delete_collection(name=collection.name)
+        return collection
+
+    @override
+    def get(
+        self,
+        ids: str | List[str] | None = None,
+        collection_name: str | None = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        where = kwargs.get("where")
+        limit = kwargs.get("limit")
+        offset = kwargs.get("offset")
+        where_document = kwargs.get("where_document")
+        results = self.client.get_collection(
+            name=collection_name or self.collection
+        ).get(
+            ids=ids,
+            where=where,
+            limit=limit,
+            offset=offset,
+            where_document=where_document,
+        )
+
+        ids = results["ids"]
+        documents = results["documents"]
+        metadatas = results["metadatas"]
+
+        return [
+            Document(id=id, content=document, metadata=dict(metadata))
+            for id, document, metadata in zip(ids, documents, metadatas, strict=False)
+        ]
+
+    @override
+    def delete(
+        self,
+        ids: str | List[str] | None = None,
+        collection_name: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        where = kwargs.get("where")
+        where_document = kwargs.get("where_document")
+        if ids is None and where is None and where_document is None:
+            ids = (
+                self.client.get_collection(collection_name or self.collection)
+                .get(include=[])
+                .get("ids")
+            )
+            # collection is empty
+            if len(ids) == 0:
+                return
+        self.client.get_collection(name=collection_name or self.collection).delete(
+            ids=ids, where=where, where_document=where_document
+        )
+
+    @override
+    def add_embedding(
+        self,
+        *,
+        documents: List[Document],
+        embeddings: List[List[float]],
+        collection_name: str | None = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        ids = [doc.id or str(uuid.uuid4()) for doc in documents]
+        docs = [doc.content for doc in documents]
+        metadatas = [doc.metadata for doc in documents]
+        self.client.get_collection(name=collection_name or self.collection).add(
+            ids=ids, documents=docs, embeddings=embeddings, metadatas=metadatas
+        )
+        return ids
+
+    def query_embedding(
+        self, embedding: List[float], limit: int = 10, **kwargs: Any
+    ) -> List[Document]:
         """Perform vector search using pre-computed embedding.
 
         Args:
@@ -207,8 +303,12 @@ def query_embedding(self, embedding: List[float], limit: int = 10, **kwargs: Any
         """
         # Extract ChromaDB-specific parameters
         collection_name = kwargs.get("collection", self.collection)
-        collection_metadata = kwargs.get("collection_metadata", self.collection_metadata)
-        create_collection_if_not_exists = kwargs.get("create_collection_if_not_exists", self.create_collection_if_not_exists)
+        collection_metadata = kwargs.get(
+            "collection_metadata", self.collection_metadata
+        )
+        create_collection_if_not_exists = kwargs.get(
+            "create_collection_if_not_exists", self.create_collection_if_not_exists
+        )
         where = kwargs.get("where")  # Metadata filters
 
         # Get or create collection based on configuration
@@ -235,13 +335,18 @@ def query_embedding(self, embedding: List[float], limit: int = 10, **kwargs: Any
         if results["documents"] and results["documents"][0]:
             for i, doc_content in enumerate(results["documents"][0]):
                 doc_id = results["ids"][0][i] if results["ids"] else None
-                metadata = results["metadatas"][0][i] if results["metadatas"] and results["metadatas"][0] else {}
+                metadata = (
+                    results["metadatas"][0][i]
+                    if results["metadatas"] and results["metadatas"][0]
+                    else {}
+                )
 
-                documents.append(Document(
-                    content=doc_content,
-                    id=doc_id,
-                    metadata=metadata,
-                ))
+                documents.append(
+                    Document(
+                        content=doc_content,
+                        id=doc_id,
+                        metadata=metadata,
+                    )
+                )
 
         return documents
-
diff --git a/python/flink_agents/integrations/vector_stores/chroma/tests/test_chroma_vector_store.py b/python/flink_agents/integrations/vector_stores/chroma/tests/test_chroma_vector_store.py