From 71fe00b04dd0798a6c56b6e8ec8e45dd0cec761e Mon Sep 17 00:00:00 2001
From: Burhan Abdullah <131237388+burh121@users.noreply.github.com>
Date: Sun, 4 Jan 2026 15:55:40 +0530
Subject: [PATCH 1/2] mcp: auto-expand datasets when dataset_ids not
 provided

---
 mcp/server/server.py | 54 ++++++++++++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/mcp/server/server.py b/mcp/server/server.py
index 8d0d12c25..176a2be8f 100644
--- a/mcp/server/server.py
+++ b/mcp/server/server.py
@@ -446,7 +446,13 @@ async def list_tools(*, connector) -> list[types.Tool]:
 
 @app.call_tool()
 @with_api_key(required=True)
-async def call_tool(name: str, arguments: dict, *, connector) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
+async def call_tool(
+    name: str,
+    arguments: dict,
+    *,
+    connector,
+) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
+
     if name == "ragflow_retrieval":
         document_ids = arguments.get("document_ids", [])
         dataset_ids = arguments.get("dataset_ids", [])
@@ -460,23 +466,18 @@ async def call_tool(name: str, arguments: dict, *, connector) -> list[types.Text
         rerank_id = arguments.get("rerank_id")
         force_refresh = arguments.get("force_refresh", False)
 
-        
-        # If no dataset_ids provided or empty list, get all available dataset IDs
+        # Auto-expand datasets if none provided
         if not dataset_ids:
             dataset_list_str = connector.list_datasets()
             dataset_ids = []
-            
-            # Parse the dataset list to extract IDs
+
             if dataset_list_str:
-                for line in dataset_list_str.strip().split('\n'):
-                    if line.strip():
-                        try:
-                            dataset_info = json.loads(line.strip())
-                            dataset_ids.append(dataset_info["id"])
-                        except (json.JSONDecodeError, KeyError):
-                            # Skip malformed lines
-                            continue
-        
+                for line in dataset_list_str.strip().split("\n"):
+                    try:
+                        dataset_ids.append(json.loads(line)["id"])
+                    except Exception:
+                        continue
+
         return connector.retrieval(
             dataset_ids=dataset_ids,
             document_ids=document_ids,
@@ -490,9 +491,32 @@ async def call_tool(name: str, arguments: dict, *, connector) -> list[types.Text
             rerank_id=rerank_id,
             force_refresh=force_refresh,
         )
-    raise ValueError(f"Tool not found: {name}")
 
+    elif name == "ragflow_dataset_summary":
+        dataset_list_str = connector.list_datasets()
+        datasets = []
 
+        if dataset_list_str:
+            for line in dataset_list_str.split("\n"):
+                try:
+                    datasets.append(json.loads(line))
+                except Exception:
+                    continue
+
+        return [
+            types.TextContent(
+                type="text",
+                text=json.dumps(
+                    {
+                        "dataset_count": len(datasets),
+                        "datasets": datasets,
+                    },
+                    ensure_ascii=False,
+                ),
+            )
+        ]
+
+    raise ValueError(f"Tool not found: {name}")
 def create_starlette_app():
     routes = []
     middleware = None

From da4486b59a019a16e36c2b55ef0c95a6943d2000 Mon Sep 17 00:00:00 2001
From: Burhan Abdullah <131237388+burh121@users.noreply.github.com>
Date: Sun, 4 Jan 2026 16:49:41 +0530
Subject: [PATCH 2/2] mcp: add dataset summary tool and safely auto-expand
 datasets

---
 mcp/server/server.py | 59 +++++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 33 deletions(-)

diff --git a/mcp/server/server.py b/mcp/server/server.py
index 176a2be8f..d754f29f5 100644
--- a/mcp/server/server.py
+++ b/mcp/server/server.py
@@ -149,23 +149,7 @@ def retrieval(
     ):
         if document_ids is None:
             document_ids = []
-        
-        # If no dataset_ids provided or empty list, get all available dataset IDs
-        if not dataset_ids:
-            dataset_list_str = self.list_datasets()
-            dataset_ids = []
-            
-            # Parse the dataset list to extract IDs
-            if dataset_list_str:
-                for line in dataset_list_str.strip().split('\n'):
-                    if line.strip():
-                        try:
-                            dataset_info = json.loads(line.strip())
-                            dataset_ids.append(dataset_info["id"])
-                        except (json.JSONDecodeError, KeyError):
-                            # Skip malformed lines
-                            continue
-        
+         
         data_json = {
             "page": page,
             "page_size": page_size,
@@ -368,73 +352,78 @@ async def list_tools(*, connector) -> list[types.Tool]:
     dataset_description = connector.list_datasets()
 
     return [
+        types.Tool(
+            name="ragflow_dataset_summary",
+            description="Return a summary of all available datasets, including count and metadata.",
+            inputSchema={
+                "type": "object",
+                "properties": {},
+            },
+        ),
         types.Tool(
             name="ragflow_retrieval",
-            description="Retrieve relevant chunks from the RAGFlow retrieve interface based on the question. You can optionally specify dataset_ids to search only specific datasets, or omit dataset_ids entirely to search across ALL available datasets. You can also optionally specify document_ids to search within specific documents. When dataset_ids is not provided or is empty, the system will automatically search across all available datasets. Below is the list of all available datasets, including their descriptions and IDs:"
-            + dataset_description,
+            description=(
+                "Retrieve relevant chunks from the RAGFlow retrieve interface based on the question. "
+                "You can optionally specify dataset_ids to search only specific datasets, or omit "
+                "dataset_ids entirely to search across ALL available datasets. You can also optionally "
+                "specify document_ids to search within specific documents. When dataset_ids is not "
+                "provided or is empty, the system will automatically search across all available "
+                "datasets. Below is the list of all available datasets, including their descriptions "
+                "and IDs:\n"
+                + dataset_description
+            ),
             inputSchema={
                 "type": "object",
                 "properties": {
                     "dataset_ids": {
                         "type": "array",
                         "items": {"type": "string"},
-                        "description": "Optional array of dataset IDs to search. If not provided or empty, all datasets will be searched."
                     },
                     "document_ids": {
                         "type": "array",
                         "items": {"type": "string"},
-                        "description": "Optional array of document IDs to search within."
                     },
                     "question": {
                         "type": "string",
-                        "description": "The question or query to search for."
                     },
                     "page": {
                         "type": "integer",
-                        "description": "Page number for pagination",
                         "default": 1,
                         "minimum": 1,
                     },
                     "page_size": {
                         "type": "integer",
-                        "description": "Number of results to return per page (default: 10, max recommended: 50 to avoid token limits)",
                         "default": 10,
                         "minimum": 1,
                         "maximum": 100,
                     },
                     "similarity_threshold": {
                         "type": "number",
-                        "description": "Minimum similarity threshold for results",
                         "default": 0.2,
                         "minimum": 0.0,
                         "maximum": 1.0,
                     },
                     "vector_similarity_weight": {
                         "type": "number",
-                        "description": "Weight for vector similarity vs term similarity",
                         "default": 0.3,
                         "minimum": 0.0,
                         "maximum": 1.0,
                     },
                     "keyword": {
                         "type": "boolean",
-                        "description": "Enable keyword-based search",
                         "default": False,
                     },
                     "top_k": {
                         "type": "integer",
-                        "description": "Maximum results to consider before ranking",
                         "default": 1024,
                         "minimum": 1,
                         "maximum": 1024,
                     },
                     "rerank_id": {
                         "type": "string",
-                        "description": "Optional reranking model identifier",
                     },
                     "force_refresh": {
                         "type": "boolean",
-                        "description": "Set to true only if fresh dataset and document metadata is explicitly required. Otherwise, cached metadata is used (default: false).",
                         "default": False,
                     },
                 },
@@ -442,8 +431,7 @@ async def list_tools(*, connector) -> list[types.Tool]:
             },
         ),
     ]
-
-
+    
 @app.call_tool()
 @with_api_key(required=True)
 async def call_tool(
@@ -455,7 +443,9 @@ async def call_tool(
 
     if name == "ragflow_retrieval":
         document_ids = arguments.get("document_ids", [])
-        dataset_ids = arguments.get("dataset_ids", [])
+        dataset_ids = arguments.get("dataset_ids") or []
+
+
         question = arguments.get("question", "")
         page = arguments.get("page", 1)
         page_size = arguments.get("page_size", 10)
@@ -517,7 +507,10 @@ async def call_tool(
         ]
 
     raise ValueError(f"Tool not found: {name}")
+
+
 def create_starlette_app():
+
     routes = []
     middleware = None
     if MODE == LaunchMode.HOST: