Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 64 additions & 47 deletions mcp/server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,23 +149,7 @@ def retrieval(
):
if document_ids is None:
document_ids = []

# If no dataset_ids provided or empty list, get all available dataset IDs
if not dataset_ids:
dataset_list_str = self.list_datasets()
dataset_ids = []

# Parse the dataset list to extract IDs
if dataset_list_str:
for line in dataset_list_str.strip().split('\n'):
if line.strip():
try:
dataset_info = json.loads(line.strip())
dataset_ids.append(dataset_info["id"])
except (json.JSONDecodeError, KeyError):
# Skip malformed lines
continue


data_json = {
"page": page,
"page_size": page_size,
Expand Down Expand Up @@ -368,88 +352,100 @@ async def list_tools(*, connector) -> list[types.Tool]:
dataset_description = connector.list_datasets()

return [
types.Tool(
name="ragflow_dataset_summary",
description="Return a summary of all available datasets, including count and metadata.",
inputSchema={
"type": "object",
"properties": {},
},
),
types.Tool(
name="ragflow_retrieval",
description="Retrieve relevant chunks from the RAGFlow retrieve interface based on the question. You can optionally specify dataset_ids to search only specific datasets, or omit dataset_ids entirely to search across ALL available datasets. You can also optionally specify document_ids to search within specific documents. When dataset_ids is not provided or is empty, the system will automatically search across all available datasets. Below is the list of all available datasets, including their descriptions and IDs:"
+ dataset_description,
description=(
"Retrieve relevant chunks from the RAGFlow retrieve interface based on the question. "
"You can optionally specify dataset_ids to search only specific datasets, or omit "
"dataset_ids entirely to search across ALL available datasets. You can also optionally "
"specify document_ids to search within specific documents. When dataset_ids is not "
"provided or is empty, the system will automatically search across all available "
"datasets. Below is the list of all available datasets, including their descriptions "
"and IDs:\n"
+ dataset_description
),
inputSchema={
"type": "object",
"properties": {
"dataset_ids": {
"type": "array",
"items": {"type": "string"},
"description": "Optional array of dataset IDs to search. If not provided or empty, all datasets will be searched."
},
"document_ids": {
"type": "array",
"items": {"type": "string"},
"description": "Optional array of document IDs to search within."
},
"question": {
"type": "string",
"description": "The question or query to search for."
},
"page": {
"type": "integer",
"description": "Page number for pagination",
"default": 1,
"minimum": 1,
},
"page_size": {
"type": "integer",
"description": "Number of results to return per page (default: 10, max recommended: 50 to avoid token limits)",
"default": 10,
"minimum": 1,
"maximum": 100,
},
"similarity_threshold": {
"type": "number",
"description": "Minimum similarity threshold for results",
"default": 0.2,
"minimum": 0.0,
"maximum": 1.0,
},
"vector_similarity_weight": {
"type": "number",
"description": "Weight for vector similarity vs term similarity",
"default": 0.3,
"minimum": 0.0,
"maximum": 1.0,
},
"keyword": {
"type": "boolean",
"description": "Enable keyword-based search",
"default": False,
},
"top_k": {
"type": "integer",
"description": "Maximum results to consider before ranking",
"default": 1024,
"minimum": 1,
"maximum": 1024,
},
"rerank_id": {
"type": "string",
"description": "Optional reranking model identifier",
},
"force_refresh": {
"type": "boolean",
"description": "Set to true only if fresh dataset and document metadata is explicitly required. Otherwise, cached metadata is used (default: false).",
"default": False,
},
Comment on lines 378 to 428
Copy link

Copilot AI Jan 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The inputSchema properties are missing their "description" fields. While the properties have appropriate defaults and constraints, adding descriptions would improve the developer experience by providing inline documentation about what each parameter does. This is especially important for parameters like "similarity_threshold", "vector_similarity_weight", and "top_k" that require domain knowledge to use effectively.

Copilot uses AI. Check for mistakes.
},
"required": ["question"],
},
),
]



@app.call_tool()
@with_api_key(required=True)
async def call_tool(name: str, arguments: dict, *, connector) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
async def call_tool(
name: str,
arguments: dict,
*,
connector,
) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:

if name == "ragflow_retrieval":
document_ids = arguments.get("document_ids", [])
dataset_ids = arguments.get("dataset_ids", [])
dataset_ids = arguments.get("dataset_ids") or []
Copy link

Copilot AI Jan 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

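The expression arguments.get("dataset_ids") or [] replaces any falsy value (a missing key, an explicit null, or an empty list) with a new empty list. For the empty-list case this is redundant but harmless. However, for consistency with document_ids on line 445 which uses arguments.get("document_ids", []), consider using the same pattern: arguments.get("dataset_ids", []). Note that the two forms differ when the client sends an explicit null: arguments.get("dataset_ids", []) returns None in that case, which the later "if not dataset_ids" check still treats as "no datasets provided".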

Suggested change
dataset_ids = arguments.get("dataset_ids") or []
dataset_ids = arguments.get("dataset_ids", [])

Copilot uses AI. Check for mistakes.


Copy link

Copilot AI Jan 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are two consecutive blank lines here. Consider removing one to maintain consistent spacing throughout the file, as single blank lines are used elsewhere (e.g., between the comment and the if statement on lines 459-460).

Suggested change

Copilot uses AI. Check for mistakes.
question = arguments.get("question", "")
page = arguments.get("page", 1)
page_size = arguments.get("page_size", 10)
Expand All @@ -460,23 +456,18 @@ async def call_tool(name: str, arguments: dict, *, connector) -> list[types.Text
rerank_id = arguments.get("rerank_id")
force_refresh = arguments.get("force_refresh", False)


# If no dataset_ids provided or empty list, get all available dataset IDs
# Auto-expand datasets if none provided
if not dataset_ids:
dataset_list_str = connector.list_datasets()
dataset_ids = []

# Parse the dataset list to extract IDs

Copy link

Copilot AI Jan 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Empty line 462 immediately followed by another empty line on 463 creates inconsistent spacing. Consider removing one of these blank lines to maintain uniform code style.

Suggested change

Copilot uses AI. Check for mistakes.
if dataset_list_str:
for line in dataset_list_str.strip().split('\n'):
if line.strip():
try:
dataset_info = json.loads(line.strip())
dataset_ids.append(dataset_info["id"])
except (json.JSONDecodeError, KeyError):
# Skip malformed lines
continue

for line in dataset_list_str.strip().split("\n"):
try:
dataset_ids.append(json.loads(line)["id"])
except Exception:
continue
Comment on lines +466 to +469
Copy link

Copilot AI Jan 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

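The broad exception handler except Exception: silently ignores every error raised while parsing a line, including unexpected JSON structure, which makes malformed dataset listings hard to diagnose. (The try body only wraps json.loads(line)["id"], so network errors cannot originate here; those would surface from connector.list_datasets() before the loop.) Consider at least logging these exceptions for debugging purposes, or being more specific about which exceptions to catch (e.g., json.JSONDecodeError, KeyError, and TypeError for lines that decode to a non-object value).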

Copilot uses AI. Check for mistakes.

return connector.retrieval(
dataset_ids=dataset_ids,
document_ids=document_ids,
Expand All @@ -490,10 +481,36 @@ async def call_tool(name: str, arguments: dict, *, connector) -> list[types.Text
rerank_id=rerank_id,
force_refresh=force_refresh,
)

elif name == "ragflow_dataset_summary":
dataset_list_str = connector.list_datasets()
datasets = []

if dataset_list_str:
for line in dataset_list_str.split("\n"):
try:
datasets.append(json.loads(line))
except Exception:
continue
Comment on lines +491 to +494
Copy link

Copilot AI Jan 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The broad exception handler except Exception: silently ignores all parsing errors. Similar to the issue in the ragflow_retrieval handler, consider logging these exceptions or being more specific about which exceptions to catch (e.g., json.JSONDecodeError).

Copilot uses AI. Check for mistakes.

return [
types.TextContent(
type="text",
text=json.dumps(
{
"dataset_count": len(datasets),
"datasets": datasets,
},
ensure_ascii=False,
),
)
]

raise ValueError(f"Tool not found: {name}")


def create_starlette_app():

routes = []
middleware = None
if MODE == LaunchMode.HOST:
Expand Down