-
Notifications
You must be signed in to change notification settings - Fork 10
MCP auto expand datasets #25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -149,23 +149,7 @@ def retrieval( | |||||
| ): | ||||||
| if document_ids is None: | ||||||
| document_ids = [] | ||||||
|
|
||||||
| # If no dataset_ids provided or empty list, get all available dataset IDs | ||||||
| if not dataset_ids: | ||||||
| dataset_list_str = self.list_datasets() | ||||||
| dataset_ids = [] | ||||||
|
|
||||||
| # Parse the dataset list to extract IDs | ||||||
| if dataset_list_str: | ||||||
| for line in dataset_list_str.strip().split('\n'): | ||||||
| if line.strip(): | ||||||
| try: | ||||||
| dataset_info = json.loads(line.strip()) | ||||||
| dataset_ids.append(dataset_info["id"]) | ||||||
| except (json.JSONDecodeError, KeyError): | ||||||
| # Skip malformed lines | ||||||
| continue | ||||||
|
|
||||||
|
|
||||||
| data_json = { | ||||||
| "page": page, | ||||||
| "page_size": page_size, | ||||||
|
|
@@ -368,88 +352,100 @@ async def list_tools(*, connector) -> list[types.Tool]: | |||||
| dataset_description = connector.list_datasets() | ||||||
|
|
||||||
| return [ | ||||||
| types.Tool( | ||||||
| name="ragflow_dataset_summary", | ||||||
| description="Return a summary of all available datasets, including count and metadata.", | ||||||
| inputSchema={ | ||||||
| "type": "object", | ||||||
| "properties": {}, | ||||||
| }, | ||||||
| ), | ||||||
| types.Tool( | ||||||
| name="ragflow_retrieval", | ||||||
| description="Retrieve relevant chunks from the RAGFlow retrieve interface based on the question. You can optionally specify dataset_ids to search only specific datasets, or omit dataset_ids entirely to search across ALL available datasets. You can also optionally specify document_ids to search within specific documents. When dataset_ids is not provided or is empty, the system will automatically search across all available datasets. Below is the list of all available datasets, including their descriptions and IDs:" | ||||||
| + dataset_description, | ||||||
| description=( | ||||||
| "Retrieve relevant chunks from the RAGFlow retrieve interface based on the question. " | ||||||
| "You can optionally specify dataset_ids to search only specific datasets, or omit " | ||||||
| "dataset_ids entirely to search across ALL available datasets. You can also optionally " | ||||||
| "specify document_ids to search within specific documents. When dataset_ids is not " | ||||||
| "provided or is empty, the system will automatically search across all available " | ||||||
| "datasets. Below is the list of all available datasets, including their descriptions " | ||||||
| "and IDs:\n" | ||||||
| + dataset_description | ||||||
| ), | ||||||
| inputSchema={ | ||||||
| "type": "object", | ||||||
| "properties": { | ||||||
| "dataset_ids": { | ||||||
| "type": "array", | ||||||
| "items": {"type": "string"}, | ||||||
| "description": "Optional array of dataset IDs to search. If not provided or empty, all datasets will be searched." | ||||||
| }, | ||||||
| "document_ids": { | ||||||
| "type": "array", | ||||||
| "items": {"type": "string"}, | ||||||
| "description": "Optional array of document IDs to search within." | ||||||
| }, | ||||||
| "question": { | ||||||
| "type": "string", | ||||||
| "description": "The question or query to search for." | ||||||
| }, | ||||||
| "page": { | ||||||
| "type": "integer", | ||||||
| "description": "Page number for pagination", | ||||||
| "default": 1, | ||||||
| "minimum": 1, | ||||||
| }, | ||||||
| "page_size": { | ||||||
| "type": "integer", | ||||||
| "description": "Number of results to return per page (default: 10, max recommended: 50 to avoid token limits)", | ||||||
| "default": 10, | ||||||
| "minimum": 1, | ||||||
| "maximum": 100, | ||||||
| }, | ||||||
| "similarity_threshold": { | ||||||
| "type": "number", | ||||||
| "description": "Minimum similarity threshold for results", | ||||||
| "default": 0.2, | ||||||
| "minimum": 0.0, | ||||||
| "maximum": 1.0, | ||||||
| }, | ||||||
| "vector_similarity_weight": { | ||||||
| "type": "number", | ||||||
| "description": "Weight for vector similarity vs term similarity", | ||||||
| "default": 0.3, | ||||||
| "minimum": 0.0, | ||||||
| "maximum": 1.0, | ||||||
| }, | ||||||
| "keyword": { | ||||||
| "type": "boolean", | ||||||
| "description": "Enable keyword-based search", | ||||||
| "default": False, | ||||||
| }, | ||||||
| "top_k": { | ||||||
| "type": "integer", | ||||||
| "description": "Maximum results to consider before ranking", | ||||||
| "default": 1024, | ||||||
| "minimum": 1, | ||||||
| "maximum": 1024, | ||||||
| }, | ||||||
| "rerank_id": { | ||||||
| "type": "string", | ||||||
| "description": "Optional reranking model identifier", | ||||||
| }, | ||||||
| "force_refresh": { | ||||||
| "type": "boolean", | ||||||
| "description": "Set to true only if fresh dataset and document metadata is explicitly required. Otherwise, cached metadata is used (default: false).", | ||||||
| "default": False, | ||||||
| }, | ||||||
| }, | ||||||
| "required": ["question"], | ||||||
| }, | ||||||
| ), | ||||||
| ] | ||||||
|
|
||||||
|
|
||||||
|
|
||||||
| @app.call_tool() | ||||||
| @with_api_key(required=True) | ||||||
| async def call_tool(name: str, arguments: dict, *, connector) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]: | ||||||
| async def call_tool( | ||||||
| name: str, | ||||||
| arguments: dict, | ||||||
| *, | ||||||
| connector, | ||||||
| ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]: | ||||||
|
|
||||||
| if name == "ragflow_retrieval": | ||||||
| document_ids = arguments.get("document_ids", []) | ||||||
| dataset_ids = arguments.get("dataset_ids", []) | ||||||
| dataset_ids = arguments.get("dataset_ids") or [] | ||||||
|
||||||
| dataset_ids = arguments.get("dataset_ids") or [] | |
| dataset_ids = arguments.get("dataset_ids", []) |
Copilot
AI
Jan 5, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are two consecutive blank lines here. Consider removing one to maintain consistent spacing throughout the file, as single blank lines are used elsewhere (e.g., between the comment and the if statement on lines 459-460).
Copilot
AI
Jan 5, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Empty line 462 is immediately followed by another empty line at 463, creating inconsistent spacing. Consider removing one of these blank lines to maintain a uniform code style.
Copilot
AI
Jan 5, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The broad exception handler except Exception: silently ignores all parsing errors, including potentially serious issues like network errors or unexpected JSON structure. Consider at least logging these exceptions for debugging purposes, or being more specific about which exceptions to catch (e.g., json.JSONDecodeError, KeyError).
Copilot
AI
Jan 5, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The broad exception handler except Exception: silently ignores all parsing errors. Similar to the issue in the ragflow_retrieval handler, consider logging these exceptions or being more specific about which exceptions to catch (e.g., json.JSONDecodeError).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The inputSchema properties are missing their "description" fields. While the properties have appropriate defaults and constraints, adding descriptions would improve the developer experience by providing inline documentation about what each parameter does. This is especially important for parameters like "similarity_threshold", "vector_similarity_weight", and "top_k" that require domain knowledge to use effectively.