From 352fa8fbc4ba9669cbc651ff8c1eca0378cd51fe Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Wed, 31 Dec 2025 16:55:48 +0800
Subject: [PATCH 01/19] feat:add PowerRAG SDK and API Proxy

---
 api/apps/sdk/powerrag_proxy.py                |  594 +++++++++
 api/apps/system_app.py                        |    2 +-
 powerrag/sdk/README.md                        | 1148 +++++++++++++++++
 powerrag/sdk/__init__.py                      |   29 +
 powerrag/sdk/client.py                        |  155 +++
 powerrag/sdk/modules/chunk.py                 |   32 +
 powerrag/sdk/modules/chunk_manager.py         |  271 ++++
 powerrag/sdk/modules/document.py              |   40 +
 powerrag/sdk/modules/document_manager.py      |  789 +++++++++++
 powerrag/sdk/modules/extraction.py            |   50 +
 powerrag/sdk/modules/extraction_manager.py    |  239 ++++
 powerrag/sdk/modules/knowledge_base.py        |   34 +
 .../sdk/modules/knowledge_base_manager.py     |  232 ++++
 powerrag/sdk/modules/knowledge_graph.py       |   52 +
 .../sdk/modules/knowledge_graph_manager.py    |  102 ++
 powerrag/sdk/modules/raptor.py                |   29 +
 powerrag/sdk/modules/raptor_manager.py        |   83 ++
 powerrag/sdk/modules/retrieval.py             |   41 +
 powerrag/sdk/modules/retrieval_manager.py     |  158 +++
 powerrag/sdk/tests/conftest.py                |  231 ++++
 powerrag/sdk/tests/pytest.ini                 |   20 +
 powerrag/sdk/tests/test_chunk.py              |  163 +++
 powerrag/sdk/tests/test_document.py           |  430 ++++++
 powerrag/sdk/tests/test_extraction.py         |  137 ++
 powerrag/sdk/tests/test_knowledge_base.py     |  193 +++
 powerrag/sdk/tests/test_knowledge_graph.py    |   90 ++
 powerrag/sdk/tests/test_raptor.py             |   69 +
 powerrag/sdk/tests/test_retrieval.py          |  105 ++
 powerrag/server/app.py                        |   41 +-
 powerrag/server/powerrag_server.py            |   15 +-
 powerrag/server/routes/powerrag_routes.py     |  511 +++++++-
 powerrag/server/routes/task_routes.py         |   12 +-
 powerrag/server/services/convert_service.py   |    4 +-
 powerrag/server/services/extract_service.py   |   22 +-
 powerrag/server/services/parse_service.py     |   97 ++
 .../services/parse_to_md_task_manager.py      |  237 ++++
 powerrag/server/services/split_service.py     |    2 +-
 37 files changed, 6394 insertions(+), 65 deletions(-)
 create mode 100644 api/apps/sdk/powerrag_proxy.py
 create mode 100644 powerrag/sdk/README.md
 create mode 100644 powerrag/sdk/__init__.py
 create mode 100644 powerrag/sdk/client.py
 create mode 100644 powerrag/sdk/modules/chunk.py
 create mode 100644 powerrag/sdk/modules/chunk_manager.py
 create mode 100644 powerrag/sdk/modules/document.py
 create mode 100644 powerrag/sdk/modules/document_manager.py
 create mode 100644 powerrag/sdk/modules/extraction.py
 create mode 100644 powerrag/sdk/modules/extraction_manager.py
 create mode 100644 powerrag/sdk/modules/knowledge_base.py
 create mode 100644 powerrag/sdk/modules/knowledge_base_manager.py
 create mode 100644 powerrag/sdk/modules/knowledge_graph.py
 create mode 100644 powerrag/sdk/modules/knowledge_graph_manager.py
 create mode 100644 powerrag/sdk/modules/raptor.py
 create mode 100644 powerrag/sdk/modules/raptor_manager.py
 create mode 100644 powerrag/sdk/modules/retrieval.py
 create mode 100644 powerrag/sdk/modules/retrieval_manager.py
 create mode 100644 powerrag/sdk/tests/conftest.py
 create mode 100644 powerrag/sdk/tests/pytest.ini
 create mode 100644 powerrag/sdk/tests/test_chunk.py
 create mode 100644 powerrag/sdk/tests/test_document.py
 create mode 100644 powerrag/sdk/tests/test_extraction.py
 create mode 100644 powerrag/sdk/tests/test_knowledge_base.py
 create mode 100644 powerrag/sdk/tests/test_knowledge_graph.py
 create mode 100644 powerrag/sdk/tests/test_raptor.py
 create mode 100644 powerrag/sdk/tests/test_retrieval.py
 create mode 100644 powerrag/server/services/parse_to_md_task_manager.py

diff --git a/api/apps/sdk/powerrag_proxy.py b/api/apps/sdk/powerrag_proxy.py
new file mode 100644
index 000000000..3bafdf867
--- /dev/null
+++ b/api/apps/sdk/powerrag_proxy.py
@@ -0,0 +1,594 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+PowerRAG API Proxy
+
+Proxies PowerRAG API requests to the standalone PowerRAG server (port 6000 by default),
+so the SDK can use PowerRAG features through the main RAGFlow service without connecting to the PowerRAG server directly.
+"""
+
+import os
+import logging
+import httpx
+from quart import request, jsonify
+from api.utils.api_utils import token_required, get_error_data_result
+
+logger = logging.getLogger(__name__)
+
+# The `manager` variable is injected automatically by register_page in api/apps/__init__.py;
+# `# noqa: F821` is used below to silence the undefined-name warning.
+
+# PowerRAG server address.
+# Configurable through the POWERRAG_SERVER_URL environment variable; defaults to http://localhost:6000
+POWERRAG_SERVER_URL = os.environ.get("POWERRAG_SERVER_URL", "http://localhost:6000")
+POWERRAG_API_PREFIX = f"{POWERRAG_SERVER_URL}/api/v1/powerrag"
+
+# Shared async HTTP client (a single instance so connections are pooled and reused)
+_http_client = httpx.AsyncClient(
+    timeout=httpx.Timeout(300.0, connect=10.0),  # 300 s default per operation, 10 s to connect (httpx has no overall request timeout)
+    limits=httpx.Limits(max_keepalive_connections=20, max_connections=100),
+    follow_redirects=True,
+)
+
+
+async def _forward_request(method: str, endpoint: str, tenant_id: str = None):
+    """
+    Forward the current request to the PowerRAG server using the shared async HTTP client.
+    
+    Args:
+        method: HTTP method (GET, POST, PUT, DELETE).
+        endpoint: PowerRAG API endpoint, without the /api/v1/powerrag prefix.
+        tenant_id: Tenant ID (optional; accepted for the caller's convenience, not used here).
+    
+    Returns:
+        A (body, status_code) pair: the upstream JSON body and status, or an error payload.
+    """
+    url = f"{POWERRAG_API_PREFIX}{endpoint}"
+    
+    # Collect the request payload
+    if method == "GET":
+        params = dict(request.args)
+        json_data = None
+        files = None
+        data = None
+    else:
+        params = None
+        json_data = None
+        files = None
+        data = None
+        
+        # Try the JSON body first
+        try:
+            json_data = await request.get_json(silent=True)
+        except Exception:
+            logger.debug("Could not read request body as JSON", exc_info=True)
+        
+        # Without a JSON body, fall back to form fields and uploaded files
+        if json_data is None:
+            try:
+                form = await request.form
+                if form:
+                    data = dict(form)
+            except Exception:
+                logger.warning("Could not read form data; forwarding without it", exc_info=True)
+            
+            try:
+                files_dict = await request.files
+                if files_dict:
+                    # Do not pass dict(files_dict) straight through: the file names
+                    # would be lost. Build the tuple format httpx expects instead.
+                    files = {}
+                    for field_name, file_storage in files_dict.items():
+                        # httpx expects: (filename, content, content_type)
+                        files[field_name] = (
+                            file_storage.filename,
+                            file_storage.read(),
+                            file_storage.content_type or 'application/octet-stream'
+                        )
+            except Exception:
+                logger.warning("Could not read uploaded files; forwarding without them", exc_info=True)
+    
+    # Only the Authorization header is passed through to the upstream server
+    headers = {}
+    if "Authorization" in request.headers:
+        headers["Authorization"] = request.headers["Authorization"]
+    
+    try:
+        logger.info(f"Forwarding {method} {endpoint} to PowerRAG server: {url}")
+        
+        # Send the request through the shared async HTTP client
+        response = await _http_client.request(
+            method=method,
+            url=url,
+            params=params,
+            json=json_data,
+            data=data,
+            files=files,
+            headers=headers,
+        )
+        
+        logger.info(f"PowerRAG server response status: {response.status_code}")
+        
+        # Relay the upstream response
+        try:
+            response_json = response.json()
+            logger.debug(f"Response JSON: {response_json}")
+            return jsonify(response_json), response.status_code
+        except Exception as e:
+            # Not a JSON response: log the details and return an error payload
+            logger.error(f"Failed to parse response as JSON: {e}", exc_info=True)
+            logger.error(f"Response status: {response.status_code}")
+            
+            # Try to read the response text for the error message
+            try:
+                response_text = response.text
+                error_msg = response_text[:200] if response_text else "(empty response body)"
+            except Exception:
+                error_msg = "(unable to read response body)"
+            
+            logger.error(f"Response content (first 200 chars): {error_msg}")
+            logger.error(f"Response headers: {dict(response.headers)}")
+            
+            # Keep an upstream error status; a non-JSON success is reported as 500
+            return get_error_data_result(
+                message=f"PowerRAG server returned invalid JSON response: {error_msg}"
+            ), response.status_code if response.status_code >= 400 else 500
+    
+    except httpx.ConnectError as e:
+        logger.error(f"Failed to connect to PowerRAG server at {POWERRAG_SERVER_URL}: {e}", exc_info=True)
+        return get_error_data_result(
+            message=f"PowerRAG server is not available at {POWERRAG_SERVER_URL}. "
+                   "Please ensure the PowerRAG server is running and POWERRAG_SERVER_URL is correct."
+        ), 503
+    
+    except httpx.TimeoutException:
+        logger.error(f"Request to PowerRAG server timed out: {url}", exc_info=True)
+        return get_error_data_result(
+            message="Request to PowerRAG server timed out"
+        ), 504
+    
+    except httpx.HTTPStatusError as e:  # NOTE(review): only raise_for_status() raises this, and it is not called above
+        logger.error(f"PowerRAG server returned error status {e.response.status_code}: {e}", exc_info=True)
+        try:
+            error_json = e.response.json()
+            return jsonify(error_json), e.response.status_code
+        except Exception:
+            return get_error_data_result(
+                message=f"PowerRAG server error: {e.response.status_code} {e.response.text[:200]}"
+            ), e.response.status_code
+    
+    except Exception as e:
+        logger.error(f"Error forwarding request to PowerRAG server: {e}", exc_info=True)
+        # Prefix the exception type: str(e) alone can be empty, which would
+        # leave the client-facing message without any diagnostic detail.
+        error_msg = f"{type(e).__name__}: {e}"
+        return get_error_data_result(
+            message=f"Error forwarding request to PowerRAG server: {error_msg}"
+        ), 500
+
+
+@manager.route("/powerrag/split", methods=["POST"])  # noqa: F821
+@token_required
+async def split_text_proxy(tenant_id):
+    """
+    Route a text-splitting request to the matching chunker implementation:
+    - PowerRAG chunkers (title, regex, smart) -> forwarded to the PowerRAG server
+    - RAGFlow text chunkers (naive, general) -> handled locally with RAGFlow's naive_merge
+    
+    ---
+    tags:
+      - PowerRAG Proxy
+    security:
+      - ApiKeyAuth: []
+    parameters:
+      - in: body
+        name: body
+        description: Split text parameters.
+        required: true
+        schema:
+          type: object
+          properties:
+            text:
+              type: string
+              description: Text to split.
+            parser_id:
+              type: string
+              description: Parser ID.
+            config:
+              type: object
+              description: Parser configuration.
+      - in: header
+        name: Authorization
+        type: string
+        required: true
+        description: Bearer token for authentication.
+    responses:
+      200:
+        description: Split result.
+    """
+    try:
+        # Read the body; silent=True makes malformed JSON a 400 below instead of a 500
+        data = await request.get_json(silent=True)
+        if not isinstance(data, dict) or not data:
+            return get_error_data_result(message="No JSON data provided"), 400
+        
+        text = data.get("text")
+        parser_id = str(data.get("parser_id") or "title")
+        config = data.get("config") or {}
+        
+        if not isinstance(text, str) or not text:
+            return get_error_data_result(message="text is required"), 400
+        
+        # Plain-text chunkers implemented by PowerRAG (built for text splitting)
+        POWERRAG_CHUNKERS = {"title", "regex", "smart"}
+        
+        # RAGFlow chunkers that can split plain text (via naive_merge)
+        RAGFLOW_TEXT_CHUNKERS = {"naive", "general"}
+        
+        # RAGFlow chunkers that need a document file (no plain-text splitting)
+        # RAGFLOW_DOCUMENT_CHUNKERS = {
+        #     "paper", "book", "laws", "presentation", "manual", 
+        #     "qa", "table", "resume", "picture", "one", 
+        #     "knowledge_graph", "email", "tag"
+        # }
+        
+        if parser_id.lower() in POWERRAG_CHUNKERS:
+            # Forward to the PowerRAG server
+            logger.info(f"Forwarding '{parser_id}' chunker to PowerRAG server")
+            return await _forward_request("POST", "/split", tenant_id)
+            
+        elif parser_id.lower() in RAGFLOW_TEXT_CHUNKERS:
+            # Split the plain text locally with RAGFlow's naive_merge
+            logger.info(f"Using RAGFlow naive_merge for '{parser_id}' chunker")
+            from rag.nlp import naive_merge
+            
+            # Defaults used when the caller's config omits these keys
+            chunk_token_num = config.get("chunk_token_num", 128)
+            delimiter = config.get("delimiter", "\n!?。；！？")
+            
+            # naive_merge takes sections as [(text, position), ...]
+            sections = [(text, "")]
+            
+            # Call RAGFlow's naive_merge
+            chunks = naive_merge(sections, chunk_token_num=chunk_token_num, delimiter=delimiter)
+            
+            # Drop blank chunks
+            chunks = [chunk.strip() for chunk in chunks if chunk.strip()]
+            
+            # Build the response
+            return jsonify({
+                "code": 0,
+                "data": {
+                    "parser_id": parser_id,
+                    "chunks": chunks,
+                    "total_chunks": len(chunks),
+                    "text_length": len(text),
+                    "metadata": {
+                        "chunker": "ragflow",
+                        "config": config
+                    }
+                },
+                "message": "success"
+            }), 200
+        # elif parser_id.lower() in RAGFLOW_DOCUMENT_CHUNKERS:
+        #     # These chunkers need a document file and cannot split plain text
+        #     return get_error_data_result(
+        #         message=f"Chunker '{parser_id}' requires document file processing and does not support pure text splitting. "
+        #                 f"Supported text chunkers are: {', '.join(sorted(POWERRAG_CHUNKERS | RAGFLOW_TEXT_CHUNKERS))}"
+        #     ), 400
+        else:
+            # Unknown chunker
+            return get_error_data_result(
+                message=f"Unknown chunker '{parser_id}'. "
+                        f"Supported text chunkers are: {', '.join(sorted(POWERRAG_CHUNKERS | RAGFLOW_TEXT_CHUNKERS))}"
+            ), 400
+            
+    except Exception as e:
+        logger.error(f"Error in split_text_proxy: {e}", exc_info=True)
+        return get_error_data_result(message=f"Failed to split text: {str(e)}"), 500
+
+
+@manager.route("/powerrag/extract", methods=["POST"])  # noqa: F821
+@token_required
+async def extract_from_document_proxy(tenant_id):
+    """
+    Proxy extract_from_document API requests to the PowerRAG server.
+    """
+    return await _forward_request("POST", "/extract", tenant_id)
+
+
+@manager.route("/powerrag/extract/text", methods=["POST"])  # noqa: F821
+@token_required
+async def extract_from_text_proxy(tenant_id):
+    """
+    Proxy extract_from_text API requests to the PowerRAG server.
+    """
+    return await _forward_request("POST", "/extract/text", tenant_id)
+
+
+@manager.route("/powerrag/extract/batch", methods=["POST"])  # noqa: F821
+@token_required
+async def extract_batch_proxy(tenant_id):
+    """
+    Proxy extract_batch API requests to the PowerRAG server.
+    """
+    return await _forward_request("POST", "/extract/batch", tenant_id)
+
+
+@manager.route("/powerrag/struct_extract/submit", methods=["POST"])  # noqa: F821
+@token_required
+async def struct_extract_submit_proxy(tenant_id):
+    """
+    Proxy struct_extract/submit API requests to the PowerRAG server.
+    """
+    return await _forward_request("POST", "/struct_extract/submit", tenant_id)
+
+
+@manager.route("/powerrag/struct_extract/status/<task_id>", methods=["GET"])  # noqa: F821
+@token_required
+async def struct_extract_status_proxy(tenant_id, task_id):
+    """
+    Proxy struct_extract/status API requests to the PowerRAG server.
+    """
+    return await _forward_request("GET", f"/struct_extract/status/{task_id}", tenant_id)
+
+
+@manager.route("/powerrag/parse", methods=["POST"])  # noqa: F821
+@token_required
+async def parse_document_proxy(tenant_id):
+    """
+    Proxy parse API requests to the PowerRAG server.
+    """
+    return await _forward_request("POST", "/parse", tenant_id)
+
+
+@manager.route("/powerrag/parse/batch", methods=["POST"])  # noqa: F821
+@token_required
+async def parse_batch_proxy(tenant_id):
+    """
+    Proxy parse/batch API requests to the PowerRAG server.
+    """
+    return await _forward_request("POST", "/parse/batch", tenant_id)
+
+
+@manager.route("/powerrag/parse/upload", methods=["POST"])  # noqa: F821
+@token_required
+async def parse_upload_proxy(tenant_id):
+    """
+    Proxy parse/upload API requests to the PowerRAG server.
+    """
+    return await _forward_request("POST", "/parse/upload", tenant_id)
+
+
+@manager.route("/powerrag/convert", methods=["POST"])  # noqa: F821
+@token_required
+async def convert_document_proxy(tenant_id):
+    """
+    Proxy convert API requests to the PowerRAG server.
+    """
+    return await _forward_request("POST", "/convert", tenant_id)
+
+
+@manager.route("/powerrag/convert/upload", methods=["POST"])  # noqa: F821
+@token_required
+async def convert_upload_proxy(tenant_id):
+    """
+    Proxy convert/upload API requests to the PowerRAG server.
+    """
+    return await _forward_request("POST", "/convert/upload", tenant_id)
+
+
+@manager.route("/powerrag/parse_to_md", methods=["POST"])  # noqa: F821
+@token_required
+async def parse_to_md_proxy(tenant_id):
+    """
+    Proxy parse_to_md API requests to the PowerRAG server.
+    
+    Parses a document into Markdown without chunking it.
+    Intended for cases that need the full document content or where an external system does its own chunking.
+    
+    Supported file formats:
+    - PDF (.pdf)
+    - Office documents (.doc, .docx, .ppt, .pptx)
+    - Images (.jpg, .png)
+    - HTML (.html, .htm)
+    - Markdown (.md)
+    
+    ---
+    tags:
+      - PowerRAG Proxy
+    security:
+      - ApiKeyAuth: []
+    parameters:
+      - in: body
+        name: body
+        description: Parse to markdown parameters.
+        required: true
+        schema:
+          type: object
+          properties:
+            doc_id:
+              type: string
+              required: true
+              description: RAGFlow document ID.
+            config:
+              type: object
+              description: Parser configuration.
+              properties:
+                layout_recognize:
+                  type: string
+                  description: Layout recognition engine (mineru or dots_ocr).
+                enable_ocr:
+                  type: boolean
+                  description: Enable OCR.
+                enable_formula:
+                  type: boolean
+                  description: Enable formula recognition.
+                enable_table:
+                  type: boolean
+                  description: Enable table recognition.
+                from_page:
+                  type: integer
+                  description: Start page number (for PDF).
+                to_page:
+                  type: integer
+                  description: End page number (for PDF).
+      - in: header
+        name: Authorization
+        type: string
+        required: true
+        description: Bearer token for authentication.
+    responses:
+      200:
+        description: Parse to markdown result.
+        schema:
+          type: object
+          properties:
+            code:
+              type: integer
+            data:
+              type: object
+              properties:
+                doc_id:
+                  type: string
+                doc_name:
+                  type: string
+                markdown:
+                  type: string
+                markdown_length:
+                  type: integer
+                images:
+                  type: object
+                total_images:
+                  type: integer
+            message:
+              type: string
+    """
+    return await _forward_request("POST", "/parse_to_md", tenant_id)
+
+
+@manager.route("/powerrag/parse_to_md/async", methods=["POST"])  # noqa: F821
+@token_required
+async def parse_to_md_async_proxy(tenant_id):
+    """
+    Proxy parse_to_md/async API requests to the PowerRAG server (asynchronous task submission).
+    
+    Parses a document into Markdown asynchronously and returns a task ID.
+    Intended for large documents or long-running processing.
+    
+    ---
+    tags:
+      - PowerRAG Proxy
+    security:
+      - ApiKeyAuth: []
+    parameters:
+      - in: body
+        name: body
+        required: true
+        schema:
+          type: object
+          properties:
+            doc_id:
+              type: string
+              description: Document ID
+            config:
+              type: object
+              description: Parser configuration
+      - in: header
+        name: Authorization
+        type: string
+        required: true
+        description: Bearer token for authentication.
+    responses:
+      200:
+        description: Task submitted successfully, returns task_id.
+    """
+    return await _forward_request("POST", "/parse_to_md/async", tenant_id)
+
+
+@manager.route("/powerrag/parse_to_md/status/<task_id>", methods=["GET"])  # noqa: F821
+@token_required
+async def parse_to_md_status_proxy(tenant_id, task_id):
+    """
+    Proxy parse_to_md/status API requests to the PowerRAG server (task status query).
+    
+    Queries the status and result of an asynchronous parse task.
+    
+    ---
+    tags:
+      - PowerRAG Proxy
+    security:
+      - ApiKeyAuth: []
+    parameters:
+      - in: path
+        name: task_id
+        type: string
+        required: true
+        description: Task ID returned from async submission
+      - in: header
+        name: Authorization
+        type: string
+        required: true
+        description: Bearer token for authentication.
+    responses:
+      200:
+        description: Task status and result.
+    """
+    return await _forward_request("GET", f"/parse_to_md/status/{task_id}", tenant_id)
+
+
+@manager.route("/powerrag/parse_to_md/upload", methods=["POST"])  # noqa: F821
+@token_required
+async def parse_to_md_upload_proxy(tenant_id):
+    """
+    Proxy parse_to_md/upload API requests to the PowerRAG server.
+    
+    Uploads a file directly and parses it into Markdown without chunking.
+    
+    Supported file formats:
+    - PDF (.pdf)
+    - Office documents (.doc, .docx, .ppt, .pptx)
+    - Images (.jpg, .png)
+    - HTML (.html, .htm)
+    - Markdown (.md)
+    
+    ---
+    tags:
+      - PowerRAG Proxy
+    security:
+      - ApiKeyAuth: []
+    parameters:
+      - in: formData
+        name: file
+        type: file
+        required: true
+        description: File to parse (PDF, Office (doc/docx/ppt/pptx), Images (jpg/png), HTML, Markdown).
+      - in: formData
+        name: config
+        type: string
+        description: JSON string of parser configuration.
+      - in: header
+        name: Authorization
+        type: string
+        required: true
+        description: Bearer token for authentication.
+    responses:
+      200:
+        description: Parse to markdown result.
+    """
+    return await _forward_request("POST", "/parse_to_md/upload", tenant_id)
+
diff --git a/api/apps/system_app.py b/api/apps/system_app.py
index 7e646927e..640fbe37a 100644
--- a/api/apps/system_app.py
+++ b/api/apps/system_app.py
@@ -217,7 +217,7 @@ def new_token():
         obj = {
             "tenant_id": tenant_id,
             "token": generate_confirmation_token(),
-            "beta": generate_confirmation_token().replace("ragflow-", "")[:32],
+            "beta": generate_confirmation_token().replace("powerrag-", "")[:32],
             "create_time": current_timestamp(),
             "create_date": datetime_format(datetime.now()),
             "update_time": None,
diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md
new file mode 100644
index 000000000..81b69be76
--- /dev/null
+++ b/powerrag/sdk/README.md
@@ -0,0 +1,1148 @@
+# PowerRAG SDK
+
+[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![Python](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+
+PowerRAG SDK 是一个功能强大的 Python SDK，为 PowerRAG API 提供了简单易用的接口，支持知识库管理、文档处理、Markdown 解析、文本切片、信息抽取、RAPTOR 构建、知识图谱和检索等功能。
+
+## 特性
+
+- 🚀 **简单易用**: 面向对象的 API 设计，直观的方法调用
+- 📚 **完整功能**: 支持 PowerRAG 所有核心功能模块
+- 🔄 **异步支持**: 支持异步任务的状态查询和轮询等待
+- 📦 **批量操作**: 支持批量上传、删除、抽取等操作
+- 📝 **Markdown 解析**: 支持文档解析为 Markdown 格式（同步/异步）
+- 🎯 **类型提示**: 完整的类型注解，IDE 友好
+- ✅ **全面测试**: 包含完整的测试用例
+
+## 安装
+
+### 方式 1: 使用 pip（推荐）
+
+```bash
+pip install powerrag-sdk
+```
+
+### 方式 2: 从源码安装
+
+```bash
+git clone https://github.com/oceanbase/powerrag.git
+cd powerrag
+pip install -e .
+```
+
+### 方式 3: 仅安装 SDK 模块
+
+如果你只需要 SDK 功能：
+
+```bash
+git clone https://github.com/oceanbase/powerrag.git
+cd powerrag/powerrag/sdk
+pip install -e .
+```
+
+### 依赖要求
+
+- Python 3.8+
+- requests >= 2.28.0
+- typing-extensions (Python < 3.11)
+
+### 验证安装
+
+```python
+from powerrag.sdk import PowerRAGClient
+
+print("PowerRAG SDK installed successfully!")
+```
+
+## 快速开始
+
+### 初始化客户端
+
+```python
+from powerrag.sdk import PowerRAGClient
+
+# 创建客户端
+client = PowerRAGClient(
+    api_key="your-api-key",
+    base_url="http://localhost:9380"
+)
+```
+
+### 创建知识库
+
+```python
+# 创建知识库
+kb = client.knowledge_base.create(
+    name="my_knowledge_base",
+    description="My first knowledge base",
+    chunk_method="naive"
+)
+print(f"Knowledge Base ID: {kb['id']}")
+```
+
+### 上传文档
+
+```python
+# 上传单个文档
+docs = client.document.upload(kb['id'], "document.pdf")
+
+# 上传多个文档
+docs = client.document.upload(kb['id'], ["doc1.pdf", "doc2.pdf", "doc3.pdf"])
+```
+
+### 解析文档为切片
+
+```python
+# 异步解析文档为切片
+task_id = client.document.parse_to_chunk(
+    kb['id'], 
+    [docs[0]['id']], 
+    wait=False
+)
+
+# 同步解析并等待完成
+results = client.document.parse_to_chunk(
+    kb['id'], 
+    [docs[0]['id']], 
+    wait=True,
+    delete_existing=False
+)
+```
+
+### 解析文档为 Markdown
+
+```python
+# 同步解析为 Markdown（不切分）
+result = client.document.parse_to_md(
+    doc_id=docs[0]['id'],
+    config={
+        "layout_recognize": "mineru",  # 或 "dots_ocr"
+        "enable_ocr": False,
+        "enable_formula": False,
+        "enable_table": True
+    }
+)
+print(f"Markdown: {result['markdown']}")
+print(f"Total images: {result['total_images']}")
+
+# 异步解析为 Markdown
+task_id = client.document.parse_to_md_async(
+    doc_id=docs[0]['id'],
+    config={"layout_recognize": "mineru"}
+)
+
+# 查询异步任务状态
+status = client.document.get_parse_to_md_status(task_id)
+print(f"Status: {status['status']}")
+
+# 等待任务完成
+result = client.document.wait_for_parse_to_md(task_id, timeout=300)
+
+# 直接上传文件并解析为 Markdown
+result = client.document.parse_to_md_upload(
+    "document.pdf",
+    config={"layout_recognize": "mineru"}
+)
+```
+
+### 检索
+
+```python
+# 执行检索
+result = client.retrieval.search(
+    kb_ids=[kb['id']],
+    question="什么是 PowerRAG?",
+    page_size=10,
+    similarity_threshold=0.2
+)
+
+# 打印结果
+for chunk in result['chunks']:
+    print(f"Content: {chunk['content']}")
+    print(f"Score: {chunk['similarity']}")
+    print(f"Document: {chunk['document_name']}")
+```
+
+## 核心功能亮点
+
+### 文档解析为 Markdown
+
+PowerRAG SDK 提供了强大的文档解析为 Markdown 的功能，支持多种文档格式：
+
+**支持的格式：**
+- PDF (.pdf)
+- Office 文档 (.doc, .docx, .ppt, .pptx)
+- 图片 (.jpg, .png)
+- HTML (.html, .htm) 和 Markdown (.md)
+
+**三种使用方式：**
+
+1. **同步解析**（适合小文档）：
+```python
+result = client.document.parse_to_md(doc_id, config={...})
+```
+
+2. **异步解析**（适合大文档）：
+```python
+task_id = client.document.parse_to_md_async(doc_id, config={...})
+status = client.document.get_parse_to_md_status(task_id)
+# 或等待完成
+result = client.document.wait_for_parse_to_md(task_id, timeout=300)
+```
+
+3. **直接上传解析**（无需知识库）：
+```python
+result = client.document.parse_to_md_upload("file.pdf", config={...})
+```
+
+**配置选项：**
+- `layout_recognize`: 布局识别引擎 (`"mineru"` 或 `"dots_ocr"`)
+- `enable_ocr`: 是否启用 OCR
+- `enable_formula`: 是否识别公式
+- `enable_table`: 是否识别表格
+- `from_page`/`to_page`: PDF 页面范围
+
+### 结构化信息抽取
+
+支持使用 LangExtract 进行结构化信息抽取：
+
+```python
+task = client.extraction.struct_extract(
+    text_or_documents="...",
+    prompt_description="Extract person information",
+    examples=[...],
+    temperature=0.0
+)
+status = client.extraction.get_struct_extract_status(task['task_id'])
+```
+
+### 文本切片
+
+无需上传文档即可对文本进行切片：
+
+```python
+result = client.chunk.split_text(
+    text="# Title\n\nContent...",
+    parser_id="title",
+    config={"chunk_token_num": 512}
+)
+```
+
+## 核心模块
+
+PowerRAG SDK 包含以下 7 个核心模块：
+
+### 1. 知识库管理 (Knowledge Base)
+
+管理知识库的创建、查询、更新和删除。
+
+```python
+# 创建知识库
+kb = client.knowledge_base.create(
+    name="test_kb",
+    description="Test knowledge base",
+    embedding_model="BAAI/bge-small-en-v1.5@Builtin",
+    permission="me",
+    chunk_method="naive"
+)
+
+# 获取知识库
+kb_info = client.knowledge_base.get(kb['id'])
+
+# 列出知识库
+kbs, total = client.knowledge_base.list(
+    name="test",
+    page=1,
+    page_size=10
+)
+
+# 更新知识库
+updated_kb = client.knowledge_base.update(
+    kb['id'],
+    description="Updated description",
+    pagerank=True
+)
+
+# 删除知识库
+client.knowledge_base.delete([kb['id']])
+```
+
+### 2. 文档管理 (Document)
+
+处理文档的上传、列表、查询、更新、删除、下载和解析。
+
+```python
+# 上传文档
+docs = client.document.upload(kb_id, ["file1.pdf", "file2.docx"])
+
+# 从URL上传文档
+success = client.document.upload_from_url(
+    kb_id,
+    url="https://example.com/doc.pdf",
+    name="document.pdf"
+)
+
+# 列出文档
+docs, total = client.document.list(
+    kb_id,
+    name="report",
+    page=1,
+    page_size=20,
+    keywords="机器学习",  # 关键词搜索
+    suffix=["pdf", "docx"],  # 按后缀过滤
+    run=["DONE", "FAIL"]  # 按状态过滤
+)
+
+# 获取文档详情
+doc = client.document.get(kb_id, doc_id)
+
+# 更新文档
+updated_doc = client.document.update(
+    kb_id,
+    doc_id,
+    name="new_name.pdf",
+    meta_fields={"author": "John", "category": "AI"},
+    enabled=True
+)
+
+# 快捷方法：重命名文档
+client.document.rename(kb_id, doc_id, "renamed.pdf")
+
+# 快捷方法：设置元数据
+client.document.set_meta(kb_id, doc_id, {"version": "1.0"})
+
+# 下载文档
+# 下载为字节流
+file_bytes = client.document.download(kb_id, doc_id)
+
+# 下载到文件
+saved_path = client.document.download(kb_id, doc_id, save_path="downloaded.pdf")
+
+# 解析文档为切片（异步）
+task_id = client.document.parse_to_chunk(kb_id, [doc_id], wait=False)
+
+# 解析文档为切片（同步等待）
+results = client.document.parse_to_chunk(
+    kb_id, 
+    [doc_id], 
+    wait=True,
+    delete_existing=False,  # 是否删除已有切片
+    config={"max_token": 512}  # 自定义配置
+)
+
+# 解析文档为 Markdown（同步）
+result = client.document.parse_to_md(
+    doc_id,
+    config={
+        "layout_recognize": "mineru",  # mineru 或 dots_ocr
+        "enable_ocr": False,
+        "enable_formula": False,
+        "enable_table": True,
+        "from_page": 0,  # PDF起始页
+        "to_page": 100   # PDF结束页
+    }
+)
+print(result['markdown'])
+
+# 解析文档为 Markdown（异步）
+task_id = client.document.parse_to_md_async(doc_id, config={...})
+
+# 查询 parse_to_md 任务状态
+status = client.document.get_parse_to_md_status(task_id)
+if status["status"] == "success":
+    print(status["result"]["markdown"])
+
+# 等待 parse_to_md 任务完成
+result = client.document.wait_for_parse_to_md(task_id, timeout=300)
+
+# 上传并解析为 Markdown（无需知识库）
+result = client.document.parse_to_md_upload("file.pdf", config={...})
+
+# 解析URL文档（同步等待）
+doc = client.document.parse_url(
+    kb_id,
+    url="https://example.com/doc.pdf",
+    name="web_doc.pdf",
+    wait=True
+)
+
+# 取消解析任务
+client.document.cancel_parse(kb_id, [doc_id])
+
+# 删除文档
+client.document.delete(kb_id, [doc_id])
+```
+
+### 3. 切片管理 (Chunk)
+
+管理文档切片的查询、创建、更新、删除和文本切片。
+
+```python
+# 列出文档的切片
+chunks, total, doc_info = client.chunk.list(
+    kb_id,
+    doc_id,
+    keywords="机器学习",
+    page=1,
+    page_size=30
+)
+
+# 获取切片详情
+chunk = client.chunk.get(kb_id, doc_id, chunk_id)
+
+# 创建切片
+chunk = client.chunk.create(
+    kb_id,
+    doc_id,
+    content="This is a chunk content",
+    important_keywords=["keyword1", "keyword2"],
+    questions=["What is this about?"]
+)
+
+# 更新切片
+updated_chunk = client.chunk.update(
+    kb_id,
+    doc_id,
+    chunk_id,
+    content="Updated content",
+    important_keywords=["new_keyword"],
+    questions=["Updated question?"],
+    available=True,
+    positions=[[1, 0, 100, 0, 50]]  # 每个位置为 5 个整数
+)
+
+# 删除切片
+client.chunk.delete(kb_id, doc_id, [chunk_id])
+
+# 删除文档的所有切片
+client.chunk.delete(kb_id, doc_id, None)
+
+# 文本切片（无需上传文档）
+result = client.chunk.split_text(
+    text="# Title\n\nLong text to be chunked...",
+    parser_id="title",  # 解析器ID
+    config={"chunk_token_num": 512}  # 自定义配置
+)
+print(f"Total chunks: {result['total_chunks']}")
+for chunk in result['chunks']:
+    print(chunk['content'])
+```
+
+### 4. 信息抽取 (Extraction)
+
+从文档或文本中抽取实体、关键词、摘要等信息。
+
+```python
+# 从文档抽取
+result = client.extraction.extract_from_document(
+    doc_id=doc_id,
+    extractor_type="entity",  # entity, keyword, summary
+    config={
+        "entity_types": ["PERSON", "ORG", "LOC"],
+        "use_regex": True,
+        "use_llm": False
+    }
+)
+print(result['entities'])
+
+# 从文本抽取
+result = client.extraction.extract_from_text(
+    text="PowerRAG is an advanced RAG framework developed by OceanBase",
+    extractor_type="entity",
+    config={"entity_types": ["ORG", "PRODUCT"]}
+)
+
+# 抽取关键词
+result = client.extraction.extract_from_document(
+    doc_id=doc_id,
+    extractor_type="keyword",
+    config={
+        "max_keywords": 20,
+        "min_word_length": 3
+    }
+)
+
+# 抽取摘要
+result = client.extraction.extract_from_document(
+    doc_id=doc_id,
+    extractor_type="summary",
+    config={
+        "max_length": 200,
+        "min_length": 50
+    }
+)
+
+# 批量抽取
+results = client.extraction.extract_batch(
+    doc_ids=[doc_id1, doc_id2, doc_id3],
+    extractor_type="keyword",
+    config={"max_keywords": 15}
+)
+for result in results:
+    if result['success']:
+        print(f"Doc {result['doc_id']}: {result['data']}")
+
+# 结构化抽取 (LangExtract)
+task = client.extraction.struct_extract(
+    text_or_documents="John Doe is 30 years old. His email is john@example.com",
+    prompt_description="Extract person information including name, age, and email",
+    examples=[
+        {
+            "text": "Jane Smith is 25 years old. Email: jane@example.com",
+            "extractions": [
+                {"name": "Jane Smith", "age": 25, "email": "jane@example.com"}
+            ]
+        }
+    ],
+    fetch_urls=False,
+    max_char_buffer=1000,
+    temperature=0.0,
+    extraction_passes=1
+)
+print(f"Task ID: {task['task_id']}")
+
+# 获取结构化抽取状态
+status = client.extraction.get_struct_extract_status(task['task_id'])
+print(f"Status: {status['status']}")
+if status['status'] == 'completed':
+    print(f"Result: {status['result']}")
+```
+
+### 5. RAPTOR
+
+构建和管理 RAPTOR（Recursive Abstractive Processing for Tree-Organized Retrieval）。
+
+**注意**: RAPTOR 的配置参数需要在创建或更新知识库时通过 `parser_config.raptor` 设置。
+
+```python
+# 创建知识库时配置 RAPTOR 参数
+kb = client.knowledge_base.create(
+    name="raptor_kb",
+    chunk_method="naive",
+    parser_config={
+        "raptor": {
+            "max_cluster": 64,
+            "random_seed": 224,
+            "llm_model": "deepseek-chat"
+        }
+    }
+)
+
+# 构建 RAPTOR（异步）
+task = client.raptor.build(kb_id)
+print(f"RAPTOR Task ID: {task['raptor_task_id']}")
+
+# 获取 RAPTOR 构建状态
+status = client.raptor.get_status(kb_id)
+if status:
+    print(f"Status: {status['status']}")
+    print(f"Progress: {status['progress']}")
+else:
+    print("No RAPTOR task found")
+```
+
+### 6. 知识图谱 (Knowledge Graph)
+
+构建和管理知识图谱。
+
+**注意**: 知识图谱的配置参数需要在创建或更新知识库时通过 `parser_config.graphrag` 设置。
+
+```python
+# 创建知识库时配置知识图谱参数
+kb = client.knowledge_base.create(
+    name="kg_kb",
+    chunk_method="naive",
+    parser_config={
+        "graphrag": {
+            "entity_types": ["PERSON", "ORG", "LOC", "EVENT"],
+            "llm_model": "deepseek-chat"
+        }
+    }
+)
+
+# 构建知识图谱（异步）
+task = client.knowledge_graph.build(kb_id)
+print(f"Knowledge Graph Task ID: {task['graphrag_task_id']}")
+
+# 获取知识图谱数据
+kg = client.knowledge_graph.get(kb_id)
+print(f"Graph nodes: {len(kg['graph'].get('nodes', []))}")
+print(f"Graph edges: {len(kg['graph'].get('edges', []))}")
+print(f"Mind map: {kg['mind_map']}")
+
+# 获取构建状态
+status = client.knowledge_graph.get_status(kb_id)
+if status:
+    print(f"Status: {status['status']}")
+    print(f"Progress: {status['progress']}")
+else:
+    print("No knowledge graph task found")
+```
+
+### 7. 检索 (Retrieval)
+
+执行语义检索和混合检索。
+
+```python
+# 基本检索
+result = client.retrieval.search(
+    kb_ids=[kb_id],
+    question="What is PowerRAG?",
+    page=1,
+    page_size=10
+)
+
+# 打印结果
+print(f"Total results: {result['total']}")
+for chunk in result['chunks']:
+    print(f"Content: {chunk['content']}")
+    print(f"Similarity: {chunk['similarity']}")
+    print(f"Document: {chunk['document_name']}")
+
+# 高级检索
+result = client.retrieval.search(
+    kb_ids=[kb_id1, kb_id2],
+    question="机器学习的应用",
+    document_ids=[doc_id],  # 限定文档范围
+    page=1,
+    page_size=30,
+    similarity_threshold=0.3,  # 相似度阈值
+    vector_similarity_weight=0.3,  # 向量相似度权重（混合检索）
+    top_k=1024,  # 候选集大小（参与向量相似度计算的切片数量）
+    keyword=True,  # 启用关键词增强
+    use_kg=True,  # 使用知识图谱检索
+    rerank_id="bge-reranker-v2-m3",  # 重排序模型
+    highlight=True,  # 高亮匹配内容
+    cross_languages=["en", "zh"],  # 跨语言检索
+    metadata_condition={"status": "published", "year": 2024}  # 元数据过滤
+)
+
+# 检索测试（与 search 功能相同，用于测试场景）
+test_result = client.retrieval.test(
+    kb_ids=[kb_id],
+    question="测试查询",
+    page=1,
+    page_size=50,
+    similarity_threshold=0.2,
+    keyword=True,
+    use_kg=False
+)
+```
+
+## 完整示例
+
+以下是一个完整的工作流程示例：
+
+```python
+from powerrag.sdk import PowerRAGClient
+import time
+
+# 初始化客户端
+client = PowerRAGClient(
+    api_key="your-api-key",
+    base_url="http://localhost:9380"
+)
+
+# 1. 创建知识库
+kb = client.knowledge_base.create(
+    name="research_papers",
+    description="Collection of AI research papers",
+    chunk_method="naive"
+)
+print(f"Created knowledge base: {kb['id']}")
+
+# 2. 上传文档
+docs = client.document.upload(
+    kb['id'],
+    ["paper1.pdf", "paper2.pdf", "paper3.pdf"]
+)
+print(f"Uploaded {len(docs)} documents")
+
+# 3. 解析文档为切片（同步等待）
+doc_ids = [doc['id'] for doc in docs]
+results = client.document.parse_to_chunk(kb['id'], doc_ids, wait=True)
+print(f"Parsed {len(results)} documents")
+for result in results:
+    print(f"Doc {result['doc_id']}: {result['status']}, {result['chunk_count']} chunks")
+
+# 4. 构建知识图谱（可选）
+kg_task = client.knowledge_graph.build(kb['id'])
+print(f"Building knowledge graph: {kg_task['graphrag_task_id']}")
+
+# 等待知识图谱构建完成
+while True:
+    status = client.knowledge_graph.get_status(kb['id'])
+    if not status:
+        break
+    print(f"KG status: {status['status']}, progress: {status.get('progress', 0)}")
+    if status['status'] in ['DONE', 'FAIL']:
+        break
+    time.sleep(5)
+
+# 5. 执行检索
+result = client.retrieval.search(
+    kb_ids=[kb['id']],
+    question="What are the latest advances in transformer models?",
+    page_size=5,
+    similarity_threshold=0.2,
+    use_kg=True,
+    highlight=True
+)
+
+# 打印检索结果
+print(f"\nFound {result['total']} results:")
+for i, chunk in enumerate(result['chunks'], 1):
+    print(f"\n{i}. Score: {chunk['similarity']:.3f}")
+    print(f"   Content: {chunk['content'][:200]}...")
+    print(f"   Document: {chunk['document_name']}")
+
+# 6. 从文档抽取关键信息
+for doc_id in doc_ids[:3]:  # 抽取前3个文档
+    extraction = client.extraction.extract_from_document(
+        doc_id=doc_id,
+        extractor_type="keyword",
+        config={"max_keywords": 10}
+    )
+    print(f"\nExtracted keywords from {doc_id}: {extraction.get('keywords', [])}")
+
+# 7. 解析文档为 Markdown（可选）
+md_result = client.document.parse_to_md(
+    doc_id=doc_ids[0],
+    config={"layout_recognize": "mineru"}
+)
+print(f"\nMarkdown length: {md_result['markdown_length']}")
+print(f"Total images: {md_result['total_images']}")
+
+# 8. 清理（如需要）
+# 删除特定文档
+# client.document.delete(kb['id'], [doc_ids[0]])
+
+# 删除整个知识库（包括所有文档）
+# client.knowledge_base.delete([kb['id']])
+```
+
+## 环境配置
+
+以下环境变量是可选的，主要供测试套件使用；`PowerRAGClient` 本身不会读取环境变量：
+
+```bash
+# PowerRAG 服务地址
+export HOST_ADDRESS="http://127.0.0.1:9380"
+
+# API 密钥
+export POWERRAG_API_KEY="your-api-key"
+```
+
+创建客户端时需在代码中显式传入 `api_key`（`base_url` 和 `version` 可省略，使用默认值）：
+
+```python
+client = PowerRAGClient(
+    api_key="your-api-key",
+    base_url="http://127.0.0.1:9380",
+    version="v1"  # API 版本，默认为 v1
+)
+```
+
+## 测试
+
+SDK 包含完整的测试套件，覆盖所有功能模块。
+
+### 运行所有测试
+
+```bash
+# 设置环境变量
+export HOST_ADDRESS="http://127.0.0.1:9380"
+export POWERRAG_API_KEY="your-api-key"
+
+# 运行测试
+pytest powerrag/sdk/tests/
+```
+
+### 运行特定模块测试
+
+```bash
+# 测试知识库模块
+pytest powerrag/sdk/tests/test_knowledge_base.py
+
+# 测试文档模块
+pytest powerrag/sdk/tests/test_document.py
+
+# 测试检索模块
+pytest powerrag/sdk/tests/test_retrieval.py
+```
+
+更多测试说明请参考 [tests/README.md](tests/README.md)。
+
+## 项目结构
+
+```
+powerrag/sdk/
+├── __init__.py                      # SDK 入口，导出 PowerRAGClient
+├── client.py                        # 主客户端类，提供 HTTP 请求方法
+├── README.md                        # SDK 文档（本文件）
+├── modules/                         # 功能模块
+│   ├── knowledge_base.py            # 知识库数据模型 (TypedDict)
+│   ├── knowledge_base_manager.py    # 知识库管理器
+│   ├── document.py                  # 文档数据模型 (TypedDict)
+│   ├── document_manager.py          # 文档管理器
+│   ├── chunk.py                     # 切片数据模型 (TypedDict)
+│   ├── chunk_manager.py             # 切片管理器
+│   ├── extraction.py                # 抽取数据模型 (TypedDict)
+│   ├── extraction_manager.py        # 抽取管理器
+│   ├── raptor.py                    # RAPTOR 数据模型 (TypedDict)
+│   ├── raptor_manager.py            # RAPTOR 管理器
+│   ├── knowledge_graph.py           # 知识图谱数据模型 (TypedDict)
+│   ├── knowledge_graph_manager.py   # 知识图谱管理器
+│   ├── retrieval.py                 # 检索数据模型 (TypedDict)
+│   └── retrieval_manager.py         # 检索管理器
+└── tests/                           # 完整的测试套件
+    ├── README.md                    # 测试文档
+    ├── conftest.py                  # pytest 配置和 fixtures
+    ├── pytest.ini                   # pytest 配置文件
+    ├── test_knowledge_base.py       # 知识库测试
+    ├── test_document.py             # 文档测试
+    ├── test_chunk.py                # 切片测试
+    ├── test_extraction.py           # 抽取测试
+    ├── test_raptor.py               # RAPTOR 测试
+    ├── test_knowledge_graph.py      # 知识图谱测试
+    └── test_retrieval.py            # 检索测试
+```
+
+## API 参考
+
+### PowerRAGClient
+
+主客户端类，提供对所有功能模块的访问。
+
+**初始化参数：**
+- `api_key` (str): API 密钥，必填
+- `base_url` (str): 服务地址，默认 `"http://localhost:9380"`
+- `version` (str): API 版本，默认 `"v1"`
+
+**属性：**
+- `knowledge_base` (KnowledgeBaseManager): 知识库管理器
+- `document` (DocumentManager): 文档管理器
+- `chunk` (ChunkManager): 切片管理器
+- `extraction` (ExtractionManager): 抽取管理器
+- `raptor` (RAPTORManager): RAPTOR 管理器
+- `knowledge_graph` (KnowledgeGraphManager): 知识图谱管理器
+- `retrieval` (RetrievalManager): 检索管理器
+
+**内部方法：**
+- `post(url, json=None, files=None, data=None, stream=False)`: POST 请求
+- `get(url, params=None, stream=False)`: GET 请求
+- `put(url, json=None)`: PUT 请求
+- `delete(url, json=None, params=None)`: DELETE 请求
+
+### 数据模型
+
+所有数据模型都使用 `TypedDict` 定义，提供完整的类型提示：
+
+**知识库相关：**
+- `KnowledgeBaseInfo`: 知识库信息
+
+**文档相关：**
+- `DocumentInfo`: 文档信息
+
+**切片相关：**
+- `ChunkInfo`: 切片信息
+
+**抽取相关：**
+- `ExtractionResult`: 抽取结果
+- `StructExtractTaskInfo`: 结构化抽取任务信息
+
+**RAPTOR 相关：**
+- `RAPTORTaskInfo`: RAPTOR 任务信息
+
+**知识图谱相关：**
+- `KnowledgeGraphData`: 知识图谱数据
+- `KnowledgeGraphTaskInfo`: 知识图谱任务信息
+
+**检索相关：**
+- `RetrievalResult`: 检索结果
+
+## 最佳实践
+
+### 1. 错误处理
+
+SDK 会抛出异常，建议在生产环境中进行适当的错误处理：
+
+```python
+from requests.exceptions import RequestException
+
+try:
+    result = client.retrieval.search(
+        kb_ids=[kb_id],
+        question="test query"
+    )
+except RequestException as e:
+    print(f"Network error: {e}")
+except Exception as e:
+    print(f"API error: {e}")
+```
+
+### 2. 异步任务处理
+
+对于长时间运行的任务，建议使用异步方式：
+
+```python
+# 提交任务
+task_id = client.document.parse_to_chunk(kb_id, doc_ids, wait=False)
+
+# 轮询状态
+import time
+for doc_id in doc_ids:
+    while True:
+        doc = client.document.get(kb_id, doc_id)
+        if doc['run'] in ['DONE', 'FAIL']:
+            break
+        time.sleep(2)
+```
+
+### 3. 批量操作
+
+充分利用批量操作提高效率：
+
+```python
+# 批量上传
+docs = client.document.upload(kb_id, ["doc1.pdf", "doc2.pdf", "doc3.pdf"])
+
+# 批量解析
+doc_ids = [doc['id'] for doc in docs]
+results = client.document.parse_to_chunk(kb_id, doc_ids, wait=True)
+
+# 批量抽取
+results = client.extraction.extract_batch(doc_ids, extractor_type="keyword")
+```
+
+### 4. 知识库配置
+
+在创建知识库时就配置好所需的参数：
+
+```python
+kb = client.knowledge_base.create(
+    name="my_kb",
+    chunk_method="naive",
+    embedding_model="BAAI/bge-large-zh-v1.5@Builtin",
+    parser_config={
+        "chunk_token_num": 512,
+        "raptor": {
+            "max_cluster": 64,
+            "llm_model": "deepseek-chat"
+        },
+        "graphrag": {
+            "entity_types": ["PERSON", "ORG", "LOC"],
+            "llm_model": "deepseek-chat"
+        }
+    }
+)
+```
+
+### 5. 检索优化
+
+根据场景调整检索参数：
+
+```python
+# 精确检索（高阈值）
+result = client.retrieval.search(
+    kb_ids=[kb_id],
+    question="query",
+    similarity_threshold=0.5,  # 更高的阈值
+    page_size=10
+)
+
+# 召回优化（低阈值 + 大top_k + 重排序）
+result = client.retrieval.search(
+    kb_ids=[kb_id],
+    question="query",
+    similarity_threshold=0.1,  # 更低的阈值
+    top_k=2048,  # 更大的候选集
+    rerank_id="bge-reranker-v2-m3",  # 使用重排序
+    keyword=True,  # 启用关键词
+    use_kg=True  # 使用知识图谱
+)
+```
+
+## 错误处理
+
+SDK 会抛出以下类型的异常：
+
+**常见异常：**
+- `FileNotFoundError`: 文件不存在
+- `Exception`: API 调用失败（包含错误消息）
+- `TimeoutError`: 任务超时（仅在使用 `wait_for_*` 方法时）
+- `RequestException`: 网络请求错误
+
+**示例：**
+
+```python
+try:
+    kb = client.knowledge_base.get("nonexistent-id")
+except Exception as e:
+    print(f"Error: {e}")
+```
+
+```python
+from requests.exceptions import RequestException
+
+try:
+    result = client.retrieval.search(
+        kb_ids=[kb_id],
+        question="test query"
+    )
+except RequestException as e:
+    print(f"Network error: {e}")
+except Exception as e:
+    print(f"API error: {e}")
+```
+
+## 常见问题 (FAQ)
+
+### Q: 如何处理大文档的解析？
+
+A: 对于大文档，建议使用异步解析：
+```python
+# 使用异步解析
+task_id = client.document.parse_to_md_async(doc_id)
+result = client.document.wait_for_parse_to_md(task_id, timeout=600)
+```
+
+### Q: RAPTOR 和知识图谱的配置在哪里设置？
+
+A: 需要在创建或更新知识库时通过 `parser_config` 设置：
+```python
+kb = client.knowledge_base.create(
+    name="my_kb",
+    parser_config={
+        "raptor": {"max_cluster": 64},
+        "graphrag": {"entity_types": ["PERSON", "ORG"]}
+    }
+)
+```
+
+### Q: 如何查看知识库的 RAPTOR 和知识图谱状态？
+
+A: 使用对应的 `get_status` 方法：
+```python
+raptor_status = client.raptor.get_status(kb_id)
+kg_status = client.knowledge_graph.get_status(kb_id)
+```
+返回 `None` 表示没有运行中的任务。
+
+### Q: 如何实现混合检索？
+
+A: 调整 `vector_similarity_weight` 参数和启用 `keyword`：
+```python
+result = client.retrieval.search(
+    kb_ids=[kb_id],
+    question="query",
+    vector_similarity_weight=0.3,  # 向量权重
+    keyword=True,  # 启用关键词
+    use_kg=True  # 使用知识图谱
+)
+```
+
+### Q: 支持哪些抽取类型？
+
+A: 支持三种抽取类型：
+- `entity`: 实体抽取（人名、地名、组织等）
+- `keyword`: 关键词抽取
+- `summary`: 摘要生成
+
+还支持结构化抽取 (`struct_extract`)，可以自定义抽取模式。
+
+### Q: 如何处理解析失败的文档？
+
+A: 检查文档状态并根据错误信息处理：
+```python
+results = client.document.parse_to_chunk(kb_id, doc_ids, wait=True)
+for result in results:
+    if result['status'] == 'FAIL':
+        print(f"Document {result['doc_id']} failed to parse")
+        # 重新解析或删除
+```
+
+### Q: SDK 是否支持流式返回？
+
+A: 当前版本主要支持标准 REST API 调用。对于下载等操作，SDK 内部使用了流式传输。
+
+### Q: 如何设置请求超时？
+
+A: 当前 SDK 未为 HTTP 请求设置超时，也暂未提供超时参数（`requests` 默认不超时，会一直等待响应）。对于异步任务，可通过 `wait_for_*` 方法的 `timeout` 参数限制等待时间：
+```python
+# timeout 为等待任务完成的最长时间
+result = client.document.wait_for_parse_to_md(task_id, timeout=300)
+```
+
+## 贡献
+
+欢迎贡献代码！请遵循以下步骤：
+
+1. Fork 本仓库
+2. 创建特性分支 (`git checkout -b feature/amazing-feature`)
+3. 提交更改 (`git commit -m 'Add amazing feature'`)
+4. 推送到分支 (`git push origin feature/amazing-feature`)
+5. 创建 Pull Request
+
+**贡献指南：**
+- 遵循 PEP 8 代码规范
+- 添加适当的类型注解
+- 为新功能添加测试用例
+- 更新相关文档
+
+## 许可证
+
+Copyright 2025 The OceanBase Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+## 链接
+
+- [PowerRAG 项目主页](https://github.com/oceanbase/powerrag)
+- [API 文档](https://github.com/oceanbase/powerrag/tree/HEAD/docs)
+- [问题反馈](https://github.com/oceanbase/powerrag/issues)
+- [更新日志](https://github.com/oceanbase/powerrag/blob/HEAD/CHANGELOG.md)
+
+## 支持
+
+### 获取帮助
+
+如有问题或建议，请：
+
+1. 📖 查看 [完整文档](https://github.com/oceanbase/powerrag/tree/HEAD/docs)
+2. 🔍 搜索 [已有问题](https://github.com/oceanbase/powerrag/issues)
+3. 💬 创建 [新问题](https://github.com/oceanbase/powerrag/issues/new)
+4. 📧 联系 OceanBase 团队
+
+### 社区
+
+- GitHub: [oceanbase/powerrag](https://github.com/oceanbase/powerrag)
+- 文档: [PowerRAG Documentation](https://github.com/oceanbase/powerrag/tree/HEAD/docs)
+- 问题跟踪: [GitHub Issues](https://github.com/oceanbase/powerrag/issues)
+
+### 反馈
+
+我们非常重视您的反馈！如果您：
+- 发现了 bug
+- 有功能建议
+- 需要帮助
+- 想要贡献代码
+
+请通过 GitHub Issues 联系我们。
+
+---
+
+**Made with ❤️ by OceanBase Team**
+
+
diff --git a/powerrag/sdk/__init__.py b/powerrag/sdk/__init__.py
new file mode 100644
index 000000000..90d01aaf0
--- /dev/null
+++ b/powerrag/sdk/__init__.py
@@ -0,0 +1,29 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+PowerRAG SDK
+
+A Python SDK for PowerRAG API, providing easy-to-use interfaces for knowledge base management,
+document processing, chunking, extraction, RAPTOR, knowledge graph, and retrieval.
+"""
+
+from .client import PowerRAGClient
+
+__all__ = ["PowerRAGClient"]
+
+# Alias for convenience
+PowerRAG = PowerRAGClient
\ No newline at end of file
diff --git a/powerrag/sdk/client.py b/powerrag/sdk/client.py
new file mode 100644
index 000000000..c9c44810e
--- /dev/null
+++ b/powerrag/sdk/client.py
@@ -0,0 +1,155 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import requests
+from typing import Optional, Dict, Any
+
+from .modules.knowledge_base_manager import KnowledgeBaseManager
+from .modules.document_manager import DocumentManager
+from .modules.chunk_manager import ChunkManager
+from .modules.extraction_manager import ExtractionManager
+from .modules.raptor_manager import RAPTORManager
+from .modules.knowledge_graph_manager import KnowledgeGraphManager
+from .modules.retrieval_manager import RetrievalManager
+
+
+class PowerRAGClient:
+    """Main PowerRAG SDK client: holds the auth header, thin requests-based HTTP helpers and the feature managers."""
+    
+    def __init__(self, api_key: str, base_url: str = "http://localhost:9380", version: str = "v1"):
+        """
+        Initialize the client: build the API URL prefix, the auth header and the managers.
+        
+        Args:
+            api_key: API key; sent on every request as ``Authorization: Bearer <api_key>``
+            base_url: Server address; trailing "/" characters are stripped
+            version: API version, defaults to "v1"; requests go to ``{base_url}/api/{version}``
+        """
+        self.api_key = api_key
+        self.base_url = base_url.rstrip("/")
+        self.api_url = f"{self.base_url}/api/{version}"
+        self.authorization_header = {"Authorization": f"Bearer {self.api_key}"}
+        
+        # Create one manager per feature area; each one receives this client instance
+        self.knowledge_base = KnowledgeBaseManager(self)
+        self.document = DocumentManager(self)
+        self.chunk = ChunkManager(self)
+        self.extraction = ExtractionManager(self)
+        self.raptor = RAPTORManager(self)
+        self.knowledge_graph = KnowledgeGraphManager(self)
+        self.retrieval = RetrievalManager(self)
+    
+    def post(self, url: str, json=None, files=None, data=None, stream=False):
+        """
+        Send a POST request to ``self.api_url + url`` with the bearer-token header.
+        
+        Args:
+            url: Request path, appended to ``self.api_url``
+            json: JSON payload
+            files: Files to upload
+            data: Form data
+            stream: Whether to stream the response
+        
+        Returns:
+            The requests Response object, returned unchecked; no timeout is passed to requests
+        """
+        headers = self.authorization_header.copy()
+        
+        # For file uploads do not set Content-Type; let requests set it automatically
+        if files:
+            res = requests.post(
+                url=self.api_url + url,
+                json=json,
+                files=files,
+                data=data,
+                headers=headers,
+                stream=stream
+            )
+        else:
+            if json:  # truthiness test: the explicit header is added only for a non-empty payload
+                headers["Content-Type"] = "application/json"
+            res = requests.post(
+                url=self.api_url + url,
+                json=json,
+                data=data,
+                headers=headers,
+                stream=stream
+            )
+        return res
+    
+    def get(self, url: str, params=None, stream=False):
+        """
+        Send a GET request to ``self.api_url + url`` with the bearer-token header.
+        
+        Args:
+            url: Request path, appended to ``self.api_url``
+            params: Query parameters
+            stream: Whether to stream the response
+        
+        Returns:
+            The requests Response object, returned unchecked
+        """
+        res = requests.get(
+            url=self.api_url + url,
+            params=params,
+            headers=self.authorization_header,
+            stream=stream
+        )
+        return res
+    
+    def put(self, url: str, json=None):
+        """
+        Send a PUT request to ``self.api_url + url``; Content-Type is always application/json.
+        
+        Args:
+            url: Request path, appended to ``self.api_url``
+            json: JSON payload
+        
+        Returns:
+            The requests Response object, returned unchecked
+        """
+        headers = self.authorization_header.copy()
+        headers["Content-Type"] = "application/json"
+        res = requests.put(
+            url=self.api_url + url,
+            json=json,
+            headers=headers
+        )
+        return res
+    
+    def delete(self, url: str, json=None, params=None):
+        """
+        Send a DELETE request to ``self.api_url + url`` with the bearer-token header.
+        
+        Args:
+            url: Request path, appended to ``self.api_url``
+            json: JSON payload
+            params: Query parameters
+        
+        Returns:
+            The requests Response object, returned unchecked
+        """
+        headers = self.authorization_header.copy()
+        if json:  # truthiness test: the explicit header is added only for a non-empty payload
+            headers["Content-Type"] = "application/json"
+        res = requests.delete(
+            url=self.api_url + url,
+            json=json,
+            params=params,
+            headers=headers
+        )
+        return res
+
diff --git a/powerrag/sdk/modules/chunk.py b/powerrag/sdk/modules/chunk.py
new file mode 100644
index 000000000..23732e508
--- /dev/null
+++ b/powerrag/sdk/modules/chunk.py
@@ -0,0 +1,32 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import TypedDict, Optional, List, Dict, Any
+
+
+class ChunkInfo(TypedDict, total=False):
+    """Type definition for chunk info; total=False makes every key optional."""
+    id: str  # Chunk ID
+    content: str  # Chunk content
+    document_id: str  # presumably the ID of the owning document - TODO confirm
+    dataset_id: str  # Knowledge base ID
+    important_keywords: List[str]  # Important keywords
+    questions: List[str]  # Questions
+    image_id: Optional[str]
+    available: bool  # Whether the chunk is available
+    positions: List[List[int]]  # Position info; each sub-list holds 5 integers
+    docnm_kwd: str  # Document name keyword
+
diff --git a/powerrag/sdk/modules/chunk_manager.py b/powerrag/sdk/modules/chunk_manager.py
new file mode 100644
index 000000000..61acccd1b
--- /dev/null
+++ b/powerrag/sdk/modules/chunk_manager.py
@@ -0,0 +1,271 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Optional, List, Dict, Any
+from .chunk import ChunkInfo
+
+
+class ChunkManager:
+    """Chunk management module: list, get, create, update and delete document chunks, plus raw-text splitting."""
+    
+    def __init__(self, client):
+        """
+        Initialize the chunk management module.
+        
+        Args:
+            client: PowerRAG client instance; its get/post/put/delete helpers send the requests
+        """
+        self.client = client
+    
+    def list(
+        self,
+        kb_id: str,
+        doc_id: str,
+        id: Optional[str] = None,
+        keywords: Optional[str] = None,
+        page: int = 1,
+        page_size: int = 30,
+    ) -> tuple[List[ChunkInfo], int, Dict[str, Any]]:
+        """
+        List the chunks of a document.
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_id: Document ID
+            id: Chunk ID (optional, for an exact lookup)
+            keywords: Keyword search (optional)
+            page: Page number, defaults to 1
+            page_size: Number of chunks per page, defaults to 30
+        
+        Returns:
+            (list of chunks, total count, document info)
+        
+        Raises:
+            Exception: The API call failed
+        """
+        params = {
+            "page": page,
+            "page_size": page_size,
+        }
+        
+        if id:
+            params["id"] = id
+        if keywords:
+            params["keywords"] = keywords
+        
+        url = f"/datasets/{kb_id}/documents/{doc_id}/chunks"
+        res = self.client.get(url, params=params)
+        res_json = res.json()
+        
+        if res_json.get("code") == 0:  # a "code" of 0 is treated as success
+            data = res_json.get("data", {})
+            return data.get("chunks", []), data.get("total", 0), data.get("doc", {})
+        
+        raise Exception(res_json.get("message", "List chunks failed"))
+    
+    def get(self, kb_id: str, doc_id: str, chunk_id: str) -> ChunkInfo:
+        """
+        Fetch a single chunk.
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_id: Document ID
+            chunk_id: Chunk ID
+        
+        Returns:
+            The chunk info
+        
+        Raises:
+            Exception: The API call failed or the chunk does not exist
+        """
+        chunks, total, _ = self.list(kb_id, doc_id, id=chunk_id, page_size=1)  # a list() call filtered by chunk ID
+        if not chunks:
+            raise Exception(f"Chunk '{chunk_id}' not found")
+        return chunks[0]
+    
+    def create(
+        self,
+        kb_id: str,
+        doc_id: str,
+        content: str,
+        important_keywords: Optional[List[str]] = None,
+        questions: Optional[List[str]] = None,
+    ) -> ChunkInfo:
+        """
+        Create a chunk.
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_id: Document ID
+            content: Chunk content
+            important_keywords: List of important keywords (optional)
+            questions: List of questions (optional)
+        
+        Returns:
+            The created chunk info
+        
+        Raises:
+            Exception: The API call failed
+        """
+        payload = {
+            "content": content,
+        }
+        
+        if important_keywords is not None:
+            payload["important_keywords"] = important_keywords
+        if questions is not None:
+            payload["questions"] = questions
+        
+        url = f"/datasets/{kb_id}/documents/{doc_id}/chunks"
+        res = self.client.post(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Create chunk failed"))
+        
+        return res_json.get("data", {}).get("chunk", {})
+    
+    def update(
+        self,
+        kb_id: str,
+        doc_id: str,
+        chunk_id: str,
+        content: Optional[str] = None,
+        important_keywords: Optional[List[str]] = None,
+        questions: Optional[List[str]] = None,
+        available: Optional[bool] = None,
+        positions: Optional[List[List[int]]] = None,
+    ) -> ChunkInfo:
+        """
+        Update a chunk; only the arguments that are not None are sent.
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_id: Document ID
+            chunk_id: Chunk ID
+            content: Chunk content (optional)
+            important_keywords: List of important keywords (optional)
+            questions: List of questions (optional)
+            available: Whether the chunk is available (optional)
+            positions: Position info (optional)
+        
+        Returns:
+            The updated chunk info
+        
+        Raises:
+            Exception: The API call failed, or no field to update was given
+        """
+        update_data = {}
+        
+        if content is not None:
+            update_data["content"] = content
+        if important_keywords is not None:
+            update_data["important_keywords"] = important_keywords
+        if questions is not None:
+            update_data["questions"] = questions
+        if available is not None:
+            update_data["available"] = available
+        if positions is not None:
+            update_data["positions"] = positions
+        
+        if not update_data:
+            raise Exception("No fields to update")
+        
+        url = f"/datasets/{kb_id}/documents/{doc_id}/chunks/{chunk_id}"
+        res = self.client.put(url, json=update_data)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Update chunk failed"))
+        
+        # The API reports success but returns no chunk data, so the chunk is fetched again
+        return self.get(kb_id, doc_id, chunk_id)
+    
+    def delete(
+        self,
+        kb_id: str,
+        doc_id: str,
+        chunk_ids: Optional[List[str]] = None,
+    ) -> None:
+        """
+        Delete chunks.
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_id: Document ID
+            chunk_ids: List of chunk IDs; if None, an empty payload is sent to delete all chunks of the document
+        
+        Raises:
+            Exception: The API call failed
+        """
+        payload = {}
+        if chunk_ids is not None:
+            payload["chunk_ids"] = chunk_ids
+        
+        url = f"/datasets/{kb_id}/documents/{doc_id}/chunks"
+        res = self.client.delete(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Delete chunks failed"))
+    
+    def split_text(
+        self,
+        text: str,
+        parser_id: str = "title",
+        config: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Split text into chunks (no document upload required).
+        
+        Args:
+            text: Text to split (Markdown format)
+            parser_id: Parser ID (defaults to "title")
+            config: Parser configuration (optional)
+        
+        Returns:
+            The split result, containing the chunks list and the total_chunks count
+        
+        Raises:
+            Exception: The API call failed
+        """
+        payload = {
+            "text": text,
+            "parser_id": parser_id,
+        }
+        
+        if config:
+            payload["config"] = config
+        
+        url = "/powerrag/split"
+        res = self.client.post(url, json=payload)
+        
+        # Check the HTTP status code
+        if res.status_code != 200:
+            try:
+                error_json = res.json()
+                error_msg = error_json.get("message", f"HTTP {res.status_code}")
+            except Exception:  # fall back to the status code plus the first 200 characters of the body
+                error_msg = f"HTTP {res.status_code}: {res.text[:200]}"
+            raise Exception(error_msg)
+        
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Split text failed"))
+        
+        return res_json.get("data", {})
+
diff --git a/powerrag/sdk/modules/document.py b/powerrag/sdk/modules/document.py
new file mode 100644
index 000000000..c76c05eae
--- /dev/null
+++ b/powerrag/sdk/modules/document.py
@@ -0,0 +1,40 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import TypedDict, Optional, Dict, Any, List
+
+
+class DocumentInfo(TypedDict, total=False):
+    """Type definition for document info; every key is optional (total=False)."""
+    id: str
+    name: str
+    dataset_id: str  # Knowledge base ID
+    chunk_count: int
+    token_count: int
+    chunk_method: str
+    run: str  # UNSTART, RUNNING, CANCEL, DONE, FAIL
+    progress: float  # 0.0-1.0
+    progress_msg: Optional[str]
+    type: str  # File type
+    size: int  # File size in bytes
+    suffix: str  # File suffix
+    thumbnail: Optional[str]  # Thumbnail
+    create_time: int  # Creation timestamp
+    update_time: int  # Update timestamp
+    meta_fields: Optional[Dict[str, Any]]  # Metadata fields
+    enabled: bool  # Whether the document is enabled
+    parser_config: Optional[Dict[str, Any]]  # Parser configuration
+
diff --git a/powerrag/sdk/modules/document_manager.py b/powerrag/sdk/modules/document_manager.py
new file mode 100644
index 000000000..34971524a
--- /dev/null
+++ b/powerrag/sdk/modules/document_manager.py
@@ -0,0 +1,789 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Optional, List, Dict, Any, Union
+from pathlib import Path
+from .document import DocumentInfo
+
+
+class DocumentManager:
+    """文档管理模块"""
+    
+    def __init__(self, client):
+        """
+        Initialize the document management module
+        
+        Args:
+            client: PowerRAG client instance; stored and used for every HTTP call
+        """
+        self.client = client
+    
+    def upload(
+        self,
+        kb_id: str,
+        file_paths: Union[str, List[str]],
+        parent_path: Optional[str] = None,
+    ) -> List[DocumentInfo]:
+        """
+        Upload documents to a knowledge base
+        
+        Args:
+            kb_id: Knowledge base ID
+            file_paths: File path (a single path or a list of paths)
+            parent_path: Parent path (optional, for nested folders)
+        
+        Returns:
+            List of document info (the "data" field of the response, [] when absent)
+        
+        Raises:
+            Exception: API call failed; FileNotFoundError if a path does not exist
+        """
+        if isinstance(file_paths, str):
+            file_paths = [file_paths]
+        
+        files = []
+        for file_path in file_paths:
+            path = Path(file_path)
+            if not path.exists():
+                raise FileNotFoundError(f"File not found: {file_path}")
+            
+            with open(path, "rb") as f:
+                files.append(("file", (path.name, f.read())))  # whole file is read into memory
+        
+        form_data = {}
+        if parent_path:
+            form_data["parent_path"] = parent_path
+        
+        url = f"/datasets/{kb_id}/documents"
+        res = self.client.post(url, json=None, files=files, data=form_data)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Upload documents failed"))
+        
+        return res_json.get("data", [])
+    
+    def upload_from_url(
+        self,
+        kb_id: str,
+        url: str,
+        name: str,
+    ) -> bool:
+        """
+        Upload a document to a knowledge base from a URL
+        
+        Args:
+            kb_id: Knowledge base ID
+            url: Document URL
+            name: Document name
+        
+        Returns:
+            Whether the upload succeeded (the "data" field, False when absent)
+        
+        Raises:
+            Exception: API call failed
+        """
+        form_data = {
+            "kb_id": kb_id,
+            "name": name,
+            "url": url,
+        }
+        
+        res = self.client.post("/document/web_crawl", json=None, data=form_data)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Upload from URL failed"))
+        
+        return res_json.get("data", False)
+    
+    def list(
+        self,
+        kb_id: str,
+        id: Optional[str] = None,
+        name: Optional[str] = None,
+        keywords: Optional[str] = None,
+        page: int = 1,
+        page_size: int = 30,
+        orderby: str = "create_time",
+        desc: bool = True,
+        create_time_from: int = 0,
+        create_time_to: int = 0,
+        suffix: Optional[List[str]] = None,
+        run: Optional[List[str]] = None,
+    ) -> tuple[List[DocumentInfo], int]:
+        """
+        List the documents of a knowledge base
+        
+        Optional filters are only sent when truthy, so passing None, an empty
+        string, 0 or an empty list is the same as not passing the filter.
+        
+        Args:
+            kb_id: Knowledge base ID
+            id: Document ID (optional)
+            name: Document name (optional)
+            keywords: Keyword search (optional)
+            page: Page number, default 1
+            page_size: Items per page, default 30
+            orderby: Sort field, default create_time
+            desc: Whether to sort in descending order, default True
+            create_time_from: Creation-time lower bound (timestamp)
+            create_time_to: Creation-time upper bound (timestamp)
+            suffix: File-suffix filter (optional)
+            run: Run-status filter (optional, UNSTART/RUNNING/CANCEL/DONE/FAIL)
+        
+        Returns:
+            (documents, total), read from the "docs" and "total" keys of the
+            response data; missing keys yield [] and 0
+        
+        Raises:
+            Exception: The response "code" is non-zero; the message comes
+                from the server when present, otherwise it is
+                "List documents failed"
+        """
+        query = {
+            "page": page,
+            "page_size": page_size,
+            "orderby": orderby,
+            "desc": desc,
+        }
+        optional_filters = {
+            "id": id,
+            "name": name,
+            "keywords": keywords,
+            "create_time_from": create_time_from,
+            "create_time_to": create_time_to,
+            "suffix": suffix,
+            "run": run,
+        }
+        # Only truthy filters are forwarded: None, "", 0 and [] are all left out
+        query.update({key: value for key, value in optional_filters.items() if value})
+        
+        response = self.client.get(f"/datasets/{kb_id}/documents", params=query)
+        body = response.json()
+        if body.get("code") != 0:
+            raise Exception(body.get("message", "List documents failed"))
+        
+        data = body.get("data", {})
+        return data.get("docs", []), data.get("total", 0)
+    
+    def get(self, kb_id: str, doc_id: str) -> DocumentInfo:
+        """
+        Get document info
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_id: Document ID
+        
+        Returns:
+            Document info (the first entry returned by list() filtered on the ID)
+        
+        Raises:
+            Exception: API call failed or the document does not exist
+        """
+        docs, _ = self.list(kb_id, id=doc_id, page_size=1)
+        if not docs:
+            raise Exception(f"Document '{doc_id}' not found")
+        return docs[0]
+    
+    def update(
+        self,
+        kb_id: str,
+        doc_id: str,
+        name: Optional[str] = None,
+        meta_fields: Optional[Dict[str, Any]] = None,
+        chunk_method: Optional[str] = None,
+        parser_config: Optional[Dict[str, Any]] = None,
+        enabled: Optional[bool] = None,
+    ) -> DocumentInfo:
+        """
+        Update a document
+        
+        Only arguments that are not None are sent, so falsy values such as ""
+        or False are still forwarded to the server.
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_id: Document ID
+            name: Document name (optional)
+            meta_fields: Metadata fields (optional)
+            chunk_method: Chunking method (optional)
+            parser_config: Parser configuration (optional)
+            enabled: Whether the document is enabled (optional)
+        
+        Returns:
+            Updated document info: the "data" field of the response, or {}
+            when the response has no such field
+        
+        Raises:
+            Exception: Every updatable argument is None ("No fields to
+                update"), or the response "code" is non-zero (message from
+                the server, or "Update document failed")
+        """
+        candidates = {
+            "name": name,
+            "meta_fields": meta_fields,
+            "chunk_method": chunk_method,
+            "parser_config": parser_config,
+            "enabled": enabled,
+        }
+        # None means "leave unchanged"; every other value is part of the update
+        changes = {field: value for field, value in candidates.items() if value is not None}
+        if not changes:
+            raise Exception("No fields to update")
+        
+        response = self.client.put(f"/datasets/{kb_id}/documents/{doc_id}", json=changes)
+        body = response.json()
+        if body.get("code") != 0:
+            raise Exception(body.get("message", "Update document failed"))
+        
+        return body.get("data", {})
+    
+    def rename(self, kb_id: str, doc_id: str, new_name: str) -> DocumentInfo:
+        """
+        Rename a document
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_id: Document ID
+            new_name: New name
+        
+        Returns:
+            Updated document info (as returned by update())
+        
+        Raises:
+            Exception: API call failed
+        """
+        return self.update(kb_id, doc_id, name=new_name)
+    
+    def set_meta(self, kb_id: str, doc_id: str, meta_fields: Dict[str, Any]) -> DocumentInfo:
+        """
+        Set document metadata
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_id: Document ID
+            meta_fields: Dict of metadata fields
+        
+        Returns:
+            Updated document info (as returned by update())
+        
+        Raises:
+            Exception: API call failed
+        """
+        return self.update(kb_id, doc_id, meta_fields=meta_fields)
+    
+    def delete(self, kb_id: str, doc_ids: Optional[List[str]] = None) -> None:
+        """
+        Delete documents
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_ids: List of document IDs; None is sent as-is and is documented to delete all documents
+        
+        Raises:
+            Exception: API call failed
+        """
+        payload = {"ids": doc_ids}
+        url = f"/datasets/{kb_id}/documents"
+        res = self.client.delete(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Delete documents failed"))
+    
+    def download(self, kb_id: str, doc_id: str, save_path: Optional[str] = None) -> Union[bytes, str]:
+        """
+        Download a document
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_id: Document ID
+            save_path: Destination path (optional); when given the file is written there, otherwise the bytes are returned
+        
+        Returns:
+            The file path if save_path is given, otherwise the file content as bytes
+        
+        Raises:
+            Exception: API call failed
+        """
+        url = f"/datasets/{kb_id}/documents/{doc_id}"
+        res = self.client.get(url, stream=True)
+        
+        if res.status_code != 200:
+            res_json = res.json() if res.headers.get("content-type", "").startswith("application/json") else {}
+            raise Exception(res_json.get("message", "Download document failed"))
+        
+        file_content = res.content  # NOTE(review): presumably loads the full body despite stream=True - confirm
+        
+        if save_path:
+            Path(save_path).parent.mkdir(parents=True, exist_ok=True)  # create missing parent directories
+            with open(save_path, "wb") as f:
+                f.write(file_content)
+            return save_path
+        
+        return file_content
+    
+    def parse_to_chunk(
+        self,
+        kb_id: str,
+        doc_ids: List[str],
+        wait: bool = True,
+        delete_existing: bool = False,
+        config: Optional[Dict[str, Any]] = None,
+    ) -> Union[List[Dict[str, Any]], str]:
+        """
+        Parse documents into chunks
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_ids: List of document IDs
+            wait: Whether to wait for parsing to finish (default True)
+            delete_existing: Whether to delete existing chunks (default False)
+            config: Parsing configuration (optional)
+        
+        Returns:
+            If wait=True, the list of parse results; if wait=False, the task ID ("" when absent)
+        
+        Raises:
+            Exception: API call failed
+        """
+        payload = {
+            "document_ids": doc_ids,
+        }
+        
+        if delete_existing:
+            payload["delete_existing"] = True
+        if config:
+            payload["config"] = config
+        
+        url = f"/datasets/{kb_id}/chunks"
+        res = self.client.post(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Parse documents failed"))
+        
+        if wait:
+            return self._wait_for_parse(kb_id, doc_ids)
+        
+        return res_json.get("data", {}).get("task_id", "")
+    
+    def parse_to_md_async(
+        self,
+        doc_id: str,
+        config: Optional[Dict[str, Any]] = None,
+    ) -> str:
+        """
+        Asynchronously parse a document to Markdown (no chunking)
+        
+        Submits an asynchronous parse task and returns the task ID immediately.
+        Suited to large documents or long-running processing.
+        
+        Supported file formats:
+        - PDF (.pdf)
+        - Office documents (.doc, .docx, .ppt, .pptx)
+        - Images (.jpg, .png)
+        - HTML (.html, .htm)
+        
+        Args:
+            doc_id: Document ID
+            config: Parsing configuration (optional)
+                - layout_recognize: Layout recognition engine (mineru or dots_ocr, default mineru)
+                - enable_ocr: Whether to enable OCR (default False)
+                - enable_formula: Whether to recognize formulas (default False)
+                - enable_table: Whether to recognize tables (default True)
+                - from_page: Start page (PDF only, default 0)
+                - to_page: End page (PDF only, default 100000)
+        
+        Returns:
+            task_id: Task ID, used to query the task status and result
+        
+        Raises:
+            Exception: API call failed
+        
+        Example:
+            >>> # Submit the asynchronous task
+            >>> task_id = client.document.parse_to_md_async(
+            ...     doc_id="doc_123",
+            ...     config={"layout_recognize": "mineru"}
+            ... )
+            >>> print(f"Task ID: {task_id}")
+            
+            >>> # Query the task status
+            >>> status = client.document.get_parse_to_md_status(task_id)
+            >>> if status["status"] == "success":
+            ...     print(f"Markdown: {status['result']['markdown']}")
+        """
+        payload = {
+            "doc_id": doc_id,
+        }
+        if config:
+            payload["config"] = config
+        
+        url = "/powerrag/parse_to_md/async"
+        res = self.client.post(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Submit parse_to_md task failed"))
+        
+        return res_json.get("data", {}).get("task_id", "")
+    
+    def get_parse_to_md_status(
+        self,
+        task_id: str,
+    ) -> Dict[str, Any]:
+        """
+        Query the status of an asynchronous parse_to_md task
+        
+        Args:
+            task_id: Task ID (as returned by parse_to_md_async)
+        
+        Returns:
+            Task status dict:
+            {
+                "task_id": "...",
+                "status": "pending|processing|success|failed|not_found",
+                "created_at": "2025-01-01T00:00:00",
+                "updated_at": "2025-01-01T00:00:00",
+                "result": {  # present only when status="success"
+                    "doc_id": "...",
+                    "doc_name": "...",
+                    "markdown": "...",
+                    "markdown_length": 5000,
+                    "images": {...},
+                    "total_images": 2
+                },
+                "error": "..."  # present only when status="failed"
+            }
+        
+        Raises:
+            Exception: API call failed
+        
+        Example:
+            >>> status = client.document.get_parse_to_md_status(task_id)
+            >>> print(f"Status: {status['status']}")
+            >>> 
+            >>> if status["status"] == "success":
+            ...     result = status["result"]
+            ...     print(f"Markdown length: {result['markdown_length']}")
+            >>> elif status["status"] == "failed":
+            ...     print(f"Error: {status['error']}")
+            >>> elif status["status"] in ["pending", "processing"]:
+            ...     print("Task is still running...")
+        """
+        url = f"/powerrag/parse_to_md/status/{task_id}"
+        res = self.client.get(url)
+        res_json = res.json()
+        
+        # For 404, still return the data (with status="not_found")
+        if res_json.get("code") == 404:
+            return res_json.get("data", {"task_id": task_id, "status": "not_found"})
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Get task status failed"))
+        
+        return res_json.get("data", {})
+    
+    def wait_for_parse_to_md(
+        self,
+        task_id: str,
+        timeout: int = 300,
+        interval: float = 2.0,
+    ) -> Dict[str, Any]:
+        """
+        Wait for an asynchronous parse_to_md task to finish
+        
+        Polls the task status until it succeeds, fails, or the timeout expires.
+        
+        Args:
+            task_id: Task ID
+            timeout: Timeout in seconds, default 300 (5 minutes)
+            interval: Polling interval in seconds, default 2
+        
+        Returns:
+            Final task status (same shape as get_parse_to_md_status)
+        
+        Raises:
+            TimeoutError: The task did not finish within the timeout
+            Exception: The task failed, was not found, or the API call failed
+        
+        Example:
+            >>> task_id = client.document.parse_to_md_async(doc_id)
+            >>> result = client.document.wait_for_parse_to_md(task_id, timeout=600)
+            >>> print(f"Markdown: {result['result']['markdown']}")
+        """
+        import time
+        
+        # Monotonic clock: the timeout must not be skewed by wall-clock adjustments
+        start_time = time.monotonic()
+        
+        while True:
+            status = self.get_parse_to_md_status(task_id)
+            # .get(): a payload without a "status" key counts as still running
+            # instead of escaping as a KeyError
+            state = (status or {}).get("status")
+            
+            if state == "failed":
+                raise Exception(f"Task failed: {status.get('error', 'Unknown error')}")
+            if state == "not_found":
+                raise Exception(f"Task not found: {task_id}")
+            if state == "success":
+                return status
+            
+            if time.monotonic() - start_time > timeout:
+                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
+            
+            time.sleep(interval)
+    
+    def parse_to_md(
+        self,
+        doc_id: str,
+        config: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Parse a document to Markdown (no chunking)
+        
+        Parses an already-uploaded document into Markdown without chunking it.
+        Suited to cases that need the full document content or where an external system does its own chunking.
+        
+        Supported file formats:
+        - PDF (.pdf)
+        - Office documents (.doc, .docx, .ppt, .pptx)
+        - Images (.jpg, .png)
+        - HTML (.html, .htm)
+        
+        Args:
+            doc_id: Document ID
+            config: Parsing configuration (optional)
+                - layout_recognize: Layout recognition engine (mineru or dots_ocr, default mineru)
+                - enable_ocr: Whether to enable OCR (default False)
+                - enable_formula: Whether to recognize formulas (default False)
+                - enable_table: Whether to recognize tables (default True)
+                - from_page: Start page (PDF only, default 0)
+                - to_page: End page (PDF only, default 100000)
+        
+        Returns:
+            Parse result dict:
+            {
+                "doc_id": "...",
+                "doc_name": "...",
+                "markdown": "...",          # Full Markdown content
+                "markdown_length": 5000,    # Markdown length
+                "images": {...},            # Image dict (base64)
+                "total_images": 2           # Total number of images
+            }
+        
+        Raises:
+            Exception: API call failed
+        
+        Example:
+            >>> result = doc_manager.parse_to_md(
+            ...     doc_id="doc_123",
+            ...     config={"layout_recognize": "mineru", "enable_ocr": False}
+            ... )
+            >>> print(f"Markdown length: {result['markdown_length']}")
+            >>> print(f"First 200 chars: {result['markdown'][:200]}")
+        """
+        payload = {
+            "doc_id": doc_id,
+        }
+        
+        if config:
+            payload["config"] = config
+        
+        url = "/powerrag/parse_to_md"
+        res = self.client.post(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Parse to markdown failed"))
+        
+        return res_json.get("data", {})
+    
+    def parse_to_md_upload(
+        self,
+        file_path: str,
+        config: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Upload a file and parse it to Markdown (no chunking)
+        
+        Uploads the file directly and parses it into Markdown without chunking.
+        No prior upload to a knowledge base is needed; suited to one-off parsing.
+        
+        Supported file formats:
+        - PDF (.pdf)
+        - Office documents (.doc, .docx, .ppt, .pptx)
+        - Images (.jpg, .png)
+        - HTML (.html, .htm)
+        
+        Args:
+            file_path: File path
+            config: Parsing configuration (optional), same as parse_to_md
+        
+        Returns:
+            Parse result dict with the following fields:
+            - filename: File name
+            - markdown: Markdown content
+            - markdown_length: Markdown length
+            - images: Image dict
+            - total_images: Total number of images
+        
+        Raises:
+            FileNotFoundError: The file does not exist
+            Exception: API call failed
+        
+        Example:
+            >>> result = doc_manager.parse_to_md_upload(
+            ...     file_path="document.pdf",
+            ...     config={"layout_recognize": "mineru"}
+            ... )
+            >>> print(result['markdown'])
+            >>> print(f"Parsed {result['total_images']} images")
+        """
+        path = Path(file_path)
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        
+        # Prepare files
+        with open(path, "rb") as f:
+            files = [("file", (path.name, f.read()))]
+        
+        # Prepare form data
+        import json
+        form_data = {}
+        if config:
+            form_data["config"] = json.dumps(config)  # config travels as a JSON string form field
+        
+        url = "/powerrag/parse_to_md/upload"
+        res = self.client.post(url, json=None, files=files, data=form_data)
+        
+        # Parse JSON response
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Parse to markdown (upload) failed"))
+        
+        return res_json.get("data", {})
+    
+    def parse_url(
+        self,
+        kb_id: str,
+        url: str,
+        name: str,
+        wait: bool = True,
+        config: Optional[Dict[str, Any]] = None,
+    ) -> Union[DocumentInfo, str]:
+        """
+        Parse a document from a URL
+        
+        Args:
+            kb_id: Knowledge base ID
+            url: Document URL
+            name: Document name
+            wait: Whether to wait for parsing to finish (default True)
+            config: Parsing configuration (optional)
+        
+        Returns:
+            If wait=True, the document info; if wait=False, the task ID
+        
+        Raises:
+            Exception: API call failed
+        """
+        self.upload_from_url(kb_id, url, name)
+        
+        docs, _ = self.list(kb_id, name=name)  # look the new document up by name
+        if not docs:
+            raise Exception(f"Failed to upload document from URL: {url}")
+        
+        doc_id = docs[0]["id"]  # first match in list()'s default ordering
+        
+        if wait:
+            self.parse_to_chunk(kb_id, [doc_id], wait=True, config=config)
+            return self.get(kb_id, doc_id)
+        
+        task_id = self.parse_to_chunk(kb_id, [doc_id], wait=False, config=config)
+        return task_id
+    
+    def cancel_parse(self, kb_id: str, doc_ids: List[str]) -> None:
+        """
+        Cancel parse tasks
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_ids: List of document IDs
+        
+        Raises:
+            Exception: API call failed
+        """
+        payload = {"document_ids": doc_ids}
+        url = f"/datasets/{kb_id}/chunks"
+        res = self.client.delete(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Cancel parse failed"))
+    
+    def _wait_for_parse(self, kb_id: str, doc_ids: List[str]) -> List[Dict[str, Any]]:
+        """
+        Poll until every document reaches a terminal parse state (internal helper)
+        
+        Args:
+            kb_id: Knowledge base ID
+            doc_ids: List of document IDs
+        
+        Returns:
+            One dict per document: doc_id, status, chunk_count, token_count
+        
+        Raises:
+            Exception: A document lookup failed max_failures times in a row
+        """
+        import time
+        
+        terminal_states = {"DONE", "FAIL", "CANCEL"}
+        interval_sec = 1
+        max_failures = 5  # consecutive failed lookups tolerated per document
+        pending = set(doc_ids)
+        failures = dict.fromkeys(pending, 0)
+        results = []
+        
+        while pending:
+            for doc_id in list(pending):
+                try:
+                    doc = self.get(kb_id, doc_id)
+                except Exception:
+                    # Retry transient errors, but give up instead of polling forever
+                    failures[doc_id] += 1
+                    if failures[doc_id] >= max_failures:
+                        raise
+                    continue
+                failures[doc_id] = 0
+                status = doc.get("run", "")
+                if status not in terminal_states:
+                    if (doc.get("progress") or 0.0) < 1.0:
+                        continue  # still running
+                    status = "DONE"  # progress is complete although "run" is not terminal
+                results.append({"doc_id": doc_id, "status": status,
+                                "chunk_count": doc.get("chunk_count", 0),
+                                "token_count": doc.get("token_count", 0)})
+                pending.discard(doc_id)
+            if pending:
+                time.sleep(interval_sec)
+        
+        return results
+
diff --git a/powerrag/sdk/modules/extraction.py b/powerrag/sdk/modules/extraction.py
new file mode 100644
index 000000000..a7086f21e
--- /dev/null
+++ b/powerrag/sdk/modules/extraction.py
@@ -0,0 +1,50 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import TypedDict, Optional, List, Dict, Any
+
+
+class ExtractionResult(TypedDict, total=False):
+    """Type definition for an extraction result; every key is optional (total=False)."""
+    doc_id: Optional[str]
+    doc_name: Optional[str]
+    extractor_type: str
+    data: Dict[str, Any]  # Extracted data (entities/keywords/summary, etc.)
+    metadata: Dict[str, Any]
+
+
+class EntityInfo(TypedDict, total=False):
+    """Entity info; every key is optional (total=False)."""
+    text: str
+    type: str
+    start: int
+    end: int
+    confidence: Optional[float]
+
+
+class KeywordInfo(TypedDict, total=False):
+    """Keyword info; every key is optional (total=False)."""
+    keyword: str
+    score: float
+    frequency: Optional[int]
+
+
+class StructExtractTaskInfo(TypedDict, total=False):
+    """Structured-extraction task info; every key is optional (total=False)."""
+    task_id: str
+    status: str
+    result: Optional[Dict[str, Any]]
+
diff --git a/powerrag/sdk/modules/extraction_manager.py b/powerrag/sdk/modules/extraction_manager.py
new file mode 100644
index 000000000..a4d40d09b
--- /dev/null
+++ b/powerrag/sdk/modules/extraction_manager.py
@@ -0,0 +1,239 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Optional, List, Dict, Any, Union
+from .extraction import ExtractionResult, StructExtractTaskInfo
+
+
+class ExtractionManager:
+    """抽取管理模块"""
+    
+    def __init__(self, client):
+        """
+        Initialize the extraction management module
+        
+        Args:
+            client: PowerRAG client instance; stored and used for every HTTP call
+        """
+        self.client = client
+    
+    def extract_from_document(
+        self,
+        doc_id: str,
+        extractor_type: str = "entity",
+        config: Optional[Dict[str, Any]] = None,
+    ) -> ExtractionResult:
+        """
+        Extract information from a document
+        
+        Args:
+            doc_id: Document ID
+            extractor_type: Extraction type: 'entity', 'keyword' or 'summary'
+            config: Extraction configuration (optional)
+                - entity: {"entity_types": ["PERSON", "ORG"], "use_regex": True, "use_llm": False}
+                - keyword: {"max_keywords": 20, "min_word_length": 3}
+                - summary: {"max_length": 200, "min_length": 50}
+        
+        Returns:
+            Extraction result (the "data" field of the response, {} when absent)
+        
+        Raises:
+            Exception: API call failed
+        """
+        payload = {
+            "doc_id": doc_id,
+            "extractor_type": extractor_type,
+        }
+        
+        if config:
+            payload["config"] = config
+        
+        url = "/powerrag/extract"
+        res = self.client.post(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Extract from document failed"))
+        
+        return res_json.get("data", {})
+    
+    def extract_from_text(
+        self,
+        text: str,
+        extractor_type: str = "entity",
+        config: Optional[Dict[str, Any]] = None,
+    ) -> ExtractionResult:
+        """
+        Extract information from text
+        
+        Args:
+            text: Text content
+            extractor_type: Extraction type: 'entity', 'keyword' or 'summary'
+            config: Extraction configuration (optional)
+        
+        Returns:
+            Extraction result (the "data" field of the response, {} when absent)
+        
+        Raises:
+            Exception: API call failed
+        """
+        payload = {
+            "text": text,
+            "extractor_type": extractor_type,
+        }
+        
+        if config:
+            payload["config"] = config
+        
+        url = "/powerrag/extract/text"
+        res = self.client.post(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Extract from text failed"))
+        
+        return res_json.get("data", {})
+    
+    def extract_batch(
+        self,
+        doc_ids: List[str],
+        extractor_type: str = "entity",
+        config: Optional[Dict[str, Any]] = None,
+    ) -> List[Dict[str, Any]]:
+        """
+        Extract information from a batch of documents
+        
+        Args:
+            doc_ids: List of document IDs
+            extractor_type: Extraction type
+            config: Extraction configuration (optional)
+        
+        Returns:
+            List of extraction results; each result includes a success field
+        
+        Raises:
+            Exception: API call failed
+        """
+        payload = {
+            "doc_ids": doc_ids,
+            "extractor_type": extractor_type,
+        }
+        
+        if config:
+            payload["config"] = config
+        
+        url = "/powerrag/extract/batch"
+        res = self.client.post(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Batch extract failed"))
+        
+        return res_json.get("data", [])
+    
+    def struct_extract(
+        self,
+        text_or_documents: Union[str, List[Dict[str, str]]],
+        prompt_description: str,
+        examples: List[Dict[str, Any]],
+        fetch_urls: bool = False,
+        max_char_buffer: int = 1000,
+        temperature: Optional[float] = None,
+        extraction_passes: int = 1,
+        additional_context: Optional[str] = None,
+        prompt_validation_level: str = "WARNING",
+        prompt_validation_strict: bool = False,
+        resolver_params: Optional[Dict[str, Any]] = None,
+        model_parameters: Optional[Dict[str, Any]] = None,
+        timeout: Optional[int] = None,
+    ) -> StructExtractTaskInfo:
+        """
+        Submit a structured extraction job (LangExtract)
+        
+        temperature is sent whenever it is not None; the other optional
+        arguments are only sent when truthy.
+        
+        Args:
+            text_or_documents: Text content or a list of documents
+            prompt_description: Description of the extraction prompt
+            examples: List of examples, each containing text and extractions
+            fetch_urls: Whether to fetch URLs (default False)
+            max_char_buffer: Maximum character buffer (default 1000)
+            temperature: Temperature parameter (optional)
+            extraction_passes: Number of extraction passes (default 1)
+            additional_context: Additional context (optional)
+            prompt_validation_level: Prompt validation level (default "WARNING")
+            prompt_validation_strict: Whether to validate strictly (default False)
+            resolver_params: Resolver parameters (optional)
+            model_parameters: Model parameters (optional)
+            timeout: Timeout (optional)
+        
+        Returns:
+            Task info, including task_id
+        
+        Raises:
+            Exception: API call failed
+        """
+        request_body = {
+            "text_or_documents": text_or_documents,
+            "prompt_description": prompt_description,
+            "examples": examples,
+            "fetch_urls": fetch_urls,
+            "max_char_buffer": max_char_buffer,
+            "extraction_passes": extraction_passes,
+            "prompt_validation_level": prompt_validation_level,
+            "prompt_validation_strict": prompt_validation_strict,
+        }
+        if temperature is not None:
+            request_body["temperature"] = temperature
+        # The remaining optionals are forwarded only when truthy
+        optional_fields = {
+            "additional_context": additional_context,
+            "resolver_params": resolver_params,
+            "model_parameters": model_parameters,
+            "timeout": timeout,
+        }
+        request_body.update({key: value for key, value in optional_fields.items() if value})
+        
+        response = self.client.post("/powerrag/struct_extract/submit", json=request_body)
+        body = response.json()
+        if body.get("code") != 0:
+            raise Exception(body.get("message", "Struct extract failed"))
+        
+        return body.get("data", {})
+    
+    def get_struct_extract_status(self, task_id: str) -> Dict[str, Any]:
+        """
+        Get the status of a structured extraction task
+        
+        Args:
+            task_id: Task ID
+        
+        Returns:
+            Task status info (the "data" field of the response, {} when absent)
+        
+        Raises:
+            Exception: API call failed
+        """
+        url = f"/powerrag/struct_extract/status/{task_id}"
+        res = self.client.get(url)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Get struct extract status failed"))
+        
+        return res_json.get("data", {})
+
diff --git a/powerrag/sdk/modules/knowledge_base.py b/powerrag/sdk/modules/knowledge_base.py
new file mode 100644
index 000000000..c8e0ebbcc
--- /dev/null
+++ b/powerrag/sdk/modules/knowledge_base.py
@@ -0,0 +1,34 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import TypedDict, Optional, Dict, Any
+
+
+class KnowledgeBaseInfo(TypedDict, total=False):
+    """Type definition for a knowledge base record (total=False: every key is optional)."""
+    id: str
+    name: str
+    avatar: Optional[str]
+    tenant_id: Optional[str]
+    description: Optional[str]
+    embedding_model: str
+    permission: str
+    document_count: int
+    chunk_count: int
+    chunk_method: str
+    parser_config: Optional[Dict[str, Any]]
+    pagerank: int
+
diff --git a/powerrag/sdk/modules/knowledge_base_manager.py b/powerrag/sdk/modules/knowledge_base_manager.py
new file mode 100644
index 000000000..bc1eb07a9
--- /dev/null
+++ b/powerrag/sdk/modules/knowledge_base_manager.py
@@ -0,0 +1,232 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Optional, List, Dict, Any
+from .knowledge_base import KnowledgeBaseInfo
+
+
+class KnowledgeBaseManager:
+    """Knowledge base management module."""
+    
+    def __init__(self, client):
+        """
+        Initialize the knowledge base management module.
+        
+        Args:
+            client: PowerRAG client instance used to send the HTTP requests.
+        """
+        self.client = client
+    
+    def create(
+        self,
+        name: str,
+        description: Optional[str] = None,
+        avatar: Optional[str] = None,
+        embedding_model: Optional[str] = None,
+        permission: str = "me",
+        chunk_method: str = "naive",
+        parser_config: Optional[Dict[str, Any]] = None,
+    ) -> KnowledgeBaseInfo:
+        """
+        Create a knowledge base.
+        
+        Args:
+            name: Knowledge base name (required).
+            description: Description (optional).
+            avatar: Avatar, base64-encoded (optional).
+            embedding_model: Embedding model name (optional; the tenant default is used when omitted).
+            permission: Permission, 'me' or 'team' (default 'me').
+            chunk_method: Chunking method (default 'naive').
+            parser_config: Parser configuration (optional).
+        
+        Returns:
+            Information about the created knowledge base.
+        
+        Raises:
+            Exception: The API call failed (response "code" is not 0).
+        
+        Note:
+            The pagerank field can only be set on update, not at creation time.
+        """
+        payload = {
+            "name": name,
+        }
+        
+        if description is not None:
+            payload["description"] = description
+        if avatar is not None:
+            payload["avatar"] = avatar
+        if embedding_model is not None:
+            payload["embedding_model"] = embedding_model
+        if permission:
+            payload["permission"] = permission
+        if chunk_method:
+            payload["chunk_method"] = chunk_method
+        if parser_config is not None:
+            payload["parser_config"] = parser_config
+        
+        res = self.client.post("/datasets", json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Create knowledge base failed"))
+        
+        return res_json.get("data", {})
+    
+    def get(self, kb_id: str) -> KnowledgeBaseInfo:
+        """
+        Get a single knowledge base by ID.
+        
+        Args:
+            kb_id: Knowledge base ID.
+        
+        Returns:
+            Knowledge base information (first entry returned by list()).
+        
+        Raises:
+            Exception: The API call failed or the knowledge base does not exist.
+        """
+        kbs, _ = self.list(id=kb_id, page_size=1)
+        if not kbs:
+            raise Exception(f"Knowledge base '{kb_id}' not found")
+        return kbs[0]
+    
+    def list(
+        self,
+        id: Optional[str] = None,
+        name: Optional[str] = None,
+        page: int = 1,
+        page_size: int = 30,
+        orderby: str = "create_time",
+        desc: bool = True,
+    ) -> tuple[List[KnowledgeBaseInfo], int]:
+        """
+        List knowledge bases.
+        
+        Args:
+            id: Knowledge base ID (optional, for an exact lookup).
+            name: Knowledge base name (optional, for a fuzzy lookup).
+            page: Page number, default 1.
+            page_size: Items per page, default 30.
+            orderby: Field to sort by, default create_time.
+            desc: Whether to sort in descending order, default True.
+        
+        Returns:
+            (list of knowledge bases, total count)
+        
+        Raises:
+            Exception: The API call failed (response "code" is not 0).
+        """
+        params = {
+            "page": page,
+            "page_size": page_size,
+            "orderby": orderby,
+            "desc": desc,
+        }
+        
+        if id:
+            params["id"] = id
+        if name:
+            params["name"] = name
+        
+        res = self.client.get("/datasets", params=params)
+        res_json = res.json()
+        
+        if res_json.get("code") == 0:
+            # The API reports the count under "total_datasets", not "total"
+            return res_json.get("data", []), res_json.get("total_datasets", 0)
+        
+        raise Exception(res_json.get("message", "List knowledge bases failed"))
+    
+    def update(
+        self,
+        kb_id: str,
+        name: Optional[str] = None,
+        description: Optional[str] = None,
+        avatar: Optional[str] = None,
+        embedding_model: Optional[str] = None,
+        permission: Optional[str] = None,
+        chunk_method: Optional[str] = None,
+        parser_config: Optional[Dict[str, Any]] = None,
+        pagerank: Optional[int] = None,
+    ) -> KnowledgeBaseInfo:
+        """
+        Update a knowledge base.
+        
+        Args:
+            kb_id: Knowledge base ID.
+            name: Knowledge base name (optional).
+            description: Description (optional).
+            avatar: Avatar (optional).
+            embedding_model: Embedding model (optional).
+            permission: Permission (optional).
+            chunk_method: Chunking method (optional).
+            parser_config: Parser configuration (optional).
+            pagerank: Page rank (optional).
+        
+        Returns:
+            Information about the updated knowledge base.
+        
+        Raises:
+            Exception: No field was supplied, or the API call failed.
+        """
+        # Field-name mapping: SDK field -> API field
+        update_data = {}
+        if name is not None:
+            update_data["name"] = name
+        if description is not None:
+            update_data["description"] = description
+        if avatar is not None:
+            update_data["avatar"] = avatar
+        if embedding_model is not None:
+            update_data["embd_id"] = embedding_model  # NOTE(review): create() sends "embedding_model" - confirm PUT expects "embd_id"
+        if permission is not None:
+            update_data["permission"] = permission
+        if chunk_method is not None:
+            update_data["parser_id"] = chunk_method  # NOTE(review): create() sends "chunk_method" - confirm PUT expects "parser_id"
+        if parser_config is not None:
+            update_data["parser_config"] = parser_config
+        if pagerank is not None:
+            update_data["pagerank"] = pagerank
+        
+        if not update_data:
+            raise Exception("No fields to update")
+        
+        res = self.client.put(f"/datasets/{kb_id}", json=update_data)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Update knowledge base failed"))
+        
+        return res_json.get("data", {})
+    
+    def delete(self, ids: Optional[List[str]] = None) -> None:
+        """
+        Delete knowledge bases.
+        
+        Args:
+            ids: List of knowledge base IDs; if None, ALL knowledge bases are deleted.
+        
+        Raises:
+            Exception: The API call failed (response "code" is not 0).
+        """
+        payload = {"ids": ids}  # ids=None is sent as JSON null; NOTE(review): confirm the server treats null as "delete all"
+        res = self.client.delete("/datasets", json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Delete knowledge bases failed"))
+
diff --git a/powerrag/sdk/modules/knowledge_graph.py b/powerrag/sdk/modules/knowledge_graph.py
new file mode 100644
index 000000000..e236588db
--- /dev/null
+++ b/powerrag/sdk/modules/knowledge_graph.py
@@ -0,0 +1,52 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import TypedDict, Optional, List, Dict, Any
+
+
+class KnowledgeGraphNode(TypedDict, total=False):
+    """Knowledge graph node (total=False: every key is optional)."""
+    id: str
+    label: str
+    pagerank: Optional[float]
+    properties: Optional[Dict[str, Any]]
+
+
+class KnowledgeGraphEdge(TypedDict, total=False):
+    """Knowledge graph edge (total=False: every key is optional)."""
+    source: str
+    target: str
+    weight: Optional[float]
+    label: Optional[str]
+    properties: Optional[Dict[str, Any]]
+
+
+class KnowledgeGraphData(TypedDict, total=False):
+    """Knowledge graph data."""
+    graph: Dict[str, Any]  # contains nodes and edges
+    mind_map: Dict[str, Any]
+
+
+class KnowledgeGraphTaskInfo(TypedDict, total=False):
+    """Knowledge graph build-task information (total=False: every key is optional)."""
+    graphrag_task_id: str
+    status: Optional[str]
+    progress: Optional[float]
+    progress_msg: Optional[str]
+    begin_at: Optional[str]
+    create_time: Optional[int]
+    update_time: Optional[int]
+
diff --git a/powerrag/sdk/modules/knowledge_graph_manager.py b/powerrag/sdk/modules/knowledge_graph_manager.py
new file mode 100644
index 000000000..2ead3c75d
--- /dev/null
+++ b/powerrag/sdk/modules/knowledge_graph_manager.py
@@ -0,0 +1,102 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Optional, Dict, Any
+from .knowledge_graph import KnowledgeGraphData, KnowledgeGraphTaskInfo
+
+
+class KnowledgeGraphManager:
+    """Knowledge graph management module."""
+    
+    def __init__(self, client):
+        """
+        Initialize the knowledge graph management module.
+        
+        Args:
+            client: PowerRAG client instance used to send the HTTP requests.
+        """
+        self.client = client
+    
+    def build(self, kb_id: str) -> KnowledgeGraphTaskInfo:
+        """
+        Build the knowledge graph (asynchronous).
+        
+        Note: the KnowledgeGraph settings are read from the knowledge base's `parser_config.graphrag`.
+        Set those parameters when creating or updating the knowledge base.
+        
+        Args:
+            kb_id: Knowledge base ID.
+        
+        Returns:
+            Task information, including graphrag_task_id.
+        
+        Raises:
+            Exception: The API call failed (response "code" is not 0).
+        """
+        url = f"/datasets/{kb_id}/run_graphrag"
+        res = self.client.post(url)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Build knowledge graph failed"))
+        
+        return res_json.get("data", {})
+    
+    def get(self, kb_id: str) -> KnowledgeGraphData:
+        """
+        Get the knowledge graph.
+        
+        Args:
+            kb_id: Knowledge base ID.
+        
+        Returns:
+            Knowledge graph data containing graph and mind_map (both empty when "data" is absent).
+        
+        Raises:
+            Exception: The API call failed (response "code" is not 0).
+        """
+        url = f"/datasets/{kb_id}/knowledge_graph"
+        res = self.client.get(url)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Get knowledge graph failed"))
+        
+        return res_json.get("data", {"graph": {}, "mind_map": {}})
+    
+    def get_status(self, kb_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Get the knowledge graph build status.
+        
+        Args:
+            kb_id: Knowledge base ID.
+        
+        Returns:
+            Task status information, or None when the response carries no data.
+        
+        Raises:
+            Exception: The API call failed (response "code" is not 0).
+        """
+        url = f"/datasets/{kb_id}/trace_graphrag"
+        res = self.client.get(url)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Get knowledge graph status failed"))
+        
+        data = res_json.get("data", {})
+        return data if data else None  # empty or missing data is normalized to None
+
diff --git a/powerrag/sdk/modules/raptor.py b/powerrag/sdk/modules/raptor.py
new file mode 100644
index 000000000..5516c9862
--- /dev/null
+++ b/powerrag/sdk/modules/raptor.py
@@ -0,0 +1,29 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import TypedDict, Optional, Dict, Any
+
+
+class RAPTORTaskInfo(TypedDict, total=False):
+    """RAPTOR build-task information (total=False: every key is optional)."""
+    raptor_task_id: str
+    status: Optional[str]
+    progress: Optional[float]
+    progress_msg: Optional[str]
+    begin_at: Optional[str]
+    create_time: Optional[int]
+    update_time: Optional[int]
+
diff --git a/powerrag/sdk/modules/raptor_manager.py b/powerrag/sdk/modules/raptor_manager.py
new file mode 100644
index 000000000..4bb814c67
--- /dev/null
+++ b/powerrag/sdk/modules/raptor_manager.py
@@ -0,0 +1,83 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Optional, Dict, Any
+from .raptor import RAPTORTaskInfo
+
+
+class RAPTORManager:
+    """RAPTOR management module."""
+    
+    def __init__(self, client):
+        """
+        Initialize the RAPTOR management module.
+        
+        Args:
+            client: PowerRAG client instance used to send the HTTP requests.
+        """
+        self.client = client
+    
+    def build(self, kb_id: str) -> RAPTORTaskInfo:
+        """
+        Build RAPTOR (asynchronous).
+        
+        Note: the RAPTOR settings are read from the knowledge base's `parser_config.raptor`.
+        Set those parameters when creating or updating the knowledge base.
+        
+        Args:
+            kb_id: Knowledge base ID.
+        
+        Returns:
+            Task information, including raptor_task_id.
+        
+        Raises:
+            Exception: The API call failed (response "code" is not 0).
+        """
+        # Use the SDK's RAPTOR endpoint
+        url = f"/datasets/{kb_id}/run_raptor"
+        res = self.client.post(url)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Build RAPTOR failed"))
+        
+        return res_json.get("data", {})
+    
+    def get_status(self, kb_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Get the RAPTOR build status.
+        
+        Args:
+            kb_id: Knowledge base ID.
+        
+        Returns:
+            Task status information, or None when the response carries no data.
+        
+        Raises:
+            Exception: The API call failed (response "code" is not 0).
+        """
+        # Use the SDK's RAPTOR endpoint
+        url = f"/datasets/{kb_id}/trace_raptor"
+        res = self.client.get(url)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Get RAPTOR status failed"))
+        
+        data = res_json.get("data", {})
+        return data if data else None  # empty or missing data is normalized to None
+        
+
diff --git a/powerrag/sdk/modules/retrieval.py b/powerrag/sdk/modules/retrieval.py
new file mode 100644
index 000000000..0b0c92acb
--- /dev/null
+++ b/powerrag/sdk/modules/retrieval.py
@@ -0,0 +1,41 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import TypedDict, Optional, List, Dict, Any
+
+
+class RetrievalChunk(TypedDict, total=False):
+    """A chunk returned by retrieval (total=False: every key is optional)."""
+    id: str
+    content: str
+    document_id: str
+    dataset_id: str
+    similarity: float
+    important_keywords: List[str]
+    questions: List[str]
+    docnm_kwd: str
+    image_id: Optional[str]
+    available: bool
+    positions: List[List[int]]
+
+
+class RetrievalResult(TypedDict, total=False):
+    """Retrieval result (total=False: every key is optional)."""
+    total: int
+    chunks: List[RetrievalChunk]
+    doc_aggs: Dict[str, Any]  # document aggregation info; NOTE(review): confirm the API returns a dict here rather than a list
+    labels: Optional[Dict[str, Any]]  # label information
+
diff --git a/powerrag/sdk/modules/retrieval_manager.py b/powerrag/sdk/modules/retrieval_manager.py
new file mode 100644
index 000000000..239104531
--- /dev/null
+++ b/powerrag/sdk/modules/retrieval_manager.py
@@ -0,0 +1,158 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from typing import Optional, List, Dict, Any
+from .retrieval import RetrievalResult
+
+
+class RetrievalManager:
+    """Retrieval management module."""
+    
+    def __init__(self, client):
+        """
+        Initialize the retrieval management module.
+        
+        Args:
+            client: PowerRAG client instance used to send the HTTP requests.
+        """
+        self.client = client
+    
+    def search(
+        self,
+        kb_ids: List[str],
+        question: str,
+        document_ids: Optional[List[str]] = None,
+        page: int = 1,
+        page_size: int = 30,
+        similarity_threshold: float = 0.2,
+        vector_similarity_weight: float = 0.3,
+        top_k: int = 1024,
+        keyword: bool = False,
+        use_kg: bool = False,
+        rerank_id: Optional[str] = None,
+        highlight: bool = True,
+        cross_languages: Optional[List[str]] = None,
+        metadata_condition: Optional[Dict[str, Any]] = None,
+    ) -> RetrievalResult:
+        """
+        Retrieve chunks (vector / keyword / hybrid).
+        
+        Args:
+            kb_ids: List of knowledge base IDs (sent to the API as "dataset_ids").
+            question: Query text.
+            document_ids: List of document IDs (optional, used as a filter).
+            page: Page number, default 1.
+            page_size: Items per page, default 30.
+            similarity_threshold: Similarity threshold, default 0.2.
+            vector_similarity_weight: Vector similarity weight (used for hybrid retrieval), default 0.3.
+            top_k: Maximum number of results, default 1024.
+            keyword: Whether to use keyword enhancement, default False.
+            use_kg: Whether to use knowledge graph retrieval, default False.
+            rerank_id: Rerank model ID (optional).
+            highlight: Whether to highlight matched content, default True.
+            cross_languages: List of languages for cross-language retrieval (optional).
+            metadata_condition: Metadata filter condition (optional).
+        
+        Returns:
+            Retrieval result containing the chunks list and the total count.
+        
+        Raises:
+            Exception: The API call failed (response "code" is not 0).
+        """
+        payload = {
+            "dataset_ids": kb_ids,
+            "question": question,
+            "page": page,
+            "page_size": page_size,
+            "similarity_threshold": similarity_threshold,
+            "vector_similarity_weight": vector_similarity_weight,
+            "top_k": top_k,
+            "keyword": keyword,
+            "use_kg": use_kg,
+            "highlight": highlight,
+        }
+        
+        if document_ids:
+            payload["document_ids"] = document_ids
+        if rerank_id:
+            payload["rerank_id"] = rerank_id
+        if cross_languages:
+            payload["cross_languages"] = cross_languages
+        if metadata_condition:
+            payload["metadata_condition"] = metadata_condition
+        
+        url = "/retrieval"
+        res = self.client.post(url, json=payload)
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Search failed"))
+        
+        return res_json.get("data", {"total": 0, "chunks": []})
+    
+    def test(
+        self,
+        kb_ids: List[str],
+        question: str,
+        document_ids: Optional[List[str]] = None,
+        page: int = 1,
+        page_size: int = 30,
+        similarity_threshold: float = 0.2,
+        vector_similarity_weight: float = 0.3,
+        top_k: int = 1024,
+        keyword: bool = False,
+        use_kg: bool = False,
+        rerank_id: Optional[str] = None,
+        highlight: bool = True,
+    ) -> RetrievalResult:
+        """
+        Retrieval test (delegates to search(); cross_languages and metadata_condition are not exposed).
+        
+        Args:
+            kb_ids: List of knowledge base IDs.
+            question: Query text.
+            document_ids: List of document IDs (optional).
+            page: Page number, default 1.
+            page_size: Items per page, default 30.
+            similarity_threshold: Similarity threshold, default 0.2.
+            vector_similarity_weight: Vector similarity weight, default 0.3.
+            top_k: Maximum number of results, default 1024.
+            keyword: Whether to use keyword enhancement, default False.
+            use_kg: Whether to use knowledge graph retrieval, default False.
+            rerank_id: Rerank model ID (optional).
+            highlight: Whether to highlight matched content, default True.
+        
+        Returns:
+            Retrieval result.
+        
+        Raises:
+            Exception: The API call failed (response "code" is not 0).
+        """
+        return self.search(
+            kb_ids=kb_ids,
+            question=question,
+            document_ids=document_ids,
+            page=page,
+            page_size=page_size,
+            similarity_threshold=similarity_threshold,
+            vector_similarity_weight=vector_similarity_weight,
+            top_k=top_k,
+            keyword=keyword,
+            use_kg=use_kg,
+            rerank_id=rerank_id,
+            highlight=highlight,
+        )
+
diff --git a/powerrag/sdk/tests/conftest.py b/powerrag/sdk/tests/conftest.py
new file mode 100644
index 000000000..78075d876
--- /dev/null
+++ b/powerrag/sdk/tests/conftest.py
@@ -0,0 +1,231 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import os
+import time
+import pytest
+from pathlib import Path
+
+from powerrag.sdk import PowerRAGClient
+
+# Configuration is read from environment variables; no credential is stored in the repository.
+HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9222")
+API_KEY = os.getenv("POWERRAG_API_KEY", "")
+
+
+@pytest.fixture(scope="session")
+def client():
+    """Return the session-wide PowerRAGClient; skip when no API key is configured."""
+    # An API key used to be committed here as the fallback value. Credentials
+    # must come from POWERRAG_API_KEY, so without it the integration tests are
+    # skipped instead of running against a key kept in version control.
+    if not API_KEY:
+        pytest.skip("POWERRAG_API_KEY is not set; skipping PowerRAG SDK integration tests")
+    return PowerRAGClient(api_key=API_KEY, base_url=HOST_ADDRESS)
+
+
+@pytest.fixture(scope="function")
+def kb_id(client: PowerRAGClient):
+    """
+    Create a knowledge base for a test.
+    
+    Args:
+        client: PowerRAG client instance.
+    
+    Yields:
+        Knowledge base ID.
+    
+    Teardown:
+        Deletes the knowledge base; deletion errors are ignored.
+    """
+    kb = client.knowledge_base.create(name=f"test_kb_{os.getpid()}")
+    yield kb["id"]
+    # Cleanup: delete the test knowledge base (best effort)
+    try:
+        client.knowledge_base.delete([kb["id"]])
+    except Exception:
+        pass
+
+
+@pytest.fixture(scope="function")
+def doc_id(client: PowerRAGClient, kb_id: str, test_file_path: str):
+    """
+    Upload a document for a test.
+    
+    Args:
+        client: PowerRAG client instance.
+        kb_id: Knowledge base ID.
+        test_file_path: Path of the test file to upload.
+    
+    Yields:
+        ID of the first uploaded document.
+    """
+    docs = client.document.upload(kb_id, test_file_path)
+    yield docs[0]["id"]
+    # Cleanup: delete the test document (best effort)
+    try:
+        client.document.delete(kb_id, [docs[0]["id"]])
+    except Exception:
+        pass
+
+
+@pytest.fixture(scope="function")
+def chunk_id(client: PowerRAGClient, kb_id: str, doc_id: str):
+    """
+    Create a chunk for a test.
+    
+    Args:
+        client: PowerRAG client instance.
+        kb_id: Knowledge base ID.
+        doc_id: Document ID.
+    
+    Yields:
+        Chunk ID.
+    """
+    chunk = client.chunk.create(
+        kb_id,
+        doc_id,
+        content="Test chunk content for testing"
+    )
+    yield chunk["id"]
+    # Cleanup: delete the test chunk (best effort)
+    try:
+        client.chunk.delete(kb_id, doc_id, [chunk["id"]])
+    except Exception:
+        pass
+
+
+@pytest.fixture(scope="function")
+def test_file_path(tmp_path):
+    """
+    Create a test file (HTML format, which parse_to_md supports).
+    
+    Args:
+        tmp_path: pytest temporary directory.
+    
+    Returns:
+        Path of the test file, as a string.
+    """
+    test_file = tmp_path / "test_document.html"
+    test_file.write_text("""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>Test Document</title>
+</head>
+<body>
+    <h1>Test Document</h1>
+    <p>This is a test document for PowerRAG SDK testing.</p>
+    
+    <h2>Section 1</h2>
+    <p>This is the first section with some content.</p>
+    
+    <h2>Section 2</h2>
+    <p>This is the second section with more content.</p>
+</body>
+</html>
+""")
+    return str(test_file)
+
+
+@pytest.fixture(scope="function")
+def test_files(tmp_path):
+    """
+    Create several test files (HTML format).
+    
+    Args:
+        tmp_path: pytest temporary directory.
+    
+    Returns:
+        List of the three test file paths, as strings.
+    """
+    files = []
+    for i in range(3):
+        test_file = tmp_path / f"test_document_{i}.html"
+        test_file.write_text(f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>Test Document {i}</title>
+</head>
+<body>
+    <h1>Test Document {i}</h1>
+    <p>This is test document {i} for PowerRAG SDK testing.</p>
+    
+    <h2>Content</h2>
+    <p>Sample content for document {i}.</p>
+</body>
+</html>
+""")
+        files.append(str(test_file))
+    return files
+
+
+@pytest.fixture(scope="function")
+def doc_ids(client: PowerRAGClient, kb_id: str, test_files: list):
+    """
+    Upload several documents for a test.
+    
+    Args:
+        client: PowerRAG client instance.
+        kb_id: Knowledge base ID.
+        test_files: List of test file paths.
+    
+    Yields:
+        List of document IDs.
+    """
+    docs = client.document.upload(kb_id, test_files)
+    doc_ids = [doc["id"] for doc in docs]
+    yield doc_ids
+    # Cleanup: delete the test documents (best effort)
+    try:
+        client.document.delete(kb_id, doc_ids)
+    except Exception:
+        pass
+
+
+@pytest.fixture(scope="function")
+def kb_with_docs(client: PowerRAGClient, test_files: list):
+    """
+    Create a knowledge base holding parsed documents (for tests such as RAPTOR that need documents).
+    
+    Args:
+        client: PowerRAG client instance.
+        test_files: List of test file paths.
+    
+    Yields:
+        Knowledge base ID.
+    """
+    # Create the knowledge base
+    kb = client.knowledge_base.create(name=f"test_kb_with_docs_{os.getpid()}")
+    kb_id = kb["id"]
+    
+    try:
+        # Upload the documents
+        docs = client.document.upload(kb_id, test_files)
+        doc_ids = [doc["id"] for doc in docs]
+        
+        # Parse the documents (wait=True is expected to block until parsing finishes)
+        client.document.parse_to_chunk(kb_id, doc_ids, wait=True)
+        
+        yield kb_id
+    finally:
+        # Cleanup: delete the test knowledge base, even if upload or parsing failed
+        try:
+            client.knowledge_base.delete([kb_id])
+        except Exception:
+            pass
+
diff --git a/powerrag/sdk/tests/pytest.ini b/powerrag/sdk/tests/pytest.ini
new file mode 100644
index 000000000..029953c9e
--- /dev/null
+++ b/powerrag/sdk/tests/pytest.ini
@@ -0,0 +1,20 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+[pytest]
+# pytest 配置文件
+# 注意：环境变量需要在运行 pytest 之前设置，或者通过 conftest.py 设置
+
diff --git a/powerrag/sdk/tests/test_chunk.py b/powerrag/sdk/tests/test_chunk.py
new file mode 100644
index 000000000..940460201
--- /dev/null
+++ b/powerrag/sdk/tests/test_chunk.py
@@ -0,0 +1,163 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import pytest
+from powerrag.sdk import PowerRAGClient
+
+
+class TestChunkList:
+    """Tests for listing chunks."""
+    
+    def test_list_chunks(self, client: PowerRAGClient, kb_id: str, doc_id: str):
+        """Test listing the chunks of a document."""
+        chunks, total, doc_info = client.chunk.list(kb_id, doc_id)
+        assert isinstance(chunks, list)
+        assert total >= 0
+    
+    def test_list_with_keywords(self, client: PowerRAGClient, kb_id: str, doc_id: str):
+        """Test searching chunks by keyword."""
+        chunks, total, _ = client.chunk.list(kb_id, doc_id, keywords="test")
+        assert isinstance(chunks, list)
+
+
+class TestChunkGet:
+    """Tests for fetching a single chunk."""
+    
+    def test_get_existing_chunk(self, client: PowerRAGClient, kb_id: str, doc_id: str, chunk_id: str):
+        """Test fetching a chunk that exists."""
+        chunk = client.chunk.get(kb_id, doc_id, chunk_id)
+        assert chunk["id"] == chunk_id
+    
+    def test_get_nonexistent_chunk(self, client: PowerRAGClient, kb_id: str, doc_id: str):
+        """Test that fetching a chunk that does not exist raises a "not found" error."""
+        with pytest.raises(Exception) as exc_info:
+            client.chunk.get(kb_id, doc_id, "nonexistent_id")
+        assert "not found" in str(exc_info.value).lower()
+
+
+class TestChunkCreate:
+    """Tests for creating chunks; each test deletes its chunk even when an assertion fails."""
+    
+    def test_create_chunk(self, client: PowerRAGClient, kb_id: str, doc_id: str):
+        """Test creating a chunk with content and important keywords."""
+        chunk = client.chunk.create(
+            kb_id,
+            doc_id,
+            content="Test chunk content",
+            important_keywords=["test", "chunk"]
+        )
+        try:
+            assert chunk["id"] is not None
+            assert chunk["content"] == "Test chunk content"
+        finally:  # cleanup must not be skipped by a failed assertion
+            client.chunk.delete(kb_id, doc_id, [chunk["id"]])
+    
+    def test_create_chunk_with_questions(self, client: PowerRAGClient, kb_id: str, doc_id: str):
+        """Test creating a chunk that carries questions."""
+        chunk = client.chunk.create(
+            kb_id,
+            doc_id,
+            content="Test content",
+            questions=["What is this?", "How does it work?"]
+        )
+        try:
+            assert len(chunk.get("questions", [])) == 2
+        finally:  # cleanup must not be skipped by a failed assertion
+            client.chunk.delete(kb_id, doc_id, [chunk["id"]])
+
+
+class TestChunkUpdate:
+    """Tests for updating chunks."""
+    
+    def test_update_content(self, client: PowerRAGClient, kb_id: str, doc_id: str, chunk_id: str):
+        """Test updating a chunk's content."""
+        updated_chunk = client.chunk.update(
+            kb_id,
+            doc_id,
+            chunk_id,
+            content="Updated content"
+        )
+        assert updated_chunk["content"] == "Updated content"
+    
+    def test_update_keywords(self, client: PowerRAGClient, kb_id: str, doc_id: str, chunk_id: str):
+        """Test updating a chunk's important keywords."""
+        updated_chunk = client.chunk.update(
+            kb_id,
+            doc_id,
+            chunk_id,
+            important_keywords=["new", "keywords"]
+        )
+        assert updated_chunk.get("important_keywords") == ["new", "keywords"]
+
+
+class TestChunkDelete:
+    """Tests for deleting chunks."""
+    
+    def test_delete_single_chunk(self, client: PowerRAGClient, kb_id: str, doc_id: str):
+        """Test deleting a single chunk; fetching it afterwards must raise."""
+        chunk = client.chunk.create(kb_id, doc_id, content="To be deleted")
+        chunk_id = chunk["id"]
+        
+        client.chunk.delete(kb_id, doc_id, [chunk_id])
+        
+        with pytest.raises(Exception):
+            client.chunk.get(kb_id, doc_id, chunk_id)
+    
+    def test_delete_multiple_chunks(self, client: PowerRAGClient, kb_id: str, doc_id: str):
+        """Test deleting several chunks in one call; fetching any of them afterwards must raise."""
+        chunk_ids = []
+        for i in range(3):
+            chunk = client.chunk.create(kb_id, doc_id, content=f"Chunk {i}")
+            chunk_ids.append(chunk["id"])
+        
+        client.chunk.delete(kb_id, doc_id, chunk_ids)
+        
+        for chunk_id in chunk_ids:
+            with pytest.raises(Exception):
+                client.chunk.get(kb_id, doc_id, chunk_id)
+
+
+class TestChunkSplitText:
+    """Tests for splitting raw text into chunks."""
+    
+    def test_split_text(self, client: PowerRAGClient):
+        """Test splitting markdown text with the "title" parser."""
+        markdown_text = """
+# 第一章
+
+这是第一章的内容...
+
+## 1.1 小节
+
+这是小节内容...
+"""
+        result = client.chunk.split_text(
+            text=markdown_text,
+            parser_id="title",
+            config={"title_level": 2, "chunk_token_num": 256}
+        )
+        assert result.get("total_chunks", 0) > 0 or len(result.get("chunks", [])) > 0
+    
+    def test_split_text_with_config(self, client: PowerRAGClient):
+        """Test splitting plain text with the "naive" parser and a custom config."""
+        text = "This is a test document with multiple paragraphs."
+        result = client.chunk.split_text(
+            text=text,
+            parser_id="naive",
+            config={"chunk_token_num": 128}
+        )
+        assert "chunks" in result or "total_chunks" in result
+
diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py
new file mode 100644
index 000000000..3aa1e098b
--- /dev/null
+++ b/powerrag/sdk/tests/test_document.py
@@ -0,0 +1,430 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import pytest
+from powerrag.sdk import PowerRAGClient
+
+
+class TestDocumentUpload:
+    """Tests for uploading documents; uploaded documents are deleted even when an assertion fails."""
+    
+    def test_upload_single_file(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test uploading a single file."""
+        docs = client.document.upload(kb_id, test_file_path)
+        try:
+            assert len(docs) == 1
+            assert docs[0]["id"] is not None
+            assert docs[0]["name"] is not None
+        finally:  # cleanup must not be skipped by a failed assertion
+            client.document.delete(kb_id, [docs[0]["id"]])
+    
+    def test_upload_multiple_files(self, client: PowerRAGClient, kb_id: str, test_files: list):
+        """Test uploading several files in one call."""
+        docs = client.document.upload(kb_id, test_files)
+        try:
+            assert len(docs) == len(test_files)
+        finally:
+            if docs:  # never send an empty ID list to delete
+                client.document.delete(kb_id, [doc["id"] for doc in docs])
+    
+    def test_upload_nonexistent_file(self, client: PowerRAGClient, kb_id: str):
+        """Test that uploading a file that does not exist raises FileNotFoundError."""
+        with pytest.raises(FileNotFoundError):
+            client.document.upload(kb_id, "nonexistent.pdf")
+
+
+class TestDocumentList:
+    """Tests for listing documents."""
+    
+    def test_list_all_documents(self, client: PowerRAGClient, kb_id: str):
+        """Test listing all documents."""
+        docs, total = client.document.list(kb_id)
+        assert isinstance(docs, list)
+        assert total >= 0
+    
+    def test_list_with_filter(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test listing documents filtered by name."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_name = uploaded_docs[0]["name"]
+        
+        try:
+            docs, total = client.document.list(kb_id, name=doc_name)
+            assert len(docs) >= 1
+            assert any(doc["name"] == doc_name for doc in docs)
+        finally:
+            client.document.delete(kb_id, [uploaded_docs[0]["id"]])
+
+
+class TestDocumentGet:
+    """Tests for fetching a single document."""
+    
+    def test_get_existing_document(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test fetching a document that exists."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            doc = client.document.get(kb_id, doc_id)
+            assert doc["id"] == doc_id
+        finally:
+            client.document.delete(kb_id, [doc_id])
+    
+    def test_get_nonexistent_document(self, client: PowerRAGClient, kb_id: str):
+        """Test that fetching a document that does not exist raises an error."""
+        # Use an ID with a valid UUID format that does not exist in the system
+        nonexistent_id = "a" * 32  # 32-character hexadecimal string
+        with pytest.raises(Exception) as exc_info:
+            client.document.get(kb_id, nonexistent_id)
+        # Check that the error message contains "not found" or "don't own"
+        error_msg = str(exc_info.value).lower()
+        assert "not found" in error_msg or "don't own" in error_msg
+
+
+class TestDocumentUpdate:
+    """Tests for updating documents."""
+    
+    def test_update_name(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test updating a document's name through update()."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            # Note: the file extension cannot be changed, so keep the .html extension
+            updated_doc = client.document.update(kb_id, doc_id, name="updated_name.html")
+            assert updated_doc["name"] == "updated_name.html"
+        finally:
+            client.document.delete(kb_id, [doc_id])
+    
+    def test_rename(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test renaming a document through rename()."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            # Note: the file extension cannot be changed, so keep the .html extension
+            renamed_doc = client.document.rename(kb_id, doc_id, "renamed.html")
+            assert renamed_doc["name"] == "renamed.html"
+        finally:
+            client.document.delete(kb_id, [doc_id])
+    
+    def test_set_meta(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test setting a document's metadata fields."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            meta_fields = {"author": "Test Author", "category": "Test"}
+            updated_doc = client.document.set_meta(kb_id, doc_id, meta_fields)
+            assert updated_doc.get("meta_fields") == meta_fields
+        finally:
+            client.document.delete(kb_id, [doc_id])
+
+
+class TestDocumentDelete:
+    """Tests for deleting documents."""
+    
+    def test_delete_single_document(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test deleting a single document; fetching it afterwards must raise."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        client.document.delete(kb_id, [doc_id])
+        
+        with pytest.raises(Exception):
+            client.document.get(kb_id, doc_id)
+    
+    def test_delete_multiple_documents(self, client: PowerRAGClient, kb_id: str, test_files: list):
+        """Test deleting several documents in one call; fetching any of them afterwards must raise."""
+        uploaded_docs = client.document.upload(kb_id, test_files)
+        doc_ids = [doc["id"] for doc in uploaded_docs]
+        
+        client.document.delete(kb_id, doc_ids)
+        
+        for doc_id in doc_ids:
+            with pytest.raises(Exception):
+                client.document.get(kb_id, doc_id)
+
+
+class TestDocumentDownload:
+    """Tests for downloading documents."""
+    
+    def test_download_to_bytes(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test downloading a document as bytes (no save_path given)."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            content = client.document.download(kb_id, doc_id)
+            assert isinstance(content, bytes)
+            assert len(content) > 0
+        finally:
+            client.document.delete(kb_id, [doc_id])
+    
+    def test_download_to_file(self, client: PowerRAGClient, kb_id: str, test_file_path: str, tmp_path):
+        """Test downloading a document to a file; the call must return the save path."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            save_path = tmp_path / "downloaded_file.html"
+            result_path = client.document.download(kb_id, doc_id, save_path=str(save_path))
+            
+            assert result_path == str(save_path)
+            assert save_path.exists()
+            assert save_path.stat().st_size > 0
+        finally:
+            client.document.delete(kb_id, [doc_id])
+
+
+class TestDocumentParse:
+    """Tests for parsing documents into chunks."""
+    
+    def test_parse_to_chunk_sync(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test synchronous parsing into chunks (wait=True)."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            results = client.document.parse_to_chunk(kb_id, [doc_id], wait=True)
+            assert len(results) == 1
+            assert results[0]["status"] == "DONE"
+        finally:
+            client.document.delete(kb_id, [doc_id])
+    
+    def test_parse_to_chunk_async(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test asynchronous parsing into chunks (wait=False)."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            task_id = client.document.parse_to_chunk(kb_id, [doc_id], wait=False)
+            assert task_id is not None
+        finally:
+            client.document.delete(kb_id, [doc_id])
+    
+    def test_cancel_parse(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test cancelling a parse that was started with wait=False."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            client.document.parse_to_chunk(kb_id, [doc_id], wait=False)
+            client.document.cancel_parse(kb_id, [doc_id])
+            
+            doc = client.document.get(kb_id, doc_id)
+            assert doc["run"] in ["CANCEL", "UNSTART"]
+        finally:
+            client.document.delete(kb_id, [doc_id])
+
+
+class TestDocumentParseToMD:
+    """Tests for parsing a document to Markdown (without chunking)."""
+    
+    def test_parse_to_md_basic(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test basic parse_to_md behaviour."""
+        # Upload the document
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            # Parse to Markdown
+            result = client.document.parse_to_md(doc_id)
+            
+            # Verify the returned result
+            assert "doc_id" in result
+            assert "doc_name" in result
+            assert "markdown" in result
+            assert "markdown_length" in result
+            assert result["doc_id"] == doc_id
+            assert isinstance(result["markdown"], str)
+            assert result["markdown_length"] > 0
+        finally:
+            client.document.delete(kb_id, [doc_id])
+    
+    def test_parse_to_md_with_config(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test parse_to_md with a config argument."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            # Parse using a config
+            config = {
+                "layout_recognize": "mineru",
+                "enable_ocr": False,
+                "enable_formula": False,
+                "enable_table": True
+            }
+            result = client.document.parse_to_md(doc_id, config=config)
+            
+            # Verify the returned result
+            assert result["doc_id"] == doc_id
+            assert "markdown" in result
+            assert len(result["markdown"]) > 0
+        finally:
+            client.document.delete(kb_id, [doc_id])
+    
+    def test_parse_to_md_nonexistent_doc(self, client: PowerRAGClient):
+        """Test that parsing a document that does not exist raises an error."""
+        nonexistent_id = "nonexistent_doc_id_123"
+        
+        with pytest.raises(Exception) as exc_info:
+            client.document.parse_to_md(nonexistent_id)
+        
+        # Verify the error message
+        error_msg = str(exc_info.value).lower()
+        assert "not found" in error_msg or "failed" in error_msg
+    
+    def test_parse_to_md_with_images(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test the image-related fields of the parse_to_md result."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            result = client.document.parse_to_md(doc_id)
+            
+            # Verify the image-related fields
+            assert "images" in result
+            assert "total_images" in result
+            assert isinstance(result["images"], dict)
+            assert isinstance(result["total_images"], int)
+            assert result["total_images"] >= 0
+        finally:
+            client.document.delete(kb_id, [doc_id])
+
+
+class TestDocumentParseToMDAsync:
+    """Tests for asynchronously parsing a document to Markdown."""
+    
+    def test_parse_to_md_async_basic(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test the basic asynchronous flow: submit, query status, wait for the result."""
+        # Upload the document
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            # Submit the asynchronous task
+            task_id = client.document.parse_to_md_async(doc_id)
+            assert task_id
+            assert len(task_id) > 0
+            
+            # Query the task status
+            status = client.document.get_parse_to_md_status(task_id)
+            assert "task_id" in status
+            assert "status" in status
+            assert status["status"] in ["pending", "processing", "success", "failed"]
+            
+            # Wait for the task to finish
+            result = client.document.wait_for_parse_to_md(task_id, timeout=300)
+            assert result["status"] == "success"
+            assert "result" in result
+            assert "markdown" in result["result"]
+            assert result["result"]["markdown_length"] > 0
+            
+        finally:
+            client.document.delete(kb_id, [doc_id])
+    
+    def test_parse_to_md_async_with_config(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test asynchronous parsing with a config."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            # Submit the asynchronous task with a config
+            task_id = client.document.parse_to_md_async(
+                doc_id,
+                config={
+                    "layout_recognize": "mineru",
+                    "enable_ocr": False,
+                    "enable_table": True
+                }
+            )
+            
+            # Wait for completion
+            result = client.document.wait_for_parse_to_md(task_id, timeout=300)
+            assert result["status"] == "success"
+            assert result["result"]["markdown_length"] > 0
+            
+        finally:
+            client.document.delete(kb_id, [doc_id])
+    
+    def test_parse_to_md_async_nonexistent_doc(self, client: PowerRAGClient):
+        """Test that asynchronously parsing a document that does not exist raises an error."""
+        with pytest.raises(Exception) as exc_info:
+            client.document.parse_to_md_async("nonexistent_doc_id")
+        
+        assert "not found" in str(exc_info.value).lower() or "failed" in str(exc_info.value).lower()
+    
+    def test_get_parse_to_md_status_not_found(self, client: PowerRAGClient):
+        """Test that querying an unknown task ID reports status "not_found"."""
+        status = client.document.get_parse_to_md_status("nonexistent_task_id")
+        assert status["status"] == "not_found"
+    
+    def test_wait_for_parse_to_md_timeout(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
+        """Test that waiting on a task times out (using a very short timeout)."""
+        uploaded_docs = client.document.upload(kb_id, test_file_path)
+        doc_id = uploaded_docs[0]["id"]
+        
+        try:
+            task_id = client.document.parse_to_md_async(doc_id)
+            
+            # Use a very short timeout (0.1 seconds) to trigger the timeout
+            with pytest.raises(TimeoutError):
+                client.document.wait_for_parse_to_md(task_id, timeout=0.1, interval=0.05)
+            
+        finally:
+            client.document.delete(kb_id, [doc_id])
+
+
+class TestDocumentParseToMDUpload:
+    """Tests for uploading a file directly and parsing it to Markdown."""
+    
+    def test_parse_to_md_upload_json_response(self, client: PowerRAGClient, test_file_path: str):
+        """Test uploading a file and receiving the result fields."""
+        result = client.document.parse_to_md_upload(test_file_path)
+        
+        # Verify the returned result
+        assert "filename" in result
+        assert "markdown" in result
+        assert "markdown_length" in result
+        assert "images" in result
+        assert "total_images" in result
+        assert isinstance(result["markdown"], str)
+        assert result["markdown_length"] > 0
+    
+    def test_parse_to_md_upload_with_config(self, client: PowerRAGClient, test_file_path: str):
+        """Test uploading and parsing with a config argument."""
+        config = {
+            "layout_recognize": "mineru",
+            "enable_ocr": False
+        }
+        result = client.document.parse_to_md_upload(test_file_path, config=config)
+        
+        assert "markdown" in result
+        assert len(result["markdown"]) > 0
+    
+    def test_parse_to_md_upload_nonexistent_file(self, client: PowerRAGClient):
+        """Test that uploading a file that does not exist raises FileNotFoundError."""
+        with pytest.raises(FileNotFoundError):
+            client.document.parse_to_md_upload("nonexistent_file.pdf")
+    
+    def test_parse_to_md_upload_different_formats(self, client: PowerRAGClient, test_file_path: str):
+        """Test uploading files of different formats (currently only the shared test file)."""
+        # Note: this test needs real files in different formats
+        # Only a txt file is tested here (NOTE(review): sibling tests treat test_file_path as .html - confirm); add more formats as needed
+        result = client.document.parse_to_md_upload(test_file_path)
+        
+        assert "markdown" in result
+        assert result["markdown_length"] > 0
\ No newline at end of file
diff --git a/powerrag/sdk/tests/test_extraction.py b/powerrag/sdk/tests/test_extraction.py
new file mode 100644
index 000000000..5aa9e771f
--- /dev/null
+++ b/powerrag/sdk/tests/test_extraction.py
@@ -0,0 +1,137 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import pytest
+from powerrag.sdk import PowerRAGClient
+
+
+class TestExtractFromDocument:
+    """Tests for extraction from a document."""
+    
+    def test_extract_entities(self, client: PowerRAGClient, doc_id: str):
+        """Test entity extraction."""
+        result = client.extraction.extract_from_document(
+            doc_id,
+            extractor_type="entity",
+            config={"entity_types": ["PERSON", "ORG"]}
+        )
+        assert result["extractor_type"] == "entity"
+        assert "data" in result
+    
+    def test_extract_keywords(self, client: PowerRAGClient, doc_id: str):
+        """Test keyword extraction."""
+        result = client.extraction.extract_from_document(
+            doc_id,
+            extractor_type="keyword",
+            config={"max_keywords": 20}
+        )
+        assert result["extractor_type"] == "keyword"
+        assert "data" in result
+    
+    def test_extract_summary(self, client: PowerRAGClient, doc_id: str):
+        """Test summary extraction."""
+        result = client.extraction.extract_from_document(
+            doc_id,
+            extractor_type="summary",
+            config={"max_length": 200}
+        )
+        assert result["extractor_type"] == "summary"
+        assert "data" in result
+
+
+class TestExtractFromText:
+    """Tests for extraction from raw text."""
+    
+    def test_extract_entities_from_text(self, client: PowerRAGClient):
+        """Test entity extraction from text."""
+        text = "John works at Microsoft in Seattle."
+        result = client.extraction.extract_from_text(
+            text,
+            extractor_type="entity",
+            config={"entity_types": ["PERSON", "ORG", "LOCATION"]}
+        )
+        assert result["extractor_type"] == "entity"
+        assert "data" in result
+    
+    def test_extract_keywords_from_text(self, client: PowerRAGClient):
+        """Test keyword extraction from text."""
+        text = "This is a test document about artificial intelligence and machine learning."
+        result = client.extraction.extract_from_text(
+            text,
+            extractor_type="keyword",
+            config={"max_keywords": 10}
+        )
+        assert result["extractor_type"] == "keyword"
+        assert "data" in result
+
+
+class TestExtractBatch:
+    """Tests for batch extraction."""
+    
+    def test_extract_batch(self, client: PowerRAGClient, doc_ids: list):
+        """Test batch extraction: one result per document, each carrying a "success" key."""
+        results = client.extraction.extract_batch(
+            doc_ids,
+            extractor_type="entity"
+        )
+        assert len(results) == len(doc_ids)
+        assert all("success" in r for r in results)
+
+
+class TestStructExtract:
+    """Tests for structured extraction."""
+    
+    def test_struct_extract(self, client: PowerRAGClient):
+        """Test submitting a structured extraction; the response must carry a task_id."""
+        text = "John attended a conference in New York on January 1, 2024."
+        examples = [
+            {
+                "text": "John attended a conference in New York on January 1, 2024.",
+                "extractions": [
+                    {"extraction_class": "name", "extraction_text": "John"},
+                    {"extraction_class": "location", "extraction_text": "New York"},
+                    {"extraction_class": "date", "extraction_text": "January 1, 2024"}
+                ]
+            }
+        ]
+        
+        task_info = client.extraction.struct_extract(
+            text_or_documents=text,
+            prompt_description="Extract names, locations, and dates from the text.",
+            examples=examples
+        )
+        assert "task_id" in task_info
+    
+    def test_get_struct_extract_status(self, client: PowerRAGClient):
+        """Test fetching the status of a structured extraction task."""
+        text = "Test text for extraction."
+        examples = [
+            {
+                "text": "Test text for extraction.",
+                "extractions": []
+            }
+        ]
+        
+        task_info = client.extraction.struct_extract(
+            text_or_documents=text,
+            prompt_description="Extract information.",
+            examples=examples
+        )
+        task_id = task_info["task_id"]
+        
+        status = client.extraction.get_struct_extract_status(task_id)
+        assert "status" in status or "task_id" in status
+
diff --git a/powerrag/sdk/tests/test_knowledge_base.py b/powerrag/sdk/tests/test_knowledge_base.py
new file mode 100644
index 000000000..21058e643
--- /dev/null
+++ b/powerrag/sdk/tests/test_knowledge_base.py
@@ -0,0 +1,193 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import pytest
+from powerrag.sdk import PowerRAGClient
+
+
+class TestKnowledgeBaseCreate:
+    """Tests for creating knowledge bases; created ones are deleted even when an assertion fails."""
+    
+    def test_create_with_name_only(self, client: PowerRAGClient):
+        """Test creating a knowledge base with only a name and check the defaults."""
+        kb = client.knowledge_base.create(name="test_kb")
+        try:
+            assert kb["id"] is not None
+            assert kb["name"] == "test_kb"
+            assert kb["chunk_method"] == "naive"
+            assert kb["permission"] == "me"
+        finally:  # cleanup must not be skipped by a failed assertion
+            client.knowledge_base.delete([kb["id"]])
+    
+    def test_create_with_all_fields(self, client: PowerRAGClient):
+        """Test creating a knowledge base with all supported fields."""
+        # Note: the pagerank field can only be set on update, not on creation
+        kb = client.knowledge_base.create(
+            name="test_kb_full",
+            description="Test description",
+            embedding_model="BAAI/bge-small-en-v1.5@Builtin",
+            permission="team",
+            chunk_method="book",
+            parser_config={"chunk_token_num": 256}
+        )
+        try:
+            assert kb["name"] == "test_kb_full"
+            assert kb["description"] == "Test description"
+            assert kb["chunk_method"] == "book"
+            assert kb["permission"] == "team"
+        finally:  # cleanup must not be skipped by a failed assertion
+            client.knowledge_base.delete([kb["id"]])
+    
+    def test_create_duplicate_name(self, client: PowerRAGClient):
+        """Test creating a second knowledge base with an existing name (allowed or clearly rejected)."""
+        name = "duplicate_test"
+        kb1 = client.knowledge_base.create(name=name)
+        created_ids = [kb1["id"]]
+        try:
+            # Some systems allow duplicate names, so success is accepted as well
+            try:
+                created_ids.append(client.knowledge_base.create(name=name)["id"])
+            except Exception as e:
+                # A rejection must state that the name is already taken
+                assert "already exists" in str(e).lower() or "duplicate" in str(e).lower()
+        finally:
+            # Cleanup sits outside the inner try so a delete failure is not mistaken for a rejection
+            client.knowledge_base.delete(created_ids)
+
+
+class TestKnowledgeBaseGet:
+    """Tests for fetching a single knowledge base."""
+    
+    def test_get_existing_kb(self, client: PowerRAGClient):
+        """Test fetching a knowledge base that exists."""
+        kb = client.knowledge_base.create(name="get_test")
+        try:
+            fetched_kb = client.knowledge_base.get(kb["id"])
+            assert fetched_kb["id"] == kb["id"]
+            assert fetched_kb["name"] == kb["name"]
+        finally:
+            client.knowledge_base.delete([kb["id"]])
+    
+    def test_get_nonexistent_kb(self, client: PowerRAGClient):
+        """Test that fetching a knowledge base that does not exist raises an error."""
+        # Use an ID with a valid UUID format that does not exist in the system
+        # UUID v1 format: 32-digit hexadecimal string
+        nonexistent_id = "a" * 32  # 32-character hexadecimal string
+        with pytest.raises(Exception) as exc_info:
+            client.knowledge_base.get(nonexistent_id)
+        # Check that the error message contains "not found" or "invalid uuid"
+        error_msg = str(exc_info.value).lower()
+        assert "not found" in error_msg or "invalid uuid" in error_msg
+
+
+class TestKnowledgeBaseList:
+    """Tests for listing knowledge bases."""
+    
+    def test_list_all(self, client: PowerRAGClient):
+        """Test listing all knowledge bases after creating three."""
+        kb_ids = []
+        try:
+            for i in range(3):
+                kb = client.knowledge_base.create(name=f"list_test_{i}")
+                kb_ids.append(kb["id"])
+            
+            kbs, total = client.knowledge_base.list()
+            assert len(kbs) > 0
+            assert total >= 3
+        finally:
+            if kb_ids:
+                client.knowledge_base.delete(kb_ids)
+    
+    def test_list_with_filter(self, client: PowerRAGClient):
+        """Test listing knowledge bases filtered by name."""
+        name = "filter_test"
+        kb = client.knowledge_base.create(name=name)
+        try:
+            kbs, total = client.knowledge_base.list(name=name)
+            assert len(kbs) >= 1
+            assert any(kb_item["name"] == name for kb_item in kbs)
+        finally:
+            client.knowledge_base.delete([kb["id"]])
+    
+    def test_list_with_pagination(self, client: PowerRAGClient):
+        """Test listing knowledge bases with pagination (page_size=2 after creating five)."""
+        kb_ids = []
+        try:
+            for i in range(5):
+                kb = client.knowledge_base.create(name=f"page_test_{i}")
+                kb_ids.append(kb["id"])
+            
+            kbs_page1, total = client.knowledge_base.list(page=1, page_size=2)
+            assert len(kbs_page1) <= 2
+            assert total >= 5
+        finally:
+            if kb_ids:
+                client.knowledge_base.delete(kb_ids)
+
+
+class TestKnowledgeBaseUpdate:
+    """Tests for updating knowledge bases."""
+    
+    def test_update_name(self, client: PowerRAGClient):
+        """Test updating a knowledge base's name."""
+        kb = client.knowledge_base.create(name="update_test")
+        try:
+            updated_kb = client.knowledge_base.update(kb["id"], name="updated_name")
+            assert updated_kb["name"] == "updated_name"
+        finally:
+            client.knowledge_base.delete([kb["id"]])
+    
+    def test_update_multiple_fields(self, client: PowerRAGClient):
+        """Test updating several fields in one call."""
+        kb = client.knowledge_base.create(name="multi_update_test")
+        try:
+            updated_kb = client.knowledge_base.update(
+                kb["id"],
+                name="multi_updated",
+                description="Updated description",
+                permission="team"
+            )
+            assert updated_kb["name"] == "multi_updated"
+            assert updated_kb["description"] == "Updated description"
+            assert updated_kb["permission"] == "team"
+        finally:
+            client.knowledge_base.delete([kb["id"]])
+
+
+class TestKnowledgeBaseDelete:
+    """Tests for deleting knowledge bases."""
+    
+    def test_delete_single_kb(self, client: PowerRAGClient):
+        """Test deleting a single knowledge base; fetching it afterwards must raise."""
+        kb = client.knowledge_base.create(name="delete_test")
+        client.knowledge_base.delete([kb["id"]])
+        
+        with pytest.raises(Exception):
+            client.knowledge_base.get(kb["id"])
+    
+    def test_delete_multiple_kbs(self, client: PowerRAGClient):
+        """Test deleting several knowledge bases in one call; fetching any of them afterwards must raise."""
+        kb_ids = []
+        for i in range(3):
+            kb = client.knowledge_base.create(name=f"batch_delete_{i}")
+            kb_ids.append(kb["id"])
+        
+        client.knowledge_base.delete(kb_ids)
+        
+        for kb_id in kb_ids:
+            with pytest.raises(Exception):
+                client.knowledge_base.get(kb_id)
+
diff --git a/powerrag/sdk/tests/test_knowledge_graph.py b/powerrag/sdk/tests/test_knowledge_graph.py
new file mode 100644
index 000000000..2bb1ab170
--- /dev/null
+++ b/powerrag/sdk/tests/test_knowledge_graph.py
@@ -0,0 +1,90 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import pytest
+import time
+from powerrag.sdk import PowerRAGClient
+
+
+class TestKnowledgeGraphBuild:
+    """Tests for building a knowledge graph."""
+    
+    def test_build_knowledge_graph(self, client: PowerRAGClient, kb_with_docs: str):
+        """Starting a build returns task info that contains a graphrag_task_id."""
+        task_info = client.knowledge_graph.build(kb_with_docs)
+        assert "graphrag_task_id" in task_info
+    
+    def test_build_knowledge_graph_already_running(self, client: PowerRAGClient, kb_with_docs: str):
+        """A second build either returns task info or is rejected because one is running."""
+        # Start a first task
+        client.knowledge_graph.build(kb_with_docs)
+        
+        # The success check lives in `else` so a failed assertion is not caught by the except below
+        try:
+            task_info = client.knowledge_graph.build(kb_with_docs)
+        except Exception as e:
+            # A rejection must be because a task is already running
+            assert "running" in str(e).lower()
+        else:
+            assert "graphrag_task_id" in task_info
+
+
+class TestKnowledgeGraphGet:
+    """Tests for fetching knowledge graph data."""
+    
+    def test_get_knowledge_graph(self, client: PowerRAGClient, kb_with_docs: str):
+        """The returned data contains both a graph entry and a mind_map entry."""
+        kg_data = client.knowledge_graph.get(kb_with_docs)
+        assert "graph" in kg_data
+        assert "mind_map" in kg_data
+    
+    def test_get_knowledge_graph_empty(self, client: PowerRAGClient, kb_with_docs: str):
+        """With no graph data, the graph and mind_map entries are both empty dicts."""
+        # Make sure there is no knowledge graph data (best effort: delete errors are ignored)
+        try:
+            client.knowledge_graph.delete(kb_with_docs)
+        except Exception:
+            pass
+        
+        kg_data = client.knowledge_graph.get(kb_with_docs)
+        assert kg_data["graph"] == {}
+        assert kg_data["mind_map"] == {}
+
+
+class TestKnowledgeGraphStatus:
+    """Tests for querying knowledge graph task status."""
+    
+    def test_get_status(self, client: PowerRAGClient, kb_with_docs: str):
+        """After a build is started, the status is not None and has a progress key."""
+        # Start a build first; the returned task info is not needed here
+        client.knowledge_graph.build(kb_with_docs)
+        
+        # Query the status
+        status = client.knowledge_graph.get_status(kb_with_docs)
+        assert status is not None
+        assert "progress" in status
+    
+    def test_get_status_not_exists(self, client: PowerRAGClient, kb_with_docs: str):
+        """With no knowledge graph task, get_status returns None."""
+        # Make sure no task is left over - delete only when one exists (best effort)
+        try:
+            if client.knowledge_graph.get_status(kb_with_docs) is not None:
+                client.knowledge_graph.delete(kb_with_docs)
+        except Exception:
+            pass
+        
+        status = client.knowledge_graph.get_status(kb_with_docs)
+        assert status is None
\ No newline at end of file
diff --git a/powerrag/sdk/tests/test_raptor.py b/powerrag/sdk/tests/test_raptor.py
new file mode 100644
index 000000000..a2e7e889f
--- /dev/null
+++ b/powerrag/sdk/tests/test_raptor.py
@@ -0,0 +1,69 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import pytest
+import time
+from powerrag.sdk import PowerRAGClient
+
+
+class TestRAPTORBuild:
+    """Tests for building RAPTOR."""
+    
+    def test_build_raptor(self, client: PowerRAGClient, kb_with_docs: str):
+        """Starting a build returns task info that contains a raptor_task_id."""
+        task_info = client.raptor.build(kb_with_docs)
+        assert "raptor_task_id" in task_info
+    
+    def test_build_raptor_already_running(self, client: PowerRAGClient, kb_with_docs: str):
+        """A second build either returns task info or is rejected because one is running."""
+        # Start a first task
+        client.raptor.build(kb_with_docs)
+        
+        # The success check lives in `else` so a failed assertion is not caught by the except below
+        try:
+            task_info = client.raptor.build(kb_with_docs)
+        except Exception as e:
+            # A rejection must be because a task is already running
+            assert "running" in str(e).lower()
+        else:
+            assert "raptor_task_id" in task_info
+
+
+class TestRAPTORStatus:
+    """Tests for querying RAPTOR task status."""
+    
+    def test_get_status(self, client: PowerRAGClient, kb_with_docs: str):
+        """After a build is started, the status is not None and has a progress key."""
+        # Start a build first; the returned task info is not needed here
+        client.raptor.build(kb_with_docs)
+        
+        # Query the status
+        status = client.raptor.get_status(kb_with_docs)
+        assert status is not None
+        assert "progress" in status
+    
+    def test_get_status_not_exists(self, client: PowerRAGClient, kb_with_docs: str):
+        """With no RAPTOR task, get_status returns None."""
+        # Make sure no task is left over - delete only when one exists (best effort)
+        try:
+            if client.raptor.get_status(kb_with_docs) is not None:
+                client.raptor.delete(kb_with_docs)
+        except Exception:
+            pass
+        
+        status = client.raptor.get_status(kb_with_docs)
+        assert status is None
+
diff --git a/powerrag/sdk/tests/test_retrieval.py b/powerrag/sdk/tests/test_retrieval.py
new file mode 100644
index 000000000..062840d6d
--- /dev/null
+++ b/powerrag/sdk/tests/test_retrieval.py
@@ -0,0 +1,105 @@
+#
+#  Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import pytest
+from powerrag.sdk import PowerRAGClient
+
+
+class TestRetrievalSearch:
+    """Tests for retrieval search."""
+    
+    def test_basic_search(self, client: PowerRAGClient, kb_id: str):
+        """A basic search returns a result with a chunks list and a total."""
+        result = client.retrieval.search(
+            kb_ids=[kb_id],
+            question="测试问题"
+        )
+        assert "chunks" in result
+        assert "total" in result
+        assert isinstance(result["chunks"], list)
+    
+    def test_search_with_pagination(self, client: PowerRAGClient, kb_id: str):
+        """A paginated search returns at most page_size chunks."""
+        result = client.retrieval.search(
+            kb_ids=[kb_id],
+            question="测试问题",
+            page=1,
+            page_size=10
+        )
+        assert len(result["chunks"]) <= 10
+    
+    def test_search_with_similarity_threshold(self, client: PowerRAGClient, kb_id: str):
+        """Every returned chunk meets the requested similarity threshold."""
+        result = client.retrieval.search(
+            kb_ids=[kb_id],
+            question="测试问题",
+            similarity_threshold=0.5
+        )
+        # Each result must be at or above the threshold; a chunk without a similarity counts as 0
+        for chunk in result["chunks"]:
+            assert chunk.get("similarity", 0) >= 0.5
+    
+    def test_search_with_document_filter(self, client: PowerRAGClient, kb_id: str, doc_id: str):
+        """With a document filter, every returned chunk belongs to that document."""
+        result = client.retrieval.search(
+            kb_ids=[kb_id],
+            question="测试问题",
+            document_ids=[doc_id]
+        )
+        # Each result must come from the requested document (passes trivially with no results)
+        for chunk in result["chunks"]:
+            assert chunk["document_id"] == doc_id
+    
+    def test_search_with_keyword(self, client: PowerRAGClient, kb_id: str):
+        """A search with keyword=True still returns a chunks entry."""
+        result = client.retrieval.search(
+            kb_ids=[kb_id],
+            question="测试问题",
+            keyword=True
+        )
+        assert "chunks" in result
+    
+    def test_search_with_kg(self, client: PowerRAGClient, kb_id: str):
+        """A search with use_kg=True still returns a chunks entry."""
+        result = client.retrieval.search(
+            kb_ids=[kb_id],
+            question="测试问题",
+            use_kg=True
+        )
+        assert "chunks" in result
+    
+    def test_search_with_highlight(self, client: PowerRAGClient, kb_id: str):
+        """A search with highlight=True still returns a chunks entry."""
+        result = client.retrieval.search(
+            kb_ids=[kb_id],
+            question="测试问题",
+            highlight=True
+        )
+        assert "chunks" in result
+
+
+class TestRetrievalTest:
+    """Tests for the retrieval test method."""
+    
+    def test_retrieval_test(self, client: PowerRAGClient, kb_id: str):
+        """client.retrieval.test returns a result with chunks and total entries."""
+        result = client.retrieval.test(
+            kb_ids=[kb_id],
+            question="测试问题"
+        )
+        assert "chunks" in result
+        assert "total" in result
+
diff --git a/powerrag/server/app.py b/powerrag/server/app.py
index 6963e4245..f6abc7011 100644
--- a/powerrag/server/app.py
+++ b/powerrag/server/app.py
@@ -14,44 +14,39 @@
 #  limitations under the License.
 #
 
-"""PowerRAG Flask Application Configuration"""
+"""PowerRAG Quart Application Configuration"""
 
 import logging
 import json
-from flask import Flask
-from flask.json.provider import DefaultJSONProvider
-from flask_cors import CORS
+from quart import Quart
+from quart_cors import cors
 from api.utils.json_encode import CustomJSONEncoder
 
 logger = logging.getLogger(__name__)
 
 
-class CustomJSONProvider(DefaultJSONProvider):
-    """Custom JSON provider that supports Chinese characters without Unicode escaping"""
-    
-    def dumps(self, obj, **kwargs):
-        """Override dumps to ensure Chinese characters are not escaped"""
-        kwargs.setdefault('ensure_ascii', False)
-        kwargs.setdefault('cls', CustomJSONEncoder)
-        return json.dumps(obj, **kwargs)
-
-
 def create_app():
-    """Create and configure the PowerRAG Flask application"""
+    """Create and configure the PowerRAG Quart application"""
     
-    app = Flask(__name__)
+    app = Quart(__name__)
     
     # CORS configuration - allow requests from RAGFlow frontend
-    CORS(app, supports_credentials=True, max_age=2592000)
-    
-    # JSON encoder configuration
-    # Use custom JSON provider to ensure Chinese characters are displayed properly
-    app.json = CustomJSONProvider(app)
+    # Note: Cannot use allow_credentials=True with wildcard allow_origin="*"
+    # Since PowerRAG has its own API key authentication, we don't need credentials
+    app = cors(app, allow_origin="*", allow_credentials=False, allow_methods=["*"], allow_headers=["*"])
     
     # Request configuration
     app.url_map.strict_slashes = False
     app.config["MAX_CONTENT_LENGTH"] = 1024 * 1024 * 1024  # 1GB max upload
     
+    # Keep Chinese unescaped: jsonify() goes through app.json, which passes ensure_ascii
+    # explicitly (overriding the json.dumps patch below), so set it on the provider too.
+    app.json.ensure_ascii = False
+    @app.before_serving
+    async def setup_json_encoder():
+        import functools
+        json.dumps = functools.partial(json.dumps, cls=CustomJSONEncoder, ensure_ascii=False)
+    
     # Register blueprints
     from powerrag.server.routes.powerrag_routes import powerrag_bp
     from powerrag.server.routes.task_routes import task_bp
@@ -61,10 +56,10 @@ def create_app():
     
     # Health check endpoint
     @app.route("/health", methods=["GET"])
-    def health_check():
+    async def health_check():
         return {"status": "ok", "service": "powerrag"}, 200
     
-    logger.info("PowerRAG Flask application created successfully")
+    logger.info("PowerRAG Quart application created successfully")
     
     return app
 
diff --git a/powerrag/server/powerrag_server.py b/powerrag/server/powerrag_server.py
index 01d96d8aa..b235c0340 100644
--- a/powerrag/server/powerrag_server.py
+++ b/powerrag/server/powerrag_server.py
@@ -32,8 +32,6 @@
 project_root = Path(__file__).parent.parent.parent
 sys.path.insert(0, str(project_root))
 
-from werkzeug.serving import run_simple
-
 # Initialize logging
 from common.log_utils import init_root_logger
 init_root_logger("powerrag_server")
@@ -115,7 +113,7 @@ def main():
     signal.signal(signal.SIGINT, signal_handler)
     signal.signal(signal.SIGTERM, signal_handler)
     
-    # Create Flask app
+    # Create Quart app
     app = create_app()
     
     # Start server
@@ -128,13 +126,12 @@ def main():
         logger.info(f"  - POST http://{args.host}:{args.port}/api/v1/powerrag/extract")
         logger.info(f"  - GET  http://{args.host}:{args.port}/health")
         
-        run_simple(
-            hostname=args.host,
+        # Run Quart app
+        app.run(
+            host=args.host,
             port=args.port,
-            application=app,
-            threaded=True,
-            use_reloader=args.reload,  # Only reload if explicitly requested
-            use_debugger=args.debug,   # Debugger enabled with --debug
+            debug=args.debug,
+            use_reloader=args.reload,
         )
     except Exception as e:
         logger.error(f"Failed to start PowerRAG server: {e}", exc_info=True)
diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py
index 20d057a0c..4cc881810 100644
--- a/powerrag/server/routes/powerrag_routes.py
+++ b/powerrag/server/routes/powerrag_routes.py
@@ -18,7 +18,7 @@
 
 import os
 import logging
-from flask import Blueprint, request, jsonify, Response
+from quart import Blueprint, request, jsonify, Response
 from powerrag.server.services.parse_service import PowerRAGParseService
 from powerrag.server.services.convert_service import PowerRAGConvertService
 from powerrag.server.services.split_service import PowerRAGSplitService
@@ -50,7 +50,7 @@
 # ============================================================================
 
 @powerrag_bp.route("/run", methods=["POST"])
-def run_parse():
+async def run_parse():
     """
     Run PowerRAG parsing tasks using task_executor (async) - 推荐使用
     
@@ -83,7 +83,7 @@ def run_parse():
     """
     logger.info(f"=== PowerRAG /run endpoint called from {request.remote_addr} ===")
     try:
-        data = request.get_json()
+        data = await request.get_json()
         
         if not data:
             return jsonify({
@@ -219,7 +219,7 @@ def run_parse():
 
 @powerrag_bp.route("/parse", methods=["POST"])
 @apikey_required
-def parse_document(tenant_id):
+async def parse_document(tenant_id):
     """
     Quick parse for preview (synchronous) - 仅用于快速预览
     
@@ -247,7 +247,7 @@ def parse_document(tenant_id):
     }
     """
     try:
-        data = request.get_json()
+        data = await request.get_json()
         
         if not data:
             return jsonify({
@@ -300,7 +300,7 @@ def parse_document(tenant_id):
 
 @powerrag_bp.route("/parse/batch", methods=["POST"])
 @apikey_required
-def parse_documents_batch(tenant_id):
+async def parse_documents_batch(tenant_id):
     """
     Batch parse multiple documents (using ThreadPoolExecutor like FileService.parse_docs)
     
@@ -312,7 +312,7 @@ def parse_documents_batch(tenant_id):
     }
     """
     try:
-        data = request.get_json()
+        data = await request.get_json()
         
         if not data:
             return jsonify({
@@ -450,7 +450,7 @@ def parse_upload_file():
 
 @powerrag_bp.route("/convert", methods=["POST"])
 @apikey_required
-def convert_document(tenant_id):
+async def convert_document(tenant_id):
     """
     Convert document format using PowerRAG converters
     
@@ -478,7 +478,7 @@ def convert_document(tenant_id):
     - to_page (int): End page number (default: 100000)
     """
     try:
-        data = request.get_json()
+        data = await request.get_json()
         
         if not data:
             return jsonify({
@@ -517,7 +517,7 @@ def convert_document(tenant_id):
 
 @powerrag_bp.route("/convert/upload", methods=["POST"])
 @apikey_required
-def convert_upload_file(tenant_id):
+async def convert_upload_file(tenant_id):
     """
     Convert uploaded file directly (with file download support)
     
@@ -635,13 +635,483 @@ def convert_upload_file(tenant_id):
         }), 500
 
 
+# ============================================================================
+# Document-to-Markdown parsing endpoints (no chunking)
+# ============================================================================
+
+@powerrag_bp.route("/parse_to_md", methods=["POST"])
+@apikey_required
+async def parse_to_md(tenant_id):
+    """
+    Parse document to Markdown WITHOUT chunking
+    
+    Parses a stored document into Markdown without splitting it into chunks.
+    Intended for callers that need the whole document or do their own chunking.
+    
+    Supported file formats:
+    - PDF (.pdf)
+    - Office documents (.doc, .docx, .xls, .xlsx, .ppt, .pptx)
+    - Images (.jpg, .jpeg, .png)
+    - HTML (.html, .htm)
+    
+    Authentication: Requires RAGFlow API key in Authorization header (Bearer token)
+    
+    Request JSON:
+    {
+        "doc_id": "document_id",  // RAGFlow document ID
+        "config": {
+            "layout_recognize": "mineru",  // layout recognition engine: mineru or dots_ocr
+            "enable_ocr": false,           // whether to enable OCR
+            "enable_formula": false,       // whether to recognize formulas
+            "enable_table": true,          // whether to recognize tables
+            "from_page": 0,                // start page (PDF only)
+            "to_page": 100000              // end page (PDF only)
+        }
+    }
+    
+    Response:
+    {
+        "code": 0,
+        "data": {
+            "doc_id": "document_id",
+            "doc_name": "document.pdf",
+            "markdown": "# Title\n\nContent...",  // full Markdown content
+            "markdown_length": 5000,
+            "images": {                            // images found in the document (base64)
+                "image_001.png": "base64_data...",
+                "image_002.png": "base64_data..."
+            },
+            "total_images": 2
+        },
+        "message": "success"
+    }
+    """
+    try:
+        data = await request.get_json()
+        
+        if not data:
+            return jsonify({
+                "code": 400,
+                "message": "No JSON data provided"
+            }), 400
+        
+        doc_id = data.get("doc_id")
+        config = data.get("config") or {}  # tolerate an explicit "config": null
+        
+        if not doc_id:
+            return jsonify({
+                "code": 400,
+                "message": "doc_id is required"
+            }), 400
+        
+        # Get document from database
+        exist, doc = DocumentService.get_by_id(doc_id)
+        if not exist:
+            return jsonify({
+                "code": 404,
+                "message": f"Document {doc_id} not found"
+            }), 404
+        
+        # Get document binary data from storage
+        bucket, name = File2DocumentService.get_storage_address(doc_id=doc_id)
+        binary = settings.STORAGE_IMPL.get(bucket, name)
+        
+        if not binary:
+            return jsonify({
+                "code": 404,
+                "message": f"Document binary data not found for {doc_id}"
+            }), 404
+        
+        # Create service
+        gotenberg_url = config.get("gotenberg_url", GOTENBERG_URL)
+        service = PowerRAGParseService(gotenberg_url=gotenberg_url)
+        
+        # Parse document to markdown (no chunking)
+        from pathlib import Path
+        file_ext = Path(doc.name).suffix.lstrip('.').lower()
+        
+        # Determine format type
+        # Supported: PDF, Office (doc/docx/xls/xlsx/ppt/pptx), HTML, Images (jpg/jpeg/png)
+        format_type_map = {
+            'pdf': 'pdf',
+            'docx': 'office', 'doc': 'office',
+            'xlsx': 'office', 'xls': 'office',
+            'pptx': 'office', 'ppt': 'office',
+            'html': 'html', 'htm': 'html',
+            'jpg': 'image', 'jpeg': 'image',
+            'png': 'image'
+        }
+        
+        format_type = format_type_map.get(file_ext)
+        if not format_type:
+            return jsonify({
+                "code": 400,
+                "message": f"Unsupported file format: {file_ext}. Supported formats: pdf, doc, docx, xls, xlsx, ppt, pptx, jpg, jpeg, png, html, htm"
+            }), 400
+        
+        # Use _parse_to_markdown method (returns tuple of markdown_content and images)
+        md_content, images = service._parse_to_markdown(
+            filename=doc.name,
+            binary=binary,
+            format_type=format_type,
+            config=config
+        )
+        
+        return jsonify({
+            "code": 0,
+            "data": {
+                "doc_id": doc_id,
+                "doc_name": doc.name,
+                "markdown": md_content,
+                "markdown_length": len(md_content),
+                "images": images,
+                "total_images": len(images)
+            },
+            "message": "success"
+        }), 200
+        
+    except Exception as e:
+        logger.error(f"Parse to markdown error: {e}", exc_info=True)
+        return jsonify({
+            "code": 500,
+            "message": str(e)
+        }), 500
+
+
+@powerrag_bp.route("/parse_to_md/async", methods=["POST"])
+@apikey_required
+async def parse_to_md_async(tenant_id):
+    """
+    Parse document to Markdown asynchronously (submit task)
+    
+    Submits the document for background parsing to Markdown and returns a task ID.
+    Intended for large or slow documents; unsupported file extensions get a 400.
+    
+    Authentication: Requires RAGFlow API key in Authorization header (Bearer token)
+    
+    Request JSON:
+    {
+        "doc_id": "document_id",  // RAGFlow document ID
+        "config": {
+            "layout_recognize": "mineru",
+            "enable_ocr": false,
+            "enable_formula": false,
+            "enable_table": true
+        }
+    }
+    
+    Response:
+    {
+        "code": 0,
+        "data": {
+            "task_id": "uuid-string"
+        },
+        "message": "Task submitted successfully"
+    }
+    """
+    try:
+        data = await request.get_json()
+        
+        if not data:
+            return jsonify({
+                "code": 400,
+                "message": "No JSON data provided"
+            }), 400
+        
+        doc_id = data.get("doc_id")
+        config = data.get("config") or {}  # tolerate an explicit "config": null
+        
+        if not doc_id:
+            return jsonify({
+                "code": 400,
+                "message": "doc_id is required"
+            }), 400
+        
+        # Verify document exists and get binary data in the main thread
+        # (cannot access storage from thread pool workers)
+        exist, doc = DocumentService.get_by_id(doc_id)
+        if not exist:
+            return jsonify({
+                "code": 404,
+                "message": f"Document {doc_id} not found"
+            }), 404
+        
+        # Get document binary data NOW (in main thread with app context)
+        bucket, name = File2DocumentService.get_storage_address(doc_id=doc_id)
+        if not bucket or not name:
+            return jsonify({
+                "code": 404,
+                "message": f"Document storage address not found for {doc_id}"
+            }), 404
+        
+        binary = settings.STORAGE_IMPL.get(bucket, name)
+        if not binary:
+            return jsonify({
+                "code": 404,
+                "message": f"Document binary data not found for {doc_id}"
+            }), 404
+        
+        # Determine format type; reject unknown extensions instead of treating them as PDF
+        from pathlib import Path
+        file_ext = Path(doc.name).suffix.lstrip('.').lower()
+        format_type = {
+            'pdf': 'pdf', 'docx': 'office', 'doc': 'office', 'xlsx': 'office', 'xls': 'office',
+            'pptx': 'office', 'ppt': 'office', 'html': 'html', 'htm': 'html',
+            'jpg': 'image', 'jpeg': 'image', 'png': 'image'
+        }.get(file_ext)
+        if not format_type:
+            return jsonify({"code": 400, "message": f"Unsupported file format: {file_ext}. Supported formats: pdf, doc, docx, xls, xlsx, ppt, pptx, jpg, jpeg, png, html, htm"}), 400
+        
+        # Get task manager and service
+        from powerrag.server.services.parse_to_md_task_manager import get_task_manager
+        task_manager = get_task_manager()
+        
+        gotenberg_url = config.get("gotenberg_url", GOTENBERG_URL)
+        service = PowerRAGParseService(gotenberg_url=gotenberg_url)
+        
+        # Submit task with binary data (not doc_id)
+        task_id = task_manager.submit_task(
+            service=service,
+            method_name="parse_to_md",
+            filename=doc.name,
+            binary=binary,
+            format_type=format_type,
+            config=config
+        )
+        
+        return jsonify({
+            "code": 0,
+            "data": {
+                "task_id": task_id
+            },
+            "message": "Task submitted successfully"
+        }), 200
+        
+    except Exception as e:
+        logger.error(f"Parse to markdown async error: {e}", exc_info=True)
+        return jsonify({
+            "code": 500,
+            "message": str(e)
+        }), 500
+
+
+@powerrag_bp.route("/parse_to_md/status/<task_id>", methods=["GET"])
+@apikey_required
+async def get_parse_to_md_status(tenant_id, task_id):
+    """
+    Get parse_to_md task status and result
+    
+    Looks up the status (and result, once available) of an asynchronous parse task.
+    
+    Authentication: Requires RAGFlow API key in Authorization header (Bearer token)
+    
+    Response:
+    {
+        "code": 0,
+        "data": {
+            "task_id": "uuid-string",
+            "status": "pending|processing|success|failed|not_found",
+            "created_at": "2025-01-01T00:00:00",
+            "updated_at": "2025-01-01T00:00:00",
+            "result": {
+                "doc_id": "...",
+                "doc_name": "...",
+                "markdown": "...",
+                "markdown_length": 5000,
+                "images": {...},
+                "total_images": 2
+            },
+            "error": "Error message if failed"
+        },
+        "message": "success"
+    }
+    """
+    try:
+        from powerrag.server.services.parse_to_md_task_manager import get_task_manager
+        task_manager = get_task_manager()
+        
+        status = task_manager.get_task_status(task_id)
+        
+        if status.get("status") == "not_found":
+            return jsonify({
+                "code": 404,
+                "message": "Task not found",
+                "data": status
+            }), 404
+        
+        return jsonify({
+            "code": 0,
+            "data": status,
+            "message": "success"
+        }), 200
+        
+    except Exception as e:
+        logger.error(f"Get parse_to_md status error: {e}", exc_info=True)
+        return jsonify({
+            "code": 500,
+            "message": str(e)
+        }), 500
+
+
+@powerrag_bp.route("/parse_to_md/upload", methods=["POST"])
+@apikey_required
+async def parse_to_md_upload(tenant_id):
+    """
+    Parse uploaded file to Markdown WITHOUT chunking
+    
+    Parses a directly uploaded file into Markdown without splitting it into chunks.
+    
+    Supported file formats:
+    - PDF (.pdf)
+    - Office documents (.doc, .docx, .xls, .xlsx, .ppt, .pptx)
+    - Images (.jpg, .jpeg, .png)
+    - HTML (.html, .htm)
+    
+    Authentication: Requires RAGFlow API key in Authorization header (Bearer token)
+    
+    Request (multipart/form-data):
+    - file: File to parse (required) - supports PDF, Office (doc/docx/xls/xlsx/ppt/pptx), Images (jpg/jpeg/png), HTML
+    - config: JSON string of parser config (optional); must decode to a JSON object
+    
+    Config parameters:
+    - layout_recognize (str): mineru or dots_ocr (default: mineru)
+    - enable_ocr (bool): Enable OCR (default: false)
+    - enable_formula (bool): Enable formula recognition (default: false)
+    - enable_table (bool): Enable table recognition (default: true)
+    - from_page (int): Start page number (default: 0)
+    - to_page (int): End page number (default: 100000)
+    
+    Response JSON:
+    {
+        "code": 0,
+        "data": {
+            "filename": "document.pdf",
+            "markdown": "# Title\n\nContent...",
+            "markdown_length": 5000,
+            "images": {...},
+            "total_images": 2
+        },
+        "message": "success"
+    }
+    """
+    try:
+        # Check if file is present
+        files = await request.files
+        if 'file' not in files:
+            return jsonify({
+                "code": 400,
+                "message": "No file provided"
+            }), 400
+        
+        file = files['file']
+        if file.filename == '':
+            return jsonify({
+                "code": 400,
+                "message": "No file selected"
+            }), 400
+        
+        # Parse config from JSON string if provided
+        import json
+        form = await request.form
+        config_str = form.get('config', '{}')
+        try:
+            config = json.loads(config_str)
+        except json.JSONDecodeError:
+            config = None
+        # Valid JSON that is not an object (e.g. "[]" or "1") is rejected too: config is used as a dict below
+        if not isinstance(config, dict):
+            return jsonify({"code": 400, "message": "Invalid JSON in config parameter"}), 400
+        
+        # Read file binary
+        filename = file.filename
+        logger.info(f"Received file upload: filename={filename}, file object={file}")
+        
+        if not filename:
+            return jsonify({
+                "code": 400,
+                "message": "Filename is required"
+            }), 400
+        
+        binary = file.read()
+        if not binary:
+            return jsonify({
+                "code": 400,
+                "message": "File is empty"
+            }), 400
+        
+        # Add filename to config
+        config['filename'] = filename
+        
+        # Determine format type
+        from pathlib import Path
+        file_ext = Path(filename).suffix.lstrip('.').lower()
+        
+        logger.info(f"Parsed filename: {filename}, extension: '{file_ext}'")
+        
+        if not file_ext:
+            return jsonify({
+                "code": 400,
+                "message": f"File must have an extension. Filename: '{filename}', parsed extension: '{file_ext}'"
+            }), 400
+        
+        # Supported: PDF, Office (doc/docx/xls/xlsx/ppt/pptx), HTML, Images (jpg/jpeg/png)
+        format_type_map = {
+            'pdf': 'pdf',
+            'docx': 'office', 'doc': 'office',
+            'xlsx': 'office', 'xls': 'office',
+            'pptx': 'office', 'ppt': 'office',
+            'html': 'html', 'htm': 'html',
+            'jpg': 'image', 'jpeg': 'image',
+            'png': 'image'
+        }
+        
+        format_type = format_type_map.get(file_ext)
+        if not format_type:
+            return jsonify({
+                "code": 400,
+                "message": f"Unsupported file format: {file_ext}. Supported formats: pdf, doc, docx, xls, xlsx, ppt, pptx, jpg, jpeg, png, html, htm"
+            }), 400
+        
+        # Create service and parse
+        gotenberg_url = config.get("gotenberg_url", GOTENBERG_URL)
+        service = PowerRAGParseService(gotenberg_url=gotenberg_url)
+        
+        # Parse to markdown
+        md_content, images = service._parse_to_markdown(
+            filename=filename,
+            binary=binary,
+            format_type=format_type,
+            config=config
+        )
+        
+        # Return as JSON
+        return jsonify({
+            "code": 0,
+            "data": {
+                "filename": filename,
+                "markdown": md_content,
+                "markdown_length": len(md_content),
+                "images": images,
+                "total_images": len(images)
+            },
+            "message": "success"
+        }), 200
+        
+    except Exception as e:
+        logger.error(f"Parse to markdown (upload) error: {e}", exc_info=True)
+        return jsonify({
+            "code": 500,
+            "message": str(e)
+        }), 500
+
+
 # ============================================================================
 # 文档切片接口
 # ============================================================================
 
 @powerrag_bp.route("/split", methods=["POST"])
 @apikey_required
-def split_text(tenant_id):
+async def split_text(tenant_id):
     """
     Split text into chunks using powerrag/app chunking methods
     
@@ -670,7 +1140,7 @@ def split_text(tenant_id):
     }
     """
     try:
-        data = request.get_json()
+        data = await request.get_json()
         
         if not data:
             return jsonify({
@@ -717,7 +1187,7 @@ def split_text(tenant_id):
 
 @powerrag_bp.route("/extract", methods=["POST"])
 @apikey_required
-def extract_from_document(tenant_id):
+async def extract_from_document(tenant_id):
     """
     Extract information from document using PowerRAG extractors
     
@@ -732,7 +1202,7 @@ def extract_from_document(tenant_id):
     }
     """
     try:
-        data = request.get_json()
+        data = await request.get_json()
         
         if not data:
             return jsonify({
@@ -769,7 +1239,7 @@ def extract_from_document(tenant_id):
 
 @powerrag_bp.route("/extract/text", methods=["POST"])
 @apikey_required
-def extract_from_text(tenant_id):
+async def extract_from_text(tenant_id):
     """
     Extract information from raw text (no doc_id required)
     
@@ -781,7 +1251,7 @@ def extract_from_text(tenant_id):
     }
     """
     try:
-        data = request.get_json()
+        data = await request.get_json()
         
         if not data:
             return jsonify({
@@ -817,7 +1287,8 @@ def extract_from_text(tenant_id):
 
 
 @powerrag_bp.route("/extract/batch", methods=["POST"])
-def extract_batch(tenant_id):
+@apikey_required
+async def extract_batch(tenant_id):
     """
     Extract information from multiple documents
     
@@ -829,7 +1300,7 @@ def extract_batch(tenant_id):
     }
     """
     try:
-        data = request.get_json()
+        data = await request.get_json()
         
         if not data:
             return jsonify({
@@ -885,7 +1356,7 @@ def extract_batch(tenant_id):
 
 @powerrag_bp.route("/struct_extract/submit", methods=["POST"])
 @apikey_required
-def submit_extraction_task(tenant_id):
+async def submit_extraction_task(tenant_id):
     """
     Submit a langextract extraction task
     
@@ -952,7 +1423,7 @@ def submit_extraction_task(tenant_id):
     }
     """
     try:
-        data = request.get_json()
+        data = await request.get_json()
         
         if not data:
             return jsonify({
diff --git a/powerrag/server/routes/task_routes.py b/powerrag/server/routes/task_routes.py
index dbc99c6fe..feda60fc7 100644
--- a/powerrag/server/routes/task_routes.py
+++ b/powerrag/server/routes/task_routes.py
@@ -22,7 +22,7 @@
 """
 
 import logging
-from flask import Blueprint, request, jsonify
+from quart import Blueprint, request, jsonify
 
 from powerrag.server.services.task_queue_service import PowerRAGTaskQueueService
 
@@ -32,7 +32,7 @@
 
 
 @task_bp.route("/parse/async", methods=["POST"])
-def parse_document_async():
+async def parse_document_async():
     """
     Create an async parsing task using task_executor
     
@@ -62,7 +62,7 @@ def parse_document_async():
         }
     """
     try:
-        data = request.get_json()
+        data = await request.get_json()
         
         if not data:
             return jsonify({
@@ -112,7 +112,7 @@ def parse_document_async():
 
 
 @task_bp.route("/task/<task_id>", methods=["GET"])
-def get_task_status(task_id):
+async def get_task_status(task_id):
     """
     Get task status and progress
     
@@ -151,7 +151,7 @@ def get_task_status(task_id):
 
 
 @task_bp.route("/task/<task_id>/cancel", methods=["POST"])
-def cancel_task(task_id):
+async def cancel_task(task_id):
     """
     Cancel a running task
     
@@ -184,7 +184,7 @@ def cancel_task(task_id):
 
 
 @task_bp.route("/document/<doc_id>/chunks", methods=["GET"])
-def get_document_chunks(doc_id):
+async def get_document_chunks(doc_id):
     """
     Get parsed chunks for a completed document
     
diff --git a/powerrag/server/services/convert_service.py b/powerrag/server/services/convert_service.py
index fc08cf275..25b72a760 100644
--- a/powerrag/server/services/convert_service.py
+++ b/powerrag/server/services/convert_service.py
@@ -281,8 +281,10 @@ def _html_to_pdf(self, binary: bytes, config: Dict[str, Any]) -> bytes:
         filename = config.get('filename', 'document.html')
         
         try:
+            # According to https://gotenberg.dev/docs/routes#html-file-into-pdf-route
+            # The file MUST be named "index.html"
             url = f"{self.gotenberg_url}/forms/chromium/convert/html"
-            files = {'files': (filename, io.BytesIO(binary))}
+            files = {'files': ('index.html', io.BytesIO(binary))}
             
             logger.info(f"Converting HTML document to PDF via Gotenberg: {filename}")
             response = requests.post(url, files=files, timeout=60)
diff --git a/powerrag/server/services/extract_service.py b/powerrag/server/services/extract_service.py
index 9f8cfa200..9e0db84c2 100644
--- a/powerrag/server/services/extract_service.py
+++ b/powerrag/server/services/extract_service.py
@@ -23,14 +23,25 @@
 
 from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
-from common.settings import STORAGE_IMPL
+from common import settings
 
-# ⚠️ 延迟导入 PdfParser，避免启动时加载 OCR 模型
-# from deepdoc.parser import PdfParser as RAGFlowPdfParser
 
 logger = logging.getLogger(__name__)
 
 
+def _ensure_storage_initialized():
+    """Lazily initialise ``settings.STORAGE_IMPL``; raise RuntimeError if it stays unset."""
+    if settings.STORAGE_IMPL is not None:
+        return
+    logger.warning("STORAGE_IMPL not initialized, calling init_settings()")
+    settings.init_settings()
+    if settings.STORAGE_IMPL is not None:
+        return
+    message = "STORAGE_IMPL is not initialized. Please ensure init_settings() "
+    message += "is called during application startup."
+    raise RuntimeError(message)
+
+
 class PowerRAGExtractService:
     """Service for information extraction from documents"""
     
@@ -55,6 +66,9 @@ def extract_from_document(self, doc_id: str, extractor_type: str,
             Dict containing extracted information and metadata
         """
         try:
+            # Ensure storage is initialized
+            _ensure_storage_initialized()
+            
             # Get document
             exist, doc = DocumentService.get_by_id(doc_id)
             if not exist:
@@ -62,7 +76,7 @@ def extract_from_document(self, doc_id: str, extractor_type: str,
             
             # Get binary data and extract text
             bucket, name = File2DocumentService.get_storage_address(doc_id=doc_id)
-            binary = STORAGE_IMPL.get(bucket, name)
+            binary = settings.STORAGE_IMPL.get(bucket, name)
             
             if not binary:
                 raise ValueError(f"Document binary not found for {doc_id}")
diff --git a/powerrag/server/services/parse_service.py b/powerrag/server/services/parse_service.py
index 371edfe68..8e4f96abb 100644
--- a/powerrag/server/services/parse_service.py
+++ b/powerrag/server/services/parse_service.py
@@ -580,3 +580,100 @@ def parse_docs_batch(self, doc_ids: List[str], parser_type: str = None,
         
         return results
 
+    def _parse_to_markdown_for_task(self, doc_id: str = None, filename: str = None, 
+                                     binary: bytes = None, format_type: str = None,
+                                     config: Dict[str, Any] = None) -> Dict[str, Any]:
+        """
+        Parse document to Markdown for async task execution
+        
+        This is a wrapper method used by ParseToMdTaskManager for async execution.
+        It handles both doc_id-based and direct binary-based parsing.
+        
+        Args:
+            doc_id: Document ID (for database lookup); takes precedence when set
+            filename: Filename (for direct binary parsing)
+            binary: Binary data (for direct binary parsing)
+            format_type: Format type (for direct binary parsing)
+            config: Parser configuration
+        
+        Returns:
+            Dict with parsed results ("doc_id" is present only for doc_id lookups):
+            {
+                "doc_id": "...",
+                "doc_name": "...",
+                "markdown": "...",
+                "markdown_length": 5000,
+                "images": {...},
+                "total_images": 2
+            }
+        
+        Raises:
+            ValueError: Document/binary lookup failed or required inputs are missing
+        """
+        if config is None:
+            config = {}
+        
+        # Case 1: Parse from doc_id (from database)
+        if doc_id:
+            # Get document from database
+            exist, doc = DocumentService.get_by_id(doc_id)
+            if not exist:
+                raise ValueError(f"Document {doc_id} not found")
+            
+            # Get binary data from storage
+            bucket, name = File2DocumentService.get_storage_address(doc_id=doc_id)
+            
+            if not bucket or not name:
+                raise ValueError(f"Invalid storage address for document {doc_id}: bucket={bucket}, name={name}")
+            
+            # NOTE(review): relies on a module-level STORAGE_IMPL name - confirm it is set by now
+            storage = STORAGE_IMPL
+            if not storage:
+                raise ValueError("Storage implementation not available")
+            
+            try:
+                binary = storage.get(bucket, name)
+                if not binary:
+                    raise ValueError(f"Document binary data not found in storage: bucket={bucket}, name={name}")
+            except Exception as e:
+                logger.error(f"Failed to get binary for doc {doc_id}: {e}", exc_info=True)
+                # Chain the original exception so the root cause stays in the traceback
+                raise ValueError(f"Failed to retrieve document binary: {e}") from e
+            
+            # Determine format
+            file_ext = Path(doc.name).suffix.lstrip('.').lower()
+            format_type_map = {
+                'pdf': 'pdf', 'docx': 'office', 'doc': 'office',
+                'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office',
+                'html': 'html', 'htm': 'html',
+                'jpg': 'image', 'jpeg': 'image', 'png': 'image'
+            }
+            format_type = format_type_map.get(file_ext, 'pdf')
+            filename = doc.name
+        
+        # Case 2: direct binary parsing requires filename, binary and format_type together
+        elif not (filename and binary is not None and format_type):
+            raise ValueError("Must provide either doc_id or (filename, binary, format_type)")
+        
+        # Parse to markdown
+        md_content, images = self._parse_to_markdown(
+            filename=filename,
+            binary=binary,
+            format_type=format_type,
+            config=config
+        )
+        
+        result = {
+            "markdown": md_content,
+            "markdown_length": len(md_content),
+            "images": images,
+            "total_images": len(images) if images else 0
+        }
+        
+        # doc_id is reported only for database-backed documents; doc_name always
+        if doc_id:
+            result["doc_id"] = doc_id
+        result["doc_name"] = filename
+        
+        return result
+
diff --git a/powerrag/server/services/parse_to_md_task_manager.py b/powerrag/server/services/parse_to_md_task_manager.py
new file mode 100644
index 000000000..ce838fd11
--- /dev/null
+++ b/powerrag/server/services/parse_to_md_task_manager.py
@@ -0,0 +1,237 @@
+"""
+Parse to Markdown Task Manager
+
+Manages async tasks for parse_to_md operations.
+Provides task submission, status tracking, and result retrieval.
+"""
+
+import uuid
+import threading
+import logging
+from datetime import datetime
+from typing import Dict, Any, Optional
+from enum import Enum
+from concurrent.futures import ThreadPoolExecutor
+
+
+logger = logging.getLogger(__name__)
+
+
+class TaskStatus(Enum):
+    """Lifecycle states reported for a parse_to_md task (values are the stored strings)"""
+    PENDING = "pending"  # recorded by submit_task when the task is registered
+    PROCESSING = "processing"  # set by _execute_task before the parse call runs
+    SUCCESS = "success"  # parse call returned; its result is stored on the task
+    FAILED = "failed"  # parse call raised; the error text is stored on the task
+    NOT_FOUND = "not_found"  # returned by get_task_status for an unknown task_id
+
+
+class ParseToMdTaskManager:
+    """
+    Singleton task manager for parse_to_md async operations.
+    
+    Features:
+    - Thread-safe task storage
+    - Async task execution with thread pool
+    - Task status tracking
+    - Result caching (max 1000 completed tasks)
+    """
+    
+    _instance = None
+    _lock = threading.Lock()
+    
+    def __new__(cls):
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = super().__new__(cls)
+                    cls._instance._initialized = False
+        return cls._instance
+    
+    def __init__(self):
+        if self._initialized:
+            return
+        
+        self._initialized = True
+        self.tasks = {}  # task_id -> task_info
+        self.tasks_lock = threading.Lock()
+        
+        # Thread pool for async execution (max 4 concurrent tasks)
+        self.executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="parse_to_md_worker")
+        
+        # Max cached completed tasks (to prevent memory leak)
+        self.max_cached_tasks = 1000
+        
+        logger.info("ParseToMdTaskManager initialized")
+    
+    def submit_task(
+        self,
+        service,
+        method_name: str,
+        **kwargs
+    ) -> str:
+        """
+        Submit a parse_to_md task for async execution
+        
+        Args:
+            service: The ParseService instance
+            method_name: Method name to call ("parse_to_md" or "parse_to_md_upload")
+            **kwargs: Arguments to pass to the method
+        
+        Returns:
+            task_id: Unique task identifier
+        """
+        task_id = str(uuid.uuid4())
+        
+        with self.tasks_lock:
+            # Clean up old tasks if needed
+            if len(self.tasks) > self.max_cached_tasks:
+                self._cleanup_old_tasks()
+            
+            # Create task info (one timestamp so created_at == updated_at initially)
+            now = datetime.now().isoformat()
+            self.tasks[task_id] = {
+                "task_id": task_id,
+                "status": TaskStatus.PENDING.value,
+                "created_at": now,
+                "updated_at": now,
+                "method": method_name,
+                # Keep call metadata only; caching upload bytes would pin them in memory
+                "kwargs": {k: v for k, v in kwargs.items() if k != "binary"},
+                "result": None,
+                "error": None
+            }
+        
+        # Submit to thread pool
+        self.executor.submit(self._execute_task, task_id, service, method_name, kwargs)
+        
+        logger.info(f"Task {task_id} submitted for {method_name}")
+        
+        return task_id
+    
+    def _execute_task(
+        self,
+        task_id: str,
+        service,
+        method_name: str,
+        kwargs: Dict[str, Any]
+    ):
+        """
+        Execute the parse task in background thread
+        Failures are logged and stored on the task (status FAILED) instead of raised.
+        
+        Args:
+            task_id: Task ID
+            service: ParseService instance
+            method_name: Method to call
+            kwargs: Method arguments
+        """
+        try:
+            # Update status to processing
+            self._update_task_status(task_id, TaskStatus.PROCESSING)
+            
+            # Both supported entry points run through the same service wrapper
+            if method_name not in ("parse_to_md", "parse_to_md_upload"):
+                raise ValueError(f"Unknown method: {method_name}")
+            result = service._parse_to_markdown_for_task(**kwargs)
+            
+            # Update with success result
+            with self.tasks_lock:
+                if task_id in self.tasks:
+                    self.tasks[task_id].update({
+                        "status": TaskStatus.SUCCESS.value,
+                        "updated_at": datetime.now().isoformat(),
+                        "result": result
+                    })
+            
+            logger.info(f"Task {task_id} completed successfully")
+            
+        except Exception as e:
+            # Update with error
+            logger.error(f"Task {task_id} failed: {e}", exc_info=True)
+            
+            with self.tasks_lock:
+                if task_id in self.tasks:
+                    self.tasks[task_id].update({
+                        "status": TaskStatus.FAILED.value,
+                        "updated_at": datetime.now().isoformat(),
+                        "error": str(e)
+                    })
+    
+    def get_task_status(self, task_id: str) -> Dict[str, Any]:
+        """
+        Get task status and result
+        
+        Args:
+            task_id: Task ID
+        
+        Returns:
+            Task information dict
+        """
+        with self.tasks_lock:
+            task = self.tasks.get(task_id)
+            
+            if not task:
+                return {
+                    "task_id": task_id,
+                    "status": TaskStatus.NOT_FOUND.value
+                }
+            
+            # Return a copy to avoid external modifications
+            return {
+                "task_id": task["task_id"],
+                "status": task["status"],
+                "created_at": task["created_at"],
+                "updated_at": task["updated_at"],
+                "result": task.get("result"),
+                "error": task.get("error")
+            }
+    
+    def _update_task_status(self, task_id: str, status: TaskStatus):
+        """Update task status"""
+        with self.tasks_lock:
+            if task_id in self.tasks:
+                self.tasks[task_id].update({
+                    "status": status.value,
+                    "updated_at": datetime.now().isoformat()
+                })
+    
+    def _cleanup_old_tasks(self):
+        """
+        Clean up old completed/failed tasks to prevent memory leak.
+        Keeps only the most recent tasks. Called with ``tasks_lock`` already held.
+        Does nothing when no task has finished yet (pending/processing are kept).
+        """
+        # Get completed/failed tasks
+        completed_tasks = [
+            (tid, t["updated_at"])
+            for tid, t in self.tasks.items()
+            if t["status"] in [TaskStatus.SUCCESS.value, TaskStatus.FAILED.value]
+        ]
+        
+        # Sort by updated_at (oldest first)
+        completed_tasks.sort(key=lambda x: x[1])
+        
+        # Remove oldest 20% of tasks (at least one); slicing is safe on an empty list
+        num_to_remove = max(1, len(completed_tasks) // 5)
+        for task_id, _ in completed_tasks[:num_to_remove]:
+            del self.tasks[task_id]
+            logger.debug(f"Cleaned up old task {task_id}")
+    
+    def shutdown(self):
+        """Shutdown the task manager and thread pool"""
+        logger.info("Shutting down ParseToMdTaskManager")
+        self.executor.shutdown(wait=True)
+
+
+# Singleton instance
+_task_manager = None
+
+
+def get_task_manager() -> ParseToMdTaskManager:
+    """Return the module-wide ParseToMdTaskManager, creating it on first use"""
+    global _task_manager
+    # Reuse the cached manager when there is one; otherwise build and remember it
+    _task_manager = _task_manager or ParseToMdTaskManager()
+    return _task_manager
+
diff --git a/powerrag/server/services/split_service.py b/powerrag/server/services/split_service.py
index 5e4e45eee..e4b535938 100644
--- a/powerrag/server/services/split_service.py
+++ b/powerrag/server/services/split_service.py
@@ -55,7 +55,7 @@ def _init_chunker_factory(self):
         """动态导入chunker模块，避免循环导入"""
         global CHUNKER_FACTORY
         if not CHUNKER_FACTORY:
-            global regex_based_chunking, title_based_chunking, smart_based_chunking
+            # 直接引用同一模块中定义的函数
             CHUNKER_FACTORY.update({
                 ParserType.TITLE.value: title_based_chunking,  # PowerRAG Title Chunker
                 ParserType.REGEX.value: regex_based_chunking,  # PowerRAG regex Chunker

From f8d2bd59d4e8121c94dbb9528c57d6dac902e79a Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Mon, 5 Jan 2026 21:24:18 +0800
Subject: [PATCH 02/19] feat: add GitHub Actions workflow for Python package
 publishing and initial SDK configuration

---
 .github/workflows/python-publish.yml | 71 ++++++++++++++++++++++++++++
 powerrag/sdk/MANIFEST.in             |  6 +++
 powerrag/sdk/README.md               |  2 +-
 powerrag/sdk/pyproject.toml          | 45 ++++++++++++++++++
 4 files changed, 123 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/python-publish.yml
 create mode 100644 powerrag/sdk/MANIFEST.in
 create mode 100644 powerrag/sdk/pyproject.toml

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 000000000..1ebf223ca
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,71 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  release-build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.10'
+    
+    - name: Install Poetry
+      run: |
+        pip install poetry
+    
+    - name: Set version from release tag
+      working-directory: ./powerrag/sdk
+      run: |
+        # Extract version from tag (e.g., sdk-v0.1.0 -> 0.1.0 or v0.1.0 -> 0.1.0)
+        VERSION="${{ github.ref_name }}"
+        VERSION="${VERSION#v}"
+        echo "Setting version to: $VERSION"
+        poetry version "$VERSION"
+        cat pyproject.toml | grep "^version"
+    
+    - name: Install dependencies
+      working-directory: ./powerrag/sdk
+      run: |
+        poetry install
+    
+    - name: Build package
+      working-directory: ./powerrag/sdk
+      run: |
+        poetry build
+
+    - name: Upload distributions
+      uses: actions/upload-artifact@v4
+      with:
+        name: release-dists
+        path: powerrag/sdk/dist/
+
+  pypi-publish:
+    runs-on: ubuntu-latest
+    needs:
+      - release-build
+    permissions:
+      id-token: write
+
+    steps:
+      - name: Retrieve release distributions
+        uses: actions/download-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+      - name: Publish release distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+
diff --git a/powerrag/sdk/MANIFEST.in b/powerrag/sdk/MANIFEST.in
new file mode 100644
index 000000000..fc8223e29
--- /dev/null
+++ b/powerrag/sdk/MANIFEST.in
@@ -0,0 +1,6 @@
+include README.md
+include ../../LICENSE
+recursive-include powerrag/sdk *.py
+recursive-exclude powerrag/sdk/__pycache__ *
+recursive-exclude powerrag/sdk/tests *
+
diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md
index 81b69be76..0fac6a2ce 100644
--- a/powerrag/sdk/README.md
+++ b/powerrag/sdk/README.md
@@ -43,7 +43,7 @@ pip install -e .
 
 ### 依赖要求
 
-- Python 3.8+
+- Python 3.10+
 - requests >= 2.28.0
 - typing-extensions (Python < 3.11)
 
diff --git a/powerrag/sdk/pyproject.toml b/powerrag/sdk/pyproject.toml
new file mode 100644
index 000000000..89943ba45
--- /dev/null
+++ b/powerrag/sdk/pyproject.toml
@@ -0,0 +1,45 @@
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry]
+name = "powerrag-sdk"
+version = "0.0.0"  # Version will be set automatically from GitHub release tag
+description = "A Python SDK for PowerRAG API, providing easy-to-use interfaces for knowledge base management, document processing, chunking, extraction, RAPTOR, knowledge graph, and retrieval."
+authors = ["OceanBase Team <contact@oceanbase.com>"]
+license = "Apache-2.0"
+readme = "README.md"
+homepage = "https://github.com/oceanbase/powerrag"
+repository = "https://github.com/oceanbase/powerrag"
+documentation = "https://github.com/oceanbase/powerrag/docs"
+keywords = ["powerrag", "rag", "llm", "sdk", "knowledge-base", "document-processing"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+
+# Include SDK modules using relative path from repository root
+packages = [
+    { include = "powerrag/sdk", from = "../.." },
+]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<4.0"
+requests = ">=2.28.0"
+typing-extensions = { version = ">=4.0.0", python = "<3.11" }
+
+[tool.poetry.group.dev.dependencies]
+pytest = ">=8.3.5"
+requests-toolbelt = ">=1.0.0"
+
+[tool.poetry.urls]
+"Bug Tracker" = "https://github.com/oceanbase/powerrag/issues"
+"Documentation" = "https://github.com/oceanbase/powerrag/docs"
+"Source Code" = "https://github.com/oceanbase/powerrag"

From 35ba7e221fd46f06fbc2b1fe73fb802e5b5e5695 Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Tue, 6 Jan 2026 11:05:30 +0800
Subject: [PATCH 03/19] chore: update GitHub Actions workflow for SDK
 publishing and refine package configuration

---
 .github/workflows/python-publish.yml | 15 ++++++++-------
 .gitignore                           |  3 +++
 powerrag/sdk/MANIFEST.in             |  6 ------
 powerrag/sdk/pyproject.toml          | 15 +++++++++++++--
 4 files changed, 24 insertions(+), 15 deletions(-)
 delete mode 100644 powerrag/sdk/MANIFEST.in

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 1ebf223ca..54eed1288 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -26,10 +26,13 @@ jobs:
       run: |
         pip install poetry
     
-    - name: Set version from release tag
-      working-directory: ./powerrag/sdk
+    - name: Copy pyproject.toml to root and set version
       run: |
-        # Extract version from tag (e.g., sdk-v0.1.0 -> 0.1.0 or v0.1.0 -> 0.1.0)
+        # Copy SDK pyproject.toml to repository root for building
+        cp powerrag/sdk/pyproject.toml .
+        cp powerrag/sdk/README.md .
+        
+        # Extract version from tag (e.g., v0.1.0 -> 0.1.0)
         VERSION="${{ github.ref_name }}"
         VERSION="${VERSION#v}"
         echo "Setting version to: $VERSION"
@@ -37,12 +40,10 @@ jobs:
         cat pyproject.toml | grep "^version"
     
     - name: Install dependencies
-      working-directory: ./powerrag/sdk
       run: |
-        poetry install
+        poetry install --only main
     
     - name: Build package
-      working-directory: ./powerrag/sdk
       run: |
         poetry build
 
@@ -50,7 +51,7 @@ jobs:
       uses: actions/upload-artifact@v4
       with:
         name: release-dists
-        path: powerrag/sdk/dist/
+        path: dist/
 
   pypi-publish:
     runs-on: ubuntu-latest
diff --git a/.gitignore b/.gitignore
index 1e4b1642b..3fbd4dea9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,9 @@ api/flask_session
 # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 Cargo.lock
 
+# Poetry lock file - exclude for library/SDK projects (similar to Cargo.lock for libraries)
+powerrag/sdk/poetry.lock
+
 # These are backup files generated by rustfmt
 **/*.rs.bk
 
diff --git a/powerrag/sdk/MANIFEST.in b/powerrag/sdk/MANIFEST.in
deleted file mode 100644
index fc8223e29..000000000
--- a/powerrag/sdk/MANIFEST.in
+++ /dev/null
@@ -1,6 +0,0 @@
-include README.md
-include ../../LICENSE
-recursive-include powerrag/sdk *.py
-recursive-exclude powerrag/sdk/__pycache__ *
-recursive-exclude powerrag/sdk/tests *
-
diff --git a/powerrag/sdk/pyproject.toml b/powerrag/sdk/pyproject.toml
index 89943ba45..219bb3585 100644
--- a/powerrag/sdk/pyproject.toml
+++ b/powerrag/sdk/pyproject.toml
@@ -25,9 +25,20 @@ classifiers = [
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
 
-# Include SDK modules using relative path from repository root
+# Package configuration
+# IMPORTANT: Build must be run from repository root (not from powerrag/sdk/)
+# This ensures the package path 'powerrag.sdk' is correctly resolved
+# Build workflow: GitHub Action copies this file to root and runs poetry build
+# Import path: from powerrag.sdk import PowerRAGClient
 packages = [
-    { include = "powerrag/sdk", from = "../.." },
+    { include = "powerrag/sdk" },
+]
+
+# Exclude patterns (tests inside the package are not excluded automatically, so list them explicitly)
+exclude = [
+    "powerrag/sdk/tests",
+    "powerrag/sdk/**/__pycache__",
+    "powerrag/sdk/**/*.pyc",
 ]
 
 [tool.poetry.dependencies]

From 632532887591f4d5beb0f1df1d6a44fe8759e320 Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Tue, 6 Jan 2026 11:07:02 +0800
Subject: [PATCH 04/19] chore: update Python version requirement in
 pyproject.toml to support 3.10

---
 powerrag/sdk/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/powerrag/sdk/pyproject.toml b/powerrag/sdk/pyproject.toml
index 219bb3585..dd0737693 100644
--- a/powerrag/sdk/pyproject.toml
+++ b/powerrag/sdk/pyproject.toml
@@ -42,7 +42,7 @@ exclude = [
 ]
 
 [tool.poetry.dependencies]
-python = ">=3.9,<4.0"
+python = ">=3.10,<4.0"
 requests = ">=2.28.0"
 typing-extensions = { version = ">=4.0.0", python = "<3.11" }
 

From 79d1294d5ec5ea79608ba96b55cc2b20c118eb15 Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Tue, 6 Jan 2026 11:15:45 +0800
Subject: [PATCH 05/19] chore: add environment configuration for PyPI in GitHub
 Actions workflow

---
 .github/workflows/python-publish.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 54eed1288..c53b42646 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -59,6 +59,9 @@ jobs:
       - release-build
     permissions:
       id-token: write
+    environment:
+      name: pypi
+      url: https://pypi.org/project/powerrag-sdk/
 
     steps:
       - name: Retrieve release distributions

From 16a0245a546e788443ae707e8cc7acba83c79aa5 Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Tue, 6 Jan 2026 16:52:53 +0800
Subject: [PATCH 06/19] docs: update SDK README.md

---
 powerrag/sdk/README.md | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md
index 0fac6a2ce..d2c470b87 100644
--- a/powerrag/sdk/README.md
+++ b/powerrag/sdk/README.md
@@ -8,39 +8,19 @@ PowerRAG SDK 是一个功能强大的 Python SDK，为 PowerRAG API 提供了简
 ## 特性
 
 - 🚀 **简单易用**: 面向对象的 API 设计，直观的方法调用
-- 📚 **完整功能**: 支持 PowerRAG 所有核心功能模块
+- 📚 **完整功能**: 支持 PowerRAG 所有核心功能模块，包括文档上传/解析/切片/提取/Raptor构建/知识库graph构建
 - 🔄 **异步支持**: 支持异步任务的状态查询和轮询等待
 - 📦 **批量操作**: 支持批量上传、删除、抽取等操作
 - 📝 **Markdown 解析**: 支持文档解析为 Markdown 格式（同步/异步）
-- 🎯 **类型提示**: 完整的类型注解，IDE 友好
-- ✅ **全面测试**: 包含完整的测试用例
 
 ## 安装
 
-### 方式 1: 使用 pip（推荐）
+### 使用 pip
 
 ```bash
 pip install powerrag-sdk
 ```
 
-### 方式 2: 从源码安装
-
-```bash
-git clone https://github.com/oceanbase/powerrag.git
-cd powerrag
-pip install -e .
-```
-
-### 方式 3: 仅安装 SDK 模块
-
-如果你只需要 SDK 功能：
-
-```bash
-git clone https://github.com/oceanbase/powerrag.git
-cd powerrag/powerrag/sdk
-pip install -e .
-```
-
 ### 依赖要求
 
 - Python 3.10+

From f15d0ef4f3ae8af9c15da73811604ecd7b7f862d Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Mon, 12 Jan 2026 21:07:45 +0800
Subject: [PATCH 07/19] feat(document): add binary file parsing to Markdown
 method

---
 powerrag/__init__.py                     |  21 ---
 powerrag/sdk/modules/document_manager.py |  76 +++++++++++
 powerrag/sdk/tests/test_document.py      | 162 +++++++++++++++++++++++
 3 files changed, 238 insertions(+), 21 deletions(-)

diff --git a/powerrag/__init__.py b/powerrag/__init__.py
index 1ea6623ac..e48fb89ee 100644
--- a/powerrag/__init__.py
+++ b/powerrag/__init__.py
@@ -21,27 +21,6 @@
 that can be integrated into the RAGFlow pipeline system.
 """
 
-# Import all PowerRAG components for pipeline integration
-from .flow.parsers.powerrag_parsers import (
-    PDFParser,
-    PDFParserParam,
-)
-
-from .flow.splitters.powerrag_splitters import (
-    TitleBasedSplitter,
-    TitleBasedSplitterParam,
-)
-
-from .flow.extractors.powerrag_extractors import (
-    EntityExtractor,
-    EntityExtractorParam,
-)
-
-from .flow.converters.powerrag_converters import (
-    DocumentToPDF,
-    DocumentToPDFParam,
-)
-
 # Export all components for pipeline registration
 __all__ = [
     # Parsers
diff --git a/powerrag/sdk/modules/document_manager.py b/powerrag/sdk/modules/document_manager.py
index 34971524a..c9978092f 100644
--- a/powerrag/sdk/modules/document_manager.py
+++ b/powerrag/sdk/modules/document_manager.py
@@ -681,6 +681,82 @@ def parse_to_md_upload(
         
         return res_json.get("data", {})
     
+    def parse_to_md_binary(
+        self,
+        file_binary: bytes,
+        filename: str,
+        config: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """
+        直接使用文件二进制内容解析为Markdown（不切分）
+        
+        使用文件二进制数据解析为 Markdown 格式，不进行切分。
+        适用于文件已在内存中或从其他来源获取的场景。
+        
+        支持的文件格式:
+        - PDF (.pdf)
+        - Office 文档 (.doc, .docx, .ppt, .pptx)
+        - 图片 (.jpg, .png)
+        - HTML (.html, .htm)
+        
+        Args:
+            file_binary: 文件的二进制内容
+            filename: 文件名（必须包含正确的扩展名）
+            config: 解析配置（可选），同 parse_to_md
+                - layout_recognize: 布局识别引擎 (mineru 或 dots_ocr，默认 mineru)
+                - enable_formula: 是否识别公式 (默认 False)
+                - enable_table: 是否识别表格 (默认 True)
+                - from_page: 起始页（仅 PDF，默认 0）
+                - to_page: 结束页（仅 PDF，默认 100000）
+        
+        Returns:
+            解析结果字典，包含以下字段：
+            - filename: 文件名
+            - markdown: Markdown 内容
+            - markdown_length: Markdown 长度
+            - images: 图片字典 (base64)
+            - total_images: 图片总数
+        
+        Raises:
+            ValueError: 文件名或二进制数据无效
+            Exception: API调用失败
+        
+        Example:
+            >>> with open("document.pdf", "rb") as f:
+            ...     file_binary = f.read()
+            >>> result = doc_manager.parse_to_md_binary(
+            ...     file_binary=file_binary,
+            ...     filename="document.pdf",
+            ...     config={"layout_recognize": "mineru", "enable_ocr": True}
+            ... )
+            >>> print(result['markdown'])
+            >>> print(f"Parsed {result['total_images']} images")
+        """
+        if not file_binary:
+            raise ValueError("file_binary cannot be empty")
+        if not filename:
+            raise ValueError("filename cannot be empty")
+        
+        # Prepare files from binary data
+        files = [("file", (filename, file_binary))]
+        
+        # Prepare form data
+        import json
+        form_data = {}
+        if config:
+            form_data["config"] = json.dumps(config)
+        
+        url = "/powerrag/parse_to_md/upload"
+        res = self.client.post(url, json=None, files=files, data=form_data)
+        
+        # Parse JSON response
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Parse to markdown (binary) failed"))
+        
+        return res_json.get("data", {})
+    
     def parse_url(
         self,
         kb_id: str,
diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py
index 3aa1e098b..fad327cd4 100644
--- a/powerrag/sdk/tests/test_document.py
+++ b/powerrag/sdk/tests/test_document.py
@@ -426,5 +426,167 @@ def test_parse_to_md_upload_different_formats(self, client: PowerRAGClient, test
         # 这里我们只测试 txt 文件，实际使用时可以添加更多格式
         result = client.document.parse_to_md_upload(test_file_path)
         
+        assert "markdown" in result
+        assert result["markdown_length"] > 0
+
+
+class TestDocumentParseToMDBinary:
+    """Tests for parsing in-memory file bytes to Markdown via parse_to_md_binary."""
+    
+    def test_parse_to_md_binary_basic(self, client: PowerRAGClient, test_file_path: str):
+        """Basic binary parsing returns every documented result field."""
+        # Read the fixture file as raw bytes
+        with open(test_file_path, "rb") as f:
+            file_binary = f.read()
+        
+        # Parse from the in-memory bytes
+        result = client.document.parse_to_md_binary(
+            file_binary=file_binary,
+            filename="test_document.html"
+        )
+        
+        # Check the shape of the returned result
+        assert "filename" in result
+        assert "markdown" in result
+        assert "markdown_length" in result
+        assert "images" in result
+        assert "total_images" in result
+        assert isinstance(result["markdown"], str)
+        assert result["markdown_length"] > 0
+    
+    def test_parse_to_md_binary_with_config(self, client: PowerRAGClient, test_file_path: str):
+        """Binary parsing accepts an explicit config dict."""
+        with open(test_file_path, "rb") as f:
+            file_binary = f.read()
+        
+        config = {
+            "layout_recognize": "mineru",
+            "enable_ocr": False,
+            "enable_table": True
+        }
+        
+        result = client.document.parse_to_md_binary(
+            file_binary=file_binary,
+            filename="test_document.html",
+            config=config
+        )
+        
+        assert "markdown" in result
+        assert len(result["markdown"]) > 0
+        assert result["markdown_length"] > 0
+    
+    def test_parse_to_md_binary_empty_content(self, client: PowerRAGClient):
+        """Empty file_binary is rejected with ValueError."""
+        with pytest.raises(ValueError) as exc_info:
+            client.document.parse_to_md_binary(
+                file_binary=b"",
+                filename="test.html"
+            )
+        
+        assert "cannot be empty" in str(exc_info.value).lower()
+    
+    def test_parse_to_md_binary_empty_filename(self, client: PowerRAGClient):
+        """Empty filename is rejected with ValueError."""
+        with pytest.raises(ValueError) as exc_info:
+            client.document.parse_to_md_binary(
+                file_binary=b"test content",
+                filename=""
+            )
+        
+        assert "cannot be empty" in str(exc_info.value).lower()
+    
+    def test_parse_to_md_binary_different_file_types(self, client: PowerRAGClient, tmp_path):
+        """Binary parsing of a file generated on the fly (only HTML is exercised here)."""
+        # HTML file
+        html_file = tmp_path / "test.html"
+        html_content = "<html><body><h1>Test</h1><p>Content</p></body></html>"
+        html_file.write_text(html_content)
+        
+        with open(html_file, "rb") as f:
+            file_binary = f.read()
+        
+        result = client.document.parse_to_md_binary(
+            file_binary=file_binary,
+            filename="test.html"
+        )
+        
+        assert "markdown" in result
+        assert result["markdown_length"] > 0
+    
+    def test_parse_to_md_binary_with_images(self, client: PowerRAGClient, test_file_path: str):
+        """Binary parsing reports the image fields (images dict, total_images int)."""
+        with open(test_file_path, "rb") as f:
+            file_binary = f.read()
+        
+        result = client.document.parse_to_md_binary(
+            file_binary=file_binary,
+            filename="test_document.html"
+        )
+        
+        # Check the image-related fields
+        assert "images" in result
+        assert "total_images" in result
+        assert isinstance(result["images"], dict)
+        assert isinstance(result["total_images"], int)
+        assert result["total_images"] >= 0
+    
+    def test_parse_to_md_binary_filename_with_extension(self, client: PowerRAGClient, test_file_path: str):
+        """A filename with a valid extension is parsed and echoed back in the result."""
+        with open(test_file_path, "rb") as f:
+            file_binary = f.read()
+        
+        # Filename carrying a correct extension
+        result = client.document.parse_to_md_binary(
+            file_binary=file_binary,
+            filename="document.html"
+        )
+        
+        assert result["filename"] == "document.html"
+        assert "markdown" in result
+    
+    def test_parse_to_md_binary_large_file(self, client: PowerRAGClient, tmp_path):
+        """Binary parsing of a larger generated HTML file."""
+        # Build a relatively large test file
+        large_file = tmp_path / "large_test.html"
+        large_content = "<html><body>" + "<p>Test paragraph.</p>" * 1000 + "</body></html>"
+        large_file.write_text(large_content)
+        
+        with open(large_file, "rb") as f:
+            file_binary = f.read()
+        
+        result = client.document.parse_to_md_binary(
+            file_binary=file_binary,
+            filename="large_test.html"
+        )
+        
+        assert "markdown" in result
+        assert result["markdown_length"] > 0
+        # The markdown should be reasonably long
+        assert len(result["markdown"]) > 1000
+    
+    def test_parse_to_md_binary_utf8_content(self, client: PowerRAGClient, tmp_path):
+        """Binary parsing of a file containing UTF-8 (non-ASCII) characters."""
+        utf8_file = tmp_path / "utf8_test.html"
+        utf8_content = """
+        <!DOCTYPE html>
+        <html>
+        <head><meta charset="UTF-8"><title>UTF-8测试</title></head>
+        <body>
+            <h1>中文标题</h1>
+            <p>这是中文内容。</p>
+            <p>English content with special chars: é, ñ, ü</p>
+        </body>
+        </html>
+        """
+        utf8_file.write_text(utf8_content, encoding="utf-8")
+        
+        with open(utf8_file, "rb") as f:
+            file_binary = f.read()
+        
+        result = client.document.parse_to_md_binary(
+            file_binary=file_binary,
+            filename="utf8_test.html"
+        )
+        
         assert "markdown" in result
         assert result["markdown_length"] > 0
\ No newline at end of file

From 7732022c323183dd4438b2d052e47aa6c3a258a0 Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Tue, 13 Jan 2026 14:25:28 +0800
Subject: [PATCH 08/19] refactor(document_manager): centralize parse to
 markdown upload logic

---
 powerrag/sdk/modules/document_manager.py | 82 +++++++++++++-----------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/powerrag/sdk/modules/document_manager.py b/powerrag/sdk/modules/document_manager.py
index c9978092f..f6bea1f6a 100644
--- a/powerrag/sdk/modules/document_manager.py
+++ b/powerrag/sdk/modules/document_manager.py
@@ -17,6 +17,7 @@
 from typing import Optional, List, Dict, Any, Union
 from pathlib import Path
 from .document import DocumentInfo
+import json
 
 
 class DocumentManager:
@@ -615,6 +616,45 @@ def parse_to_md(
         
         return res_json.get("data", {})
     
+    def _parse_to_md_with_binary(
+        self,
+        file_binary: bytes,
+        filename: str,
+        config: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Internal helper method to parse file binary to Markdown
+        
+        Args:
+            file_binary: Binary content of the file
+            filename: Name of the file (must include correct extension)
+            config: Parse configuration (optional)
+        
+        Returns:
+            Parse result dictionary
+        
+        Raises:
+            Exception: API call failed
+        """
+        # Prepare files from binary data
+        files = [("file", (filename, file_binary))]
+        
+        # Prepare form data
+        form_data = {}
+        if config:
+            form_data["config"] = json.dumps(config)
+        
+        url = "/powerrag/parse_to_md/upload"
+        res = self.client.post(url, json=None, files=files, data=form_data)
+        
+        # Parse JSON response
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Parse to markdown failed"))
+        
+        return res_json.get("data", {})
+    
     def parse_to_md_upload(
         self,
         file_path: str,
@@ -660,26 +700,11 @@ def parse_to_md_upload(
         if not path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
         
-        # Prepare files
+        # Read file and delegate to helper method
         with open(path, "rb") as f:
-            files = [("file", (path.name, f.read()))]
+            file_binary = f.read()
         
-        # Prepare form data
-        import json
-        form_data = {}
-        if config:
-            form_data["config"] = json.dumps(config)
-        
-        url = "/powerrag/parse_to_md/upload"
-        res = self.client.post(url, json=None, files=files, data=form_data)
-        
-        # Parse JSON response
-        res_json = res.json()
-        
-        if res_json.get("code") != 0:
-            raise Exception(res_json.get("message", "Parse to markdown (upload) failed"))
-        
-        return res_json.get("data", {})
+        return self._parse_to_md_with_binary(file_binary, path.name, config)
     
     def parse_to_md_binary(
         self,
@@ -737,25 +762,8 @@ def parse_to_md_binary(
         if not filename:
             raise ValueError("filename cannot be empty")
         
-        # Prepare files from binary data
-        files = [("file", (filename, file_binary))]
-        
-        # Prepare form data
-        import json
-        form_data = {}
-        if config:
-            form_data["config"] = json.dumps(config)
-        
-        url = "/powerrag/parse_to_md/upload"
-        res = self.client.post(url, json=None, files=files, data=form_data)
-        
-        # Parse JSON response
-        res_json = res.json()
-        
-        if res_json.get("code") != 0:
-            raise Exception(res_json.get("message", "Parse to markdown (binary) failed"))
-        
-        return res_json.get("data", {})
+        # Delegate to helper method
+        return self._parse_to_md_with_binary(file_binary, filename, config)
     
     def parse_url(
         self,

From 33a05200d2c95c48490df20a08d9fd7cf45738d4 Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Tue, 13 Jan 2026 15:40:04 +0800
Subject: [PATCH 09/19] refactor(init): remove module docstring and __all__
 exports

---
 powerrag/__init__.py | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/powerrag/__init__.py b/powerrag/__init__.py
index e48fb89ee..a1a24464a 100644
--- a/powerrag/__init__.py
+++ b/powerrag/__init__.py
@@ -13,29 +13,3 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-
-"""
-PowerRAG - Advanced RAG Components for RAGFlow Pipeline
-
-This module provides advanced parsing, splitting, and extraction components
-that can be integrated into the RAGFlow pipeline system.
-"""
-
-# Export all components for pipeline registration
-__all__ = [
-    # Parsers
-    "PDFParser",
-    "PDFParserParam",
-    
-    # Splitters
-    "TitleBasedSplitter",
-    "TitleBasedSplitterParam",
-    
-    # Extractors
-    "EntityExtractor",
-    "EntityExtractorParam",
-    
-    # Converters
-    "DocumentToPDF",
-    "DocumentToPDFParam",
-]
\ No newline at end of file

From 399d17ae14a2c517e93574c9aba8bd0c4039830a Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Tue, 20 Jan 2026 19:59:02 +0800
Subject: [PATCH 10/19] chore(docker): add GOTENBERG server environment
 variables

---
 docker/.env.example | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docker/.env.example b/docker/.env.example
index ac71518a9..08a832e49 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -61,8 +61,12 @@ MINERU_BACKEND="pipeline"
 # MinerU VLM server url, required when backend is 'vlm-http-client'
 MINERU_VLM_URL=
 
+# GOTENBERG server host
+GOTENBERG_HOST="gotenberg"
+# GOTENBERG server port
+GOTENBERG_PORT=3000
 # GOTENBERG server url
-GOTENBERG_URL=
+GOTENBERG_URL=http://${GOTENBERG_HOST}:${GOTENBERG_PORT}
 # dots.ocr server url
 DOTS_OCR_URL=
 

From f28c27213421005b839034e9db8b3f5c1fa6231c Mon Sep 17 00:00:00 2001
From: He Wang <wanghechn@qq.com>
Date: Wed, 21 Jan 2026 15:56:30 +0800
Subject: [PATCH 11/19] Update docker/.env.example

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 docker/.env.example | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/.env.example b/docker/.env.example
index 08a832e49..a6968b96c 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -62,7 +62,7 @@ MINERU_BACKEND="pipeline"
 MINERU_VLM_URL=
 
 # GOTENBERG server host
-GOTENBERG_HOST="gotenberg"
+GOTENBERG_HOST=gotenberg
 # GOTENBERG server port
 GOTENBERG_PORT=3000
 # GOTENBERG server url

From e9780edb40a3db6af0bf9f740510fd63c4fa4015 Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Fri, 23 Jan 2026 14:21:16 +0800
Subject: [PATCH 12/19] feat(document): add input_type parameter for file type
 detection

---
 powerrag/sdk/modules/document_manager.py  |  69 +++++++--
 powerrag/sdk/tests/.env.example           |   8 +
 powerrag/sdk/tests/conftest.py            |  14 +-
 powerrag/sdk/tests/pytest.ini             |  11 ++
 powerrag/sdk/tests/test_document.py       | 177 ++++++++++++++++++++++
 powerrag/server/routes/powerrag_routes.py |  80 +++++++---
 powerrag/server/services/parse_service.py |  72 ++++++---
 powerrag/utils/file_utils.py              |  65 ++++++++
 8 files changed, 437 insertions(+), 59 deletions(-)
 create mode 100644 powerrag/sdk/tests/.env.example

diff --git a/powerrag/sdk/modules/document_manager.py b/powerrag/sdk/modules/document_manager.py
index f6bea1f6a..08b58fea4 100644
--- a/powerrag/sdk/modules/document_manager.py
+++ b/powerrag/sdk/modules/document_manager.py
@@ -621,14 +621,18 @@ def _parse_to_md_with_binary(
         file_binary: bytes,
         filename: str,
         config: Optional[Dict[str, Any]] = None,
+        input_type: str = 'auto',
     ) -> Dict[str, Any]:
         """
         Internal helper method to parse file binary to Markdown
         
         Args:
             file_binary: Binary content of the file
-            filename: Name of the file (must include correct extension)
+            filename: Name of the file
             config: Parse configuration (optional)
+            input_type: File type detection mode (default: 'auto'). Can be:
+                - 'auto': Use the filename extension when present and supported; otherwise detect the type from the binary content
+                - 'pdf', 'office', 'html', 'image': Explicit file type (bypasses detection)
         
         Returns:
             Parse result dictionary
@@ -642,7 +646,16 @@ def _parse_to_md_with_binary(
         # Prepare form data
         form_data = {}
         if config:
-            form_data["config"] = json.dumps(config)
+            # Add input_type to config if it's not 'auto' (since 'auto' is the default)
+            if input_type != 'auto':
+                config_copy = config.copy()
+                config_copy['input_type'] = input_type
+                form_data["config"] = json.dumps(config_copy)
+            else:
+                form_data["config"] = json.dumps(config)
+        elif input_type != 'auto':
+            # Create config with just input_type if not default
+            form_data["config"] = json.dumps({"input_type": input_type})
         
         url = "/powerrag/parse_to_md/upload"
         res = self.client.post(url, json=None, files=files, data=form_data)
@@ -659,6 +672,7 @@ def parse_to_md_upload(
         self,
         file_path: str,
         config: Optional[Dict[str, Any]] = None,
+        input_type: str = 'auto',
     ) -> Dict[str, Any]:
         """
         上传文件并解析为Markdown（不切分）
@@ -675,6 +689,9 @@ def parse_to_md_upload(
         Args:
             file_path: 文件路径
             config: 解析配置（可选），同 parse_to_md
+            input_type: 文件类型识别模式（默认: 'auto'），支持：
+                - 'auto': 优先使用文件扩展名，无扩展名或不支持时自动识别（默认）
+                - 'pdf', 'office', 'html', 'image': 显式指定文件类型（跳过识别）
         
         Returns:
             解析结果字典，包含以下字段：
@@ -689,12 +706,24 @@ def parse_to_md_upload(
             Exception: API调用失败
         
         Example:
+            >>> # 默认使用扩展名识别（推荐）
             >>> result = doc_manager.parse_to_md_upload(
-            ...     file_path="document.pdf",
-            ...     config={"layout_recognize": "mineru"}
+            ...     file_path="document.pdf"
+            ... )
+            >>> print(result['markdown'])
+            >>> 
+            >>> # 对于无扩展名文件，input_type='auto' 会自动从二进制内容识别
+            >>> result = doc_manager.parse_to_md_upload(
+            ...     file_path="document_no_ext"
+            ...     # input_type='auto' 是默认值，可以省略
             ... )
             >>> print(result['markdown'])
-            >>> print(f"Parsed {result['total_images']} images")
+            >>> 
+            >>> # 显式指定文件类型（跳过自动识别）
+            >>> result = doc_manager.parse_to_md_upload(
+            ...     file_path="document",
+            ...     input_type="pdf"
+            ... )
         """
         path = Path(file_path)
         if not path.exists():
@@ -704,13 +733,14 @@ def parse_to_md_upload(
         with open(path, "rb") as f:
             file_binary = f.read()
         
-        return self._parse_to_md_with_binary(file_binary, path.name, config)
+        return self._parse_to_md_with_binary(file_binary, path.name, config, input_type)
     
     def parse_to_md_binary(
         self,
         file_binary: bytes,
         filename: str,
         config: Optional[Dict[str, Any]] = None,
+        input_type: str = 'auto',
     ) -> Dict[str, Any]:
         """
         直接使用文件二进制内容解析为Markdown（不切分）
@@ -726,13 +756,16 @@ def parse_to_md_binary(
         
         Args:
             file_binary: 文件的二进制内容
-            filename: 文件名（必须包含正确的扩展名）
+            filename: 文件名
             config: 解析配置（可选），同 parse_to_md
                 - layout_recognize: 布局识别引擎 (mineru 或 dots_ocr，默认 mineru)
                 - enable_formula: 是否识别公式 (默认 False)
                 - enable_table: 是否识别表格 (默认 True)
                 - from_page: 起始页（仅 PDF，默认 0）
                 - to_page: 结束页（仅 PDF，默认 100000）
+            input_type: 文件类型识别模式（默认: 'auto'），支持：
+                - 'auto': 优先使用文件扩展名，无扩展名或不支持时自动识别（默认）
+                - 'pdf', 'office', 'html', 'image': 显式指定文件类型（跳过识别）
         
         Returns:
             解析结果字典，包含以下字段：
@@ -749,13 +782,27 @@ def parse_to_md_binary(
         Example:
             >>> with open("document.pdf", "rb") as f:
             ...     file_binary = f.read()
+            >>> # 默认使用扩展名识别（推荐）
             >>> result = doc_manager.parse_to_md_binary(
             ...     file_binary=file_binary,
-            ...     filename="document.pdf",
-            ...     config={"layout_recognize": "mineru", "enable_ocr": True}
+            ...     filename="document.pdf"
             ... )
             >>> print(result['markdown'])
-            >>> print(f"Parsed {result['total_images']} images")
+            >>> 
+            >>> # 对于无扩展名的二进制数据，input_type='auto' 会自动识别
+            >>> result = doc_manager.parse_to_md_binary(
+            ...     file_binary=file_binary,
+            ...     filename="document"  # 无扩展名
+            ...     # input_type='auto' 是默认值
+            ... )
+            >>> print(result['markdown'])
+            >>> 
+            >>> # 显式指定文件类型（跳过自动识别）
+            >>> result = doc_manager.parse_to_md_binary(
+            ...     file_binary=file_binary,
+            ...     filename="document",
+            ...     input_type="pdf"
+            ... )
         """
         if not file_binary:
             raise ValueError("file_binary cannot be empty")
@@ -763,7 +810,7 @@ def parse_to_md_binary(
             raise ValueError("filename cannot be empty")
         
         # Delegate to helper method
-        return self._parse_to_md_with_binary(file_binary, filename, config)
+        return self._parse_to_md_with_binary(file_binary, filename, config, input_type)
     
     def parse_url(
         self,
diff --git a/powerrag/sdk/tests/.env.example b/powerrag/sdk/tests/.env.example
new file mode 100644
index 000000000..63695f577
--- /dev/null
+++ b/powerrag/sdk/tests/.env.example
@@ -0,0 +1,8 @@
+# PowerRAG SDK Test Configuration
+# Copy this file to .env and update with your actual values
+
+# PowerRAG API server address
+HOST_ADDRESS=http://127.0.0.1:9390
+
+# PowerRAG API key (obtain it from the API Keys page under System Management in PowerRAG)
+POWERRAG_API_KEY=your-api-key-here
diff --git a/powerrag/sdk/tests/conftest.py b/powerrag/sdk/tests/conftest.py
index 78075d876..e9a694986 100644
--- a/powerrag/sdk/tests/conftest.py
+++ b/powerrag/sdk/tests/conftest.py
@@ -18,12 +18,20 @@
 import time
 import pytest
 from pathlib import Path
-
 from powerrag.sdk import PowerRAGClient
+from dotenv import load_dotenv
 
+load_dotenv()
 # 从环境变量获取配置
-HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9222")
-API_KEY = os.getenv("POWERRAG_API_KEY", "ragflow-MAln1FNDn9PhIcqv1axaaUT3mM-efUZ83O5LVcroe9E")
+HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9390")
+API_KEY = os.getenv("POWERRAG_API_KEY")
+
+if not API_KEY:
+    raise ValueError(
+        "POWERRAG_API_KEY environment variable is not set. "
+        "Please set it in your .env file or system environment. "
+        "Copy .env.example to .env and update with your API key."
+    )
 
 
 @pytest.fixture(scope="session")
diff --git a/powerrag/sdk/tests/pytest.ini b/powerrag/sdk/tests/pytest.ini
index 029953c9e..42140ddc4 100644
--- a/powerrag/sdk/tests/pytest.ini
+++ b/powerrag/sdk/tests/pytest.ini
@@ -18,3 +18,14 @@
 # pytest 配置文件
 # 注意：环境变量需要在运行 pytest 之前设置，或者通过 conftest.py 设置
 
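+# Per-test timeout in seconds (the `timeout` option is provided by the pytest-timeout plugin)
+timeout = 300
+
+# Verbose output with short tracebacks
+addopts = -v --tb=short
+
+# Test discovery patterns
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*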
+
diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py
index fad327cd4..90e811c54 100644
--- a/powerrag/sdk/tests/test_document.py
+++ b/powerrag/sdk/tests/test_document.py
@@ -588,5 +588,182 @@ def test_parse_to_md_binary_utf8_content(self, client: PowerRAGClient, tmp_path)
             filename="utf8_test.html"
         )
         
+        assert "markdown" in result
+        assert result["markdown_length"] > 0
+
+
+class TestDocumentInputTypeAutoDetection:
+    """Tests for the input_type parameter: 'auto' detection and explicit types."""
+    
+    def test_auto_detection_with_valid_extension(self, client: PowerRAGClient, tmp_path):
+        """With a valid extension, input_type='auto' uses the extension first."""
+        # Create an HTML file
+        html_file = tmp_path / "test.html"
+        html_content = "<html><body><h1>Test</h1><p>Content</p></body></html>"
+        html_file.write_text(html_content)
+        
+        with open(html_file, "rb") as f:
+            file_binary = f.read()
+        
+        # input_type='auto' is the default and prefers the .html extension
+        result = client.document.parse_to_md_binary(
+            file_binary=file_binary,
+            filename="test.html"
+            # input_type='auto' is the default and may be omitted
+        )
+        
+        assert "markdown" in result
+        assert result["filename"] == "test.html"
+        assert result["markdown_length"] > 0
+    
+    def test_auto_detection_without_extension_pdf(self, client: PowerRAGClient, tmp_path):
+        """PDF bytes without an extension: input_type='auto' detects the type from the binary."""
+        # Build a minimal PDF header (a real PDF may be needed for a meaningful test)
+        # Here we only produce bytes that start with the PDF magic number
+        pdf_header = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n"
+        pdf_content = pdf_header + b"1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n"
+        
+        # Use a filename without an extension
+        result = client.document.parse_to_md_binary(
+            file_binary=pdf_content,
+            filename="document_no_extension"
+            # input_type='auto' should detect PDF from the binary content
+        )
+        
+        # NOTE: this test may fail because the PDF content is incomplete
+        # A real PDF file is needed in a real environment
+        assert "filename" in result
+        assert result["filename"] == "document_no_extension"
+    
+    def test_auto_detection_without_extension_html(self, client: PowerRAGClient):
+        """HTML bytes without an extension: input_type='auto' detects the type from the binary."""
+        html_content = b"<html><body><h1>Test Title</h1><p>Test content</p></body></html>"
+        
+        # Use a filename without an extension
+        result = client.document.parse_to_md_binary(
+            file_binary=html_content,
+            filename="document_without_ext"
+            # input_type='auto' should detect HTML from the binary content
+        )
+        
+        assert "markdown" in result
+        assert result["filename"] == "document_without_ext"
+        assert result["markdown_length"] > 0
+    
+    def test_explicit_input_type_pdf(self, client: PowerRAGClient, tmp_path):
+        """Explicitly passing input_type='pdf'."""
+        # Build minimal PDF content
+        pdf_header = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n"
+        pdf_content = pdf_header + b"1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n"
+        
+        # Explicitly declare PDF even though the filename has no extension
+        result = client.document.parse_to_md_binary(
+            file_binary=pdf_content,
+            filename="document",
+            input_type="pdf"  # explicit type
+        )
+        
+        assert "filename" in result
+    
+    def test_explicit_input_type_html(self, client: PowerRAGClient):
+        """Explicitly passing input_type='html'."""
+        html_content = b"<html><body><h1>Title</h1><p>Paragraph</p></body></html>"
+        
+        # Explicitly declare the HTML type
+        result = client.document.parse_to_md_binary(
+            file_binary=html_content,
+            filename="document",
+            input_type="html"  # explicit type
+        )
+        
+        assert "markdown" in result
+        assert result["markdown_length"] > 0
+    
+    def test_parse_to_md_upload_with_auto_detection(self, client: PowerRAGClient, tmp_path):
+        """parse_to_md_upload with the default input_type='auto'."""
+        # Create a test file
+        html_file = tmp_path / "test_auto.html"
+        html_content = "<html><body><h1>Auto Detection Test</h1></body></html>"
+        html_file.write_text(html_content)
+        
+        # Rely on the default input_type='auto'
+        result = client.document.parse_to_md_upload(str(html_file))
+        
+        assert "markdown" in result
+        assert "filename" in result
+        assert result["markdown_length"] > 0
+    
+    def test_parse_to_md_upload_with_explicit_type(self, client: PowerRAGClient, tmp_path):
+        """parse_to_md_upload with an explicitly specified type."""
+        html_file = tmp_path / "test_explicit.html"
+        html_content = "<html><body><h1>Explicit Type Test</h1></body></html>"
+        html_file.write_text(html_content)
+        
+        # Specify the type explicitly
+        result = client.document.parse_to_md_upload(
+            str(html_file),
+            input_type="html"
+        )
+        
+        assert "markdown" in result
+        assert result["markdown_length"] > 0
+    
+    def test_auto_detection_priority_extension_over_binary(self, client: PowerRAGClient, tmp_path):
+        """input_type='auto' prefers the extension over binary detection."""
+        # Create an HTML file
+        html_file = tmp_path / "priority_test.html"
+        html_content = "<html><body><h1>Priority Test</h1><p>Extension should be used first</p></body></html>"
+        html_file.write_text(html_content)
+        
+        with open(html_file, "rb") as f:
+            file_binary = f.read()
+        
+        # The filename has a .html extension, so the extension should be used first
+        result = client.document.parse_to_md_binary(
+            file_binary=file_binary,
+            filename="priority_test.html"
+            # input_type='auto' (default)
+        )
+        
+        assert "markdown" in result
+        assert result["filename"] == "priority_test.html"
+        # Passes if the heading text survives or any markdown was produced (i.e. parsing succeeded)
+        assert "Priority Test" in result["markdown"] or result["markdown_length"] > 0
+    
+    def test_auto_detection_fallback_to_binary(self, client: PowerRAGClient):
+        """With an unsupported extension, detection falls back to the binary content."""
+        html_content = b"<html><body><h1>Fallback Test</h1></body></html>"
+        
+        # Use an unsupported extension
+        result = client.document.parse_to_md_binary(
+            file_binary=html_content,
+            filename="document.unknown_ext"
+            # input_type='auto' tries .unknown_ext first (no match), then detects from the binary
+        )
+        
+        # Binary detection should identify the content as HTML
+        assert "markdown" in result
+        assert result["markdown_length"] > 0
+    
+    def test_config_with_input_type(self, client: PowerRAGClient, tmp_path):
+        """Passing a config dict together with an explicit input_type argument."""
+        html_file = tmp_path / "config_test.html"
+        html_content = "<html><body><h1>Config Test</h1></body></html>"
+        html_file.write_text(html_content)
+        
+        with open(html_file, "rb") as f:
+            file_binary = f.read()
+        
+        # Use config and input_type together
+        result = client.document.parse_to_md_binary(
+            file_binary=file_binary,
+            filename="config_test.html",
+            config={
+                "layout_recognize": "mineru",
+                "enable_table": True
+            },
+            input_type="html"
+        )
+        
         assert "markdown" in result
         assert result["markdown_length"] > 0
\ No newline at end of file
diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py
index 4cc881810..b1a63674b 100644
--- a/powerrag/server/routes/powerrag_routes.py
+++ b/powerrag/server/routes/powerrag_routes.py
@@ -980,6 +980,9 @@ async def parse_to_md_upload(tenant_id):
     - enable_table (bool): Enable table recognition (default: true)
     - from_page (int): Start page number (default: 0)
     - to_page (int): End page number (default: 100000)
+    - input_type (str): File type detection mode (default: 'auto'). Options:
+        * 'auto': Try filename extension first, then auto-detect from binary if no extension (default)
+        * 'pdf', 'office', 'html', 'image': Explicit file type (bypass detection)
     
     Response JSON:
     {
@@ -1042,34 +1045,61 @@ async def parse_to_md_upload(tenant_id):
         # Add filename to config
         config['filename'] = filename
         
-        # Determine format type
-        from pathlib import Path
-        file_ext = Path(filename).suffix.lstrip('.').lower()
-        
-        logger.info(f"Parsed filename: {filename}, extension: '{file_ext}'")
-        
-        if not file_ext:
-            return jsonify({
-                "code": 400,
-                "message": f"File must have an extension. Filename: '{filename}', parsed extension: '{file_ext}'"
-            }), 400
+        # Get input_type parameter (default: 'auto')
+        input_type = config.get('input_type', 'auto')
         
-        # Supported: PDF, Office (doc/docx/ppt/pptx), HTML, Markdown, Images (jpg/png)
-        format_type_map = {
-            'pdf': 'pdf',
-            'docx': 'office', 'doc': 'office',
-            'xlsx': 'office', 'xls': 'office',
-            'pptx': 'office', 'ppt': 'office',
-            'html': 'html', 'htm': 'html',
-            'jpg': 'image', 'jpeg': 'image',
-            'png': 'image'
-        }
-        
-        format_type = format_type_map.get(file_ext)
-        if not format_type:
+        # Determine format type based on input_type
+        if input_type == 'auto':
+            # Auto mode: Try extension first, then binary detection
+            from pathlib import Path
+            file_ext = Path(filename).suffix.lstrip('.').lower()
+            
+            if file_ext:
+                # Has extension, try to use it
+                format_type_map = {
+                    'pdf': 'pdf',
+                    'docx': 'office', 'doc': 'office',
+                    'xlsx': 'office', 'xls': 'office',
+                    'pptx': 'office', 'ppt': 'office',
+                    'html': 'html', 'htm': 'html',
+                    'jpg': 'image', 'jpeg': 'image',
+                    'png': 'image'
+                }
+                format_type = format_type_map.get(file_ext)
+                
+                if format_type:
+                    # Valid extension found
+                    logger.info(f"Using filename extension: {format_type} (.{file_ext}) for file: {filename}")
+                else:
+                    # Unsupported extension, try auto-detect from binary
+                    from powerrag.utils.file_utils import detect_file_type
+                    format_type = detect_file_type(binary)
+                    logger.info(f"Extension '{file_ext}' not supported, auto-detected from binary: {format_type} for file: {filename}")
+                    
+                    if format_type == 'unknown':
+                        return jsonify({
+                            "code": 400,
+                            "message": f"Unsupported file extension: {file_ext}. Supported formats: pdf, doc, docx, ppt, pptx, jpg, png, html. Binary auto-detection also failed."
+                        }), 400
+            else:
+                # No extension, auto-detect from binary content
+                from powerrag.utils.file_utils import detect_file_type
+                format_type = detect_file_type(binary)
+                logger.info(f"No extension found, auto-detected file type from binary: {format_type} for file: {filename}")
+                
+                if format_type == 'unknown':
+                    return jsonify({
+                        "code": 400,
+                        "message": f"Unable to determine file type for {filename}. File has no extension and binary auto-detection failed. Please provide a file with a valid extension or specify input_type explicitly."
+                    }), 400
+        elif input_type in ['pdf', 'office', 'html', 'image']:
+            # Use explicitly specified input_type
+            format_type = input_type
+            logger.info(f"Using explicit input_type: {format_type} for file: {filename}")
+        else:
             return jsonify({
                 "code": 400,
-                "message": f"Unsupported file format: {file_ext}. Supported formats: pdf, doc, docx, ppt, pptx, jpg, png, html"
+                "message": f"Invalid input_type: {input_type}. Must be 'auto' (default), 'pdf', 'office', 'html', or 'image'."
             }), 400
         
         # Create service and parse
diff --git a/powerrag/server/services/parse_service.py b/powerrag/server/services/parse_service.py
index 8e4f96abb..1c4d24f09 100644
--- a/powerrag/server/services/parse_service.py
+++ b/powerrag/server/services/parse_service.py
@@ -140,7 +140,7 @@ def parse_document(self, doc_id: str) -> Dict[str, Any]:
             raise
     
     def parse_file_binary(self, binary: bytes, filename: str,
-                         config: Dict[str, Any] = None) -> Dict[str, Any]:
+                         config: Dict[str, Any] = None, input_type: str = 'auto') -> Dict[str, Any]:
         """
         Parse file binary directly (without doc_id) and return markdown + images
         
@@ -150,9 +150,11 @@ def parse_file_binary(self, binary: bytes, filename: str,
         
         Args:
             binary: File binary data
-            filename: Original filename (used to detect format)
-            parser_id: Parser ID (e.g., "title", "naive", "paper")
+            filename: Original filename (used to detect format if possible)
             config: Parser configuration
+            input_type: File type detection mode (default: 'auto'). Can be:
+                - 'auto': Try filename extension first, then auto-detect from binary if no extension (default)
+                - 'pdf', 'office', 'html', 'image', 'markdown': Explicit file type (bypass detection)
             
         Returns:
             Dict containing markdown content and images
@@ -164,22 +166,43 @@ def parse_file_binary(self, binary: bytes, filename: str,
             }
         """
         try:
-            # Check if format is supported
-            file_ext = Path(filename).suffix.lstrip('.').lower()
-            if file_ext not in self.SUPPORTED_FORMATS:
+            # Determine format type based on input_type parameter
+            if input_type == 'auto':
+                # Auto mode: Try extension first, then binary detection
+                file_ext = Path(filename).suffix.lstrip('.').lower()
+                
+                if file_ext and file_ext in self.SUPPORTED_FORMATS:
+                    # Has valid extension, use it
+                    format_type = self.SUPPORTED_FORMATS[file_ext]
+                    logger.info(f"Using filename extension for file type: {format_type} (.{file_ext}) for file: {filename}")
+                else:
+                    # No extension or unsupported extension, auto-detect from binary
+                    from powerrag.utils.file_utils import detect_file_type
+                    format_type = detect_file_type(binary)
+                    logger.info(f"Auto-detected file type from binary: {format_type} for file: {filename}")
+                    
+                    if format_type == 'unknown':
+                        raise ValueError(
+                            f"Unable to determine file type for {filename}. "
+                            f"File has no extension or unsupported extension '{file_ext}', and binary auto-detection failed. "
+                            f"Please provide a valid input_type explicitly."
+                        )
+            elif input_type in ['pdf', 'office', 'html', 'image', 'markdown']:
+                # Use explicitly specified input_type
+                format_type = input_type
+                logger.info(f"Using explicit input_type: {format_type} for file: {filename}")
+            else:
                 raise ValueError(
-                    f"Unsupported format: .{file_ext}. "
-                    f"PowerRAG only supports: {', '.join(sorted(set(self.SUPPORTED_FORMATS.values())))}"
+                    f"Invalid input_type: {input_type}. "
+                    f"Must be 'auto', 'pdf', 'office', 'html', 'image', or 'markdown'"
                 )
             
-            format_type = self.SUPPORTED_FORMATS[file_ext]
-            
             # Parse document to get markdown and images
             md_content, images = self._parse_to_markdown(filename, binary, format_type, config)
             
             return {
                 "filename": filename,
-                "file_format": file_ext,
+                "file_format": Path(filename).suffix.lstrip('.').lower() if filename else 'unknown',
                 "format_type": format_type,
                 "markdown": md_content,
                 "images": images,
@@ -636,15 +659,24 @@ def _parse_to_markdown_for_task(self, doc_id: str = None, filename: str = None,
                 logger.error(f"Failed to get binary for doc {doc_id}: {e}", exc_info=True)
                 raise ValueError(f"Failed to retrieve document binary: {e}")
             
-            # Determine format
-            file_ext = Path(doc.name).suffix.lstrip('.').lower()
-            format_type_map = {
-                'pdf': 'pdf', 'docx': 'office', 'doc': 'office',
-                'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office',
-                'html': 'html', 'htm': 'html',
-                'jpg': 'image', 'jpeg': 'image', 'png': 'image'
-            }
-            format_type = format_type_map.get(file_ext, 'pdf')
+            # Determine format from config or filename
+            input_type = config.get('input_type')
+            if input_type == 'auto':
+                from powerrag.utils.file_utils import detect_file_type
+                format_type = detect_file_type(binary)
+                logger.info(f"Auto-detected file type: {format_type} for document {doc_id}")
+            elif input_type:
+                format_type = input_type
+            else:
+                # Auto-detect from file extension
+                file_ext = Path(doc.name).suffix.lstrip('.').lower()
+                format_type_map = {
+                    'pdf': 'pdf', 'docx': 'office', 'doc': 'office',
+                    'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office',
+                    'html': 'html', 'htm': 'html',
+                    'jpg': 'image', 'jpeg': 'image', 'png': 'image'
+                }
+                format_type = format_type_map.get(file_ext, 'pdf')
             filename = doc.name
         
         # Case 2: Parse from direct binary (filename, binary, format_type provided)
diff --git a/powerrag/utils/file_utils.py b/powerrag/utils/file_utils.py
index f4a58bba9..bf6d7b084 100644
--- a/powerrag/utils/file_utils.py
+++ b/powerrag/utils/file_utils.py
@@ -313,6 +313,71 @@ def _guess_ext(b: bytes) -> str:
         return ".doc"
     return ".bin"
 
+
+def detect_file_type(binary: bytes) -> str:
+    """
+    Detect a PowerRAG format type from file content using magic numbers.
+    
+    Checks run in this order: PDF, ZIP container (OOXML), OLE container,
+    JPEG/PNG signature, then a text sniff of the first 1024 bytes for HTML.
+    Inputs shorter than 8 bytes are always reported as 'unknown'.
+    
+    A ZIP container without word/, ppt/ or xl/ entries is reported as
+    'unknown' instead of being passed on to the HTML text sniff.
+    
+    Args:
+        binary: File binary data
+        
+    Returns:
+        One of:
+        - 'pdf': PDF files
+        - 'office': Office documents (doc, docx, xls, xlsx, ppt, pptx)
+        - 'html': HTML markers found in the first 1024 bytes
+        - 'image': Image files (jpg, jpeg, png)
+        - 'unknown': Unable to determine file type
+    """
+    if not binary or len(binary) < 8:
+        return 'unknown'
+    
+    head = binary[:8]
+    
+    # Check PDF
+    if _is_pdf(head):
+        return 'pdf'
+    
+    # Check ZIP-based Office formats (docx, xlsx, pptx)
+    if _is_zip(head):
+        ooxml_dirs = ("word/", "ppt/", "xl/")
+        try:
+            with zipfile.ZipFile(io.BytesIO(binary), "r") as z:
+                names = [n.lower() for n in z.namelist()]
+        except Exception:
+            # Best effort: a truncated or corrupt archive is simply not recognised.
+            names = []
+        if any(n.startswith(ooxml_dirs) for n in names):
+            return 'office'
+        # Any other ZIP container (plain .zip, .epub, .jar, ...) is unsupported.
+        # Stop here: stored entries can expose raw markup near the start of the
+        # archive, which the HTML text sniff below would misreport as 'html'.
+        return 'unknown'
+    
+    # Check OLE-based Office formats (doc, xls, ppt)
+    if _is_ole(head):
+        return 'office'
+    
+    # JPEG starts with FF D8 FF; PNG starts with 89 50 4E 47 0D 0A 1A 0A
+    if head.startswith((b"\xFF\xD8\xFF", b"\x89PNG\r\n\x1a\n")):
+        return 'image'
+    
+    # HTML has no magic number, so look for typical markup near the start.
+    # errors='ignore' drops undecodable bytes, so decoding bytes cannot fail.
+    html_markers = ('<html', '<!doctype html', '<head')
+    text_sample = binary[:1024].decode('utf-8', errors='ignore').lower()
+    if any(marker in text_sample for marker in html_markers):
+        return 'html'
+    
+    return 'unknown'
+
 # Try to extract the real embedded payload from OLE's Ole10Native
 def _extract_ole10native_payload(data: bytes) -> bytes:
     try:

From 045ad0af371552f85409ab5973c19c58dd86feae Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Fri, 23 Jan 2026 15:19:12 +0800
Subject: [PATCH 13/19] feat(document): add support for file_url to parse
 documents from URL

---
 powerrag/sdk/README.md                    | 229 +++++++++++++++++++++-
 powerrag/sdk/tests/test_document.py       | 197 ++++++++++++++++++-
 powerrag/server/routes/powerrag_routes.py |  93 ++++++---
 3 files changed, 492 insertions(+), 27 deletions(-)

diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md
index d2c470b87..410d07f2c 100644
--- a/powerrag/sdk/README.md
+++ b/powerrag/sdk/README.md
@@ -119,11 +119,49 @@ print(f"Status: {status['status']}")
 # 等待任务完成
 result = client.document.wait_for_parse_to_md(task_id, timeout=300)
 
-# 直接上传文件并解析为 Markdown
+# 直接上传文件并解析为 Markdown（无需知识库）
 result = client.document.parse_to_md_upload(
     "document.pdf",
     config={"layout_recognize": "mineru"}
 )
+
+# 从 URL 下载并解析（直接调用 API）
+import requests
+import json
+
+response = requests.post(
+    f"{client.api_url}/powerrag/parse_to_md/upload",
+    headers={"Authorization": f"Bearer {client.api_key}"},
+    data={
+        "file_url": "https://example.com/document.pdf",
+        "config": json.dumps({"layout_recognize": "mineru"})
+    }
+)
+result = response.json()
+
+# 使用二进制数据解析（支持无扩展名文件）
+with open("document.pdf", "rb") as f:
+    binary_data = f.read()
+
+result = client.document.parse_to_md_binary(
+    file_binary=binary_data,
+    filename="document.pdf",  # 有扩展名时自动识别
+    config={"layout_recognize": "mineru"}
+)
+
+# 对于无扩展名的文件，使用 input_type='auto' 自动识别
+result = client.document.parse_to_md_binary(
+    file_binary=binary_data,
+    filename="document_no_extension",  # 无扩展名
+    # input_type='auto' 是默认值，会自动从二进制内容检测文件类型
+)
+
+# 或显式指定文件类型
+result = client.document.parse_to_md_binary(
+    file_binary=binary_data,
+    filename="document",
+    input_type="pdf"  # 强制作为 PDF 处理
+)
 ```
 
 ### 检索
@@ -156,7 +194,7 @@ PowerRAG SDK 提供了强大的文档解析为 Markdown 的功能，支持多种
 - 图片 (.jpg, .png)
 - HTML (.html, .htm)
 
-**三种使用方式：**
+**四种使用方式：**
 
 1. **同步解析**（适合小文档）：
 ```python
@@ -173,7 +211,38 @@ result = client.document.wait_for_parse_to_md(task_id, timeout=300)
 
 3. **直接上传解析**（无需知识库）：
 ```python
+# 上传本地文件
 result = client.document.parse_to_md_upload("file.pdf", config={...})
+
+# 或使用 file_url 表单字段从 URL 下载（file_url 与 config 同级，不放在 config 内）
+import requests
+response = requests.post(
+    "http://localhost:9390/api/v1/powerrag/parse_to_md/upload",
+    headers={"Authorization": "Bearer YOUR_API_KEY"},
+    data={
+        "file_url": "https://example.com/document.pdf",
+        "config": '{"layout_recognize": "mineru"}'
+    }
+)
+```
+
+4. **二进制数据解析**（支持无扩展名文件）：
+```python
+with open("document.pdf", "rb") as f:
+    binary_data = f.read()
+
+# 自动识别文件类型（默认）
+result = client.document.parse_to_md_binary(
+    file_binary=binary_data,
+    filename="document_no_extension"  # 无扩展名也可以
+)
+
+# 或显式指定类型
+result = client.document.parse_to_md_binary(
+    file_binary=binary_data,
+    filename="document",
+    input_type="pdf"  # 'pdf', 'office', 'html', 'image'
+)
 ```
 
 **配置选项：**
@@ -182,6 +251,9 @@ result = client.document.parse_to_md_upload("file.pdf", config={...})
 - `enable_formula`: 是否识别公式
 - `enable_table`: 是否识别表格
 - `from_page`/`to_page`: PDF 页面范围
+- `input_type`: 文件类型识别模式（默认: `'auto'`）
+  - `'auto'`: 优先使用文件扩展名，无扩展名或扩展名不受支持时从二进制内容自动检测（推荐）
+  - `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定文件类型
 
 ### 结构化信息抽取
 
@@ -339,6 +411,62 @@ result = client.document.wait_for_parse_to_md(task_id, timeout=300)
 # 上传并解析为 Markdown（无需知识库）
 result = client.document.parse_to_md_upload("file.pdf", config={...})
 
+# 使用 file_url 参数从 URL 下载并解析（直接调用 API）
+import requests
+import json
+
+response = requests.post(
+    f"{client.api_url}/powerrag/parse_to_md/upload",
+    headers={"Authorization": f"Bearer {client.api_key}"},
+    data={
+        "file_url": "https://example.com/document.pdf",
+        "config": json.dumps({
+            "layout_recognize": "mineru",
+            "input_type": "auto"  # 可选，自动检测文件类型
+        })
+    }
+)
+result = response.json()
+
+# 使用 file_url 并指定文件名
+response = requests.post(
+    f"{client.api_url}/powerrag/parse_to_md/upload",
+    headers={"Authorization": f"Bearer {client.api_key}"},
+    data={
+        "file_url": "https://example.com/download?id=123",
+        "config": json.dumps({
+            "filename": "report.pdf",  # 自定义文件名
+            "input_type": "pdf",
+            "layout_recognize": "mineru"
+        })
+    }
+)
+
+# 使用二进制数据解析为 Markdown
+with open("document.pdf", "rb") as f:
+    binary_data = f.read()
+
+result = client.document.parse_to_md_binary(
+    file_binary=binary_data,
+    filename="document.pdf",
+    config={"layout_recognize": "mineru"},
+    input_type="auto"  # 默认值，自动识别文件类型
+)
+
+# 无扩展名文件解析（自动检测文件类型）
+result = client.document.parse_to_md_binary(
+    file_binary=binary_data,
+    filename="document_no_extension",  # 无扩展名
+    # input_type='auto' 会从二进制内容自动检测 PDF/Office/HTML 等
+)
+
+# 显式指定文件类型（跳过自动检测）
+result = client.document.parse_to_md_binary(
+    file_binary=binary_data,
+    filename="document",
+    input_type="pdf"  # 强制作为 PDF 处理
+)
+
 # 解析URL文档（同步等待）
 doc = client.document.parse_url(
     kb_id,
@@ -1043,6 +1171,103 @@ for result in results:
         # 重新解析或删除
 ```
 
+### Q: 如何解析无扩展名的文件？
+
+A: 使用 `parse_to_md_binary` 方法并使用 `input_type='auto'`（默认值）：
+```python
+with open("document_no_extension", "rb") as f:
+    binary_data = f.read()
+
+# input_type='auto' 会自动从二进制内容检测文件类型
+result = client.document.parse_to_md_binary(
+    file_binary=binary_data,
+    filename="document_no_extension"
+    # input_type='auto' 是默认值，可以省略
+)
+```
+
+支持的 `input_type` 值：
+- `'auto'` (默认): 优先使用文件扩展名，无扩展名或不支持时从二进制内容自动检测
+- `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定文件类型
+
+### Q: 如何从URL直接解析文件？
+
+A: 在 `/parse_to_md/upload` API 请求中使用 `file_url` 参数，服务器会自动下载并解析：
+
+**方式 1：基本用法**
+```python
+import requests
+import json
+
+# 使用 file_url 参数
+response = requests.post(
+    "http://localhost:9390/api/v1/powerrag/parse_to_md/upload",
+    headers={"Authorization": "Bearer YOUR_API_KEY"},
+    data={
+        "file_url": "https://example.com/document.pdf",
+        "config": json.dumps({
+            "layout_recognize": "mineru",
+            "input_type": "auto"  # 可选，默认为 'auto'
+        })
+    }
+)
+result = response.json()
+print(result['data']['markdown'])
+```
+
+**方式 2：指定文件名（适用于无扩展名URL）**
+```python
+response = requests.post(
+    "http://localhost:9390/api/v1/powerrag/parse_to_md/upload",
+    headers={"Authorization": "Bearer YOUR_API_KEY"},
+    data={
+        "file_url": "https://api.example.com/download?id=123",
+        "config": json.dumps({
+            "filename": "report.pdf",  # 覆盖文件名
+            "input_type": "pdf",  # 显式指定类型
+            "layout_recognize": "mineru",
+            "enable_table": True
+        })
+    }
+)
+```
+
+**方式 3：与 SDK 客户端结合使用**
+```python
+from powerrag.sdk import PowerRAGClient
+import requests
+import json
+
+client = PowerRAGClient(api_key="your-api-key", base_url="http://localhost:9390")
+
+# 使用客户端的 api_url 和认证信息
+response = requests.post(
+    f"{client.api_url}/powerrag/parse_to_md/upload",
+    headers={"Authorization": f"Bearer {client.api_key}"},
+    data={
+        "file_url": "https://example.com/document.pdf",
+        "config": json.dumps({"layout_recognize": "mineru"})
+    }
+)
+```
+
+**配置参数说明：**
+- `file_url` (str): 文件的 URL 地址（与 `file` 参数二选一）
+- `config.filename` (str): 自定义文件名（可选，不提供则从 URL 提取）
+- `config.input_type` (str): 文件类型检测模式
+  - `'auto'` (默认): 优先使用扩展名，无扩展名或扩展名不受支持时从二进制内容自动检测
+  - `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定类型
+- `config.layout_recognize` (str): 布局识别引擎（`'mineru'` 或 `'dots_ocr'`）
+- 其他解析配置参数同 `parse_to_md` 方法
+
+**注意事项：**
+- ✅ URL 必须可公开访问，不支持需要认证的 URL
+- ✅ 下载超时时间为 60 秒
+- ✅ 支持所有文件格式（PDF, Office, HTML, 图片）
+- ✅ 自动从 URL 路径提取文件名，或使用 `config.filename` 覆盖
+- ❌ 不能同时提供 `file` 和 `file_url` 参数
+- ❌ 不能同时省略 `file` 和 `file_url`（必须提供其中之一）
+
 ### Q: SDK 是否支持流式返回？
 
 A: 当前版本主要支持标准 REST API 调用。对于下载等操作，SDK 内部使用了流式传输。
diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py
index 90e811c54..d882c4cda 100644
--- a/powerrag/sdk/tests/test_document.py
+++ b/powerrag/sdk/tests/test_document.py
@@ -766,4 +766,199 @@ def test_config_with_input_type(self, client: PowerRAGClient, tmp_path):
         )
         
         assert "markdown" in result
-        assert result["markdown_length"] > 0
\ No newline at end of file
+        assert result["markdown_length"] > 0
+
+
+class TestDocumentFileUrl:
+    """Tests for the file_url parameter."""
+    
+    def test_parse_from_url_basic(self, client: PowerRAGClient):
+        """Download a file from a URL and parse it (basic case)."""
+        import requests
+        import json
+        
+        # Use a publicly reachable sample HTML URL
+        file_url = "https://httpbin.org/html"
+        
+        # Call the API directly (original note: the SDK method has been removed)
+        response = requests.post(
+            f"{client.api_url}/powerrag/parse_to_md/upload",
+            headers={"Authorization": f"Bearer {client.api_key}"},
+            data={
+                "file_url": file_url,
+                "config": json.dumps({"input_type": "html"})
+            }
+        )
+        
+        assert response.status_code == 200
+        result = response.json()
+        assert result["code"] == 0
+        assert "markdown" in result["data"]
+        assert result["data"]["markdown_length"] > 0
+    
+    def test_parse_from_url_with_filename(self, client: PowerRAGClient):
+        """Download from a URL while overriding the filename via config."""
+        import requests
+        import json
+        
+        file_url = "https://httpbin.org/html"
+        custom_filename = "custom_document.html"
+        
+        response = requests.post(
+            f"{client.api_url}/powerrag/parse_to_md/upload",
+            headers={"Authorization": f"Bearer {client.api_key}"},
+            data={
+                "file_url": file_url,
+                "config": json.dumps({
+                    "filename": custom_filename,
+                    "input_type": "html"
+                })
+            }
+        )
+        
+        assert response.status_code == 200
+        result = response.json()
+        assert result["code"] == 0
+        assert result["data"]["filename"] == custom_filename
+    
+    def test_parse_from_url_with_auto_detection(self, client: PowerRAGClient):
+        """Download from a URL and rely on input_type='auto' detection."""
+        import requests
+        import json
+        
+        file_url = "https://httpbin.org/html"
+        
+        response = requests.post(
+            f"{client.api_url}/powerrag/parse_to_md/upload",
+            headers={"Authorization": f"Bearer {client.api_key}"},
+            data={
+                "file_url": file_url,
+                "config": json.dumps({
+                    "input_type": "auto"  # auto-detect
+                })
+            }
+        )
+        
+        assert response.status_code == 200
+        result = response.json()
+        assert result["code"] == 0
+        assert "markdown" in result["data"]
+    
+    def test_parse_from_invalid_url(self, client: PowerRAGClient):
+        """An unreachable URL should produce an error response."""
+        import requests
+        import json
+        
+        invalid_url = "https://invalid-url-that-does-not-exist-12345.com/file.pdf"
+        
+        response = requests.post(
+            f"{client.api_url}/powerrag/parse_to_md/upload",
+            headers={"Authorization": f"Bearer {client.api_key}"},
+            data={
+                "file_url": invalid_url,
+                "config": json.dumps({})
+            }
+        )
+        
+        # Expect a 400 error
+        assert response.status_code == 400
+        result = response.json()
+        assert result["code"] == 400
+        assert "Failed to download" in result["message"]
+    
+    def test_parse_cannot_provide_both_file_and_url(self, client: PowerRAGClient, tmp_path):
+        """Providing both file and file_url must be rejected."""
+        import requests
+        
+        # Create a temporary file
+        html_file = tmp_path / "test.html"
+        html_file.write_text("<html><body>Test</body></html>")
+        
+        file_url = "https://httpbin.org/html"
+        
+        # Send both file and file_url
+        with open(html_file, "rb") as f:
+            response = requests.post(
+                f"{client.api_url}/powerrag/parse_to_md/upload",
+                headers={"Authorization": f"Bearer {client.api_key}"},
+                files={"file": ("test.html", f, "text/html")},
+                data={
+                    "file_url": file_url,
+                    "config": "{}"
+                }
+            )
+        
+        # Expect a 400 error
+        assert response.status_code == 400
+        result = response.json()
+        assert result["code"] == 400
+        assert "Cannot provide both" in result["message"]
+    
+    def test_parse_must_provide_file_or_url(self, client: PowerRAGClient):
+        """One of file or file_url must be provided."""
+        import requests
+        
+        # Send neither file nor file_url
+        response = requests.post(
+            f"{client.api_url}/powerrag/parse_to_md/upload",
+            headers={"Authorization": f"Bearer {client.api_key}"},
+            data={"config": "{}"}
+        )
+        
+        # Expect a 400 error
+        assert response.status_code == 400
+        result = response.json()
+        assert result["code"] == 400
+        assert "Either 'file' or 'file_url' must be provided" in result["message"]
+    
+    def test_parse_from_url_with_config(self, client: PowerRAGClient):
+        """Download from a URL using a full parser configuration."""
+        import requests
+        import json
+        
+        file_url = "https://httpbin.org/html"
+        
+        response = requests.post(
+            f"{client.api_url}/powerrag/parse_to_md/upload",
+            headers={"Authorization": f"Bearer {client.api_key}"},
+            data={
+                "file_url": file_url,
+                "config": json.dumps({
+                    "filename": "complete_config.html",
+                    "input_type": "html",
+                    "layout_recognize": "mineru",
+                    "enable_table": True
+                })
+            }
+        )
+        
+        assert response.status_code == 200
+        result = response.json()
+        assert result["code"] == 0
+        assert result["data"]["filename"] == "complete_config.html"
+        assert "markdown" in result["data"]
+    
+    def test_parse_from_url_empty_file(self, client: PowerRAGClient):
+        """Downloading an empty file from a URL should produce an error."""
+        import requests
+        import json
+        
+        # Use a URL that returns an empty body (if available)
+        # NOTE: this test may need a mock; it currently uses the real service
+        # httpbin.org/bytes/0 returns 0 bytes
+        empty_url = "https://httpbin.org/bytes/0"
+        
+        response = requests.post(
+            f"{client.api_url}/powerrag/parse_to_md/upload",
+            headers={"Authorization": f"Bearer {client.api_key}"},
+            data={
+                "file_url": empty_url,
+                "config": json.dumps({"filename": "empty.bin"})
+            }
+        )
+        
+        # Expect a 400 error
+        assert response.status_code == 400
+        result = response.json()
+        assert result["code"] == 400
+        assert "empty" in result["message"].lower()
\ No newline at end of file
diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py
index b1a63674b..6a3dd6b1d 100644
--- a/powerrag/server/routes/powerrag_routes.py
+++ b/powerrag/server/routes/powerrag_routes.py
@@ -960,6 +960,7 @@ async def parse_to_md_upload(tenant_id):
     Parse uploaded file to Markdown WITHOUT chunking
     
     直接上传文件并解析为 Markdown，不进行切分。
+    支持两种方式：1) 直接上传文件 2) 提供文件URL
     
     支持的文件格式:
     - PDF (.pdf)
@@ -970,7 +971,8 @@ async def parse_to_md_upload(tenant_id):
     Authentication: Requires RAGFlow API key in Authorization header (Bearer token)
     
     Request (multipart/form-data):
-    - file: File to parse (required) - supports PDF, Office (doc/docx/ppt/pptx), Images (jpg/png), HTML
+    - file: File to parse (optional, required if file_url not provided)
+    - file_url: URL of file to download and parse (optional, required if file not provided)
     - config: JSON string of parser config (optional)
     
     Config parameters:
@@ -983,6 +985,7 @@ async def parse_to_md_upload(tenant_id):
     - input_type (str): File type detection mode (default: 'auto'). Options:
         * 'auto': Try filename extension first, then auto-detect from binary if no extension (default)
         * 'pdf', 'office', 'html', 'image': Explicit file type (bypass detection)
+    - filename (str): Override filename (optional, useful with file_url)
     
     Response JSON:
     {
@@ -998,21 +1001,6 @@ async def parse_to_md_upload(tenant_id):
     }
     """
     try:
-        # Check if file is present
-        files = await request.files
-        if 'file' not in files:
-            return jsonify({
-                "code": 400,
-                "message": "No file provided"
-            }), 400
-        
-        file = files['file']
-        if file.filename == '':
-            return jsonify({
-                "code": 400,
-                "message": "No file selected"
-            }), 400
-        
         # Parse config from JSON string if provided
         import json
         form = await request.form
@@ -1025,23 +1013,80 @@ async def parse_to_md_upload(tenant_id):
                 "message": "Invalid JSON in config parameter"
             }), 400
         
-        # Read file binary
-        filename = file.filename
-        logger.info(f"Received file upload: filename={filename}, file object={file}")
+        # Get file_url parameter
+        file_url = form.get('file_url')
+        
+        # Check if file or file_url is provided
+        files = await request.files
+        has_file = 'file' in files and files['file'].filename != ''
+        has_url = file_url and file_url.strip() != ''
         
-        if not filename:
+        if not has_file and not has_url:
             return jsonify({
                 "code": 400,
-                "message": "Filename is required"
+                "message": "Either 'file' or 'file_url' must be provided"
             }), 400
         
-        binary = file.read()
-        if not binary:
+        if has_file and has_url:
             return jsonify({
                 "code": 400,
-                "message": "File is empty"
+                "message": "Cannot provide both 'file' and 'file_url'. Please choose one."
             }), 400
         
+        # Handle file upload or URL download
+        if has_file:
+            # Direct file upload
+            file = files['file']
+            filename = file.filename
+            logger.info(f"Received file upload: filename={filename}")
+            
+            if not filename:
+                return jsonify({
+                    "code": 400,
+                    "message": "Filename is required"
+                }), 400
+            
+            binary = file.read()
+            if not binary:
+                return jsonify({
+                    "code": 400,
+                    "message": "File is empty"
+                }), 400
+        else:
+            # Download from URL
+            import requests
+            from urllib.parse import urlparse
+            from pathlib import Path
+            
+            logger.info(f"Downloading file from URL: {file_url}")
+            
+            try:
+                response = requests.get(file_url, timeout=60)
+                response.raise_for_status()
+                binary = response.content
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Failed to download file from URL: {file_url}. Error: {e}")
+                return jsonify({
+                    "code": 400,
+                    "message": f"Failed to download file from URL: {str(e)}"
+                }), 400
+            
+            if not binary:
+                return jsonify({
+                    "code": 400,
+                    "message": "Downloaded file is empty"
+                }), 400
+            
+            # Extract filename from URL or use override from config
+            filename = config.get('filename')
+            if not filename:
+                parsed_url = urlparse(file_url)
+                filename = Path(parsed_url.path).name
+                if not filename:
+                    filename = "downloaded_file"
+            
+            logger.info(f"Downloaded file: {filename}, size: {len(binary)} bytes")
+        
         # Add filename to config
         config['filename'] = filename
         

From 5cd36ad13df160f88bcb274f2e78dc7eebdd665e Mon Sep 17 00:00:00 2001
From: zhanggan7723 <suiyu.zg@oceanbase.com>
Date: Mon, 26 Jan 2026 20:13:54 +0800
Subject: [PATCH 14/19] Update powerrag/utils/file_utils.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 powerrag/utils/file_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/powerrag/utils/file_utils.py b/powerrag/utils/file_utils.py
index bf6d7b084..334517c5b 100644
--- a/powerrag/utils/file_utils.py
+++ b/powerrag/utils/file_utils.py
@@ -352,6 +352,8 @@ def detect_file_type(binary: bytes) -> str:
                 if any(n.startswith("xl/") for n in names):
                     return 'office'  # xlsx
         except Exception:
+            # Any error while reading as a ZIP (corrupt/non-Office archive, etc.)
+            # means we cannot classify it as a ZIP-based Office file; fall through.
             pass
     
     # Check OLE-based Office formats (doc, xls, ppt)

From 56358669c2517e16a441423cc82bd34110eb7b0c Mon Sep 17 00:00:00 2001
From: zhanggan7723 <suiyu.zg@oceanbase.com>
Date: Mon, 26 Jan 2026 20:20:19 +0800
Subject: [PATCH 15/19] Update powerrag/utils/file_utils.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 powerrag/utils/file_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/powerrag/utils/file_utils.py b/powerrag/utils/file_utils.py
index 334517c5b..5e96988e8 100644
--- a/powerrag/utils/file_utils.py
+++ b/powerrag/utils/file_utils.py
@@ -352,6 +352,9 @@ def detect_file_type(binary: bytes) -> str:
                 if any(n.startswith("xl/") for n in names):
                     return 'office'  # xlsx
         except Exception:
+            # If the ZIP is not a recognized Office document (or cannot be read),
+            # fall through to other format checks; it may be classified as 'unknown'
+            # at the end if no other type matches.
             # Any error while reading as a ZIP (corrupt/non-Office archive, etc.)
             # means we cannot classify it as a ZIP-based Office file; fall through.
             pass

From 2b3496a769ad71666a5e9fa9042e04ada5d6bab9 Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Tue, 27 Jan 2026 21:11:55 +0800
Subject: [PATCH 16/19] refactor(document): update README and code to use
 'content' instead of 'markdown' for parsed results

---
 powerrag/sdk/README.md                    |  54 ++++-
 powerrag/sdk/modules/document_manager.py  |  16 +-
 powerrag/sdk/tests/test_document.py       | 227 ++++++++++++------
 powerrag/server/routes/powerrag_routes.py | 276 +++++++++++++++++-----
 powerrag/server/services/parse_service.py |  61 +++--
 powerrag/utils/file_utils.py              |   2 +
 6 files changed, 470 insertions(+), 166 deletions(-)

diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md
index 410d07f2c..f59868396 100644
--- a/powerrag/sdk/README.md
+++ b/powerrag/sdk/README.md
@@ -124,6 +124,9 @@ result = client.document.parse_to_md_upload(
     "document.pdf",
     config={"layout_recognize": "mineru"}
 )
+# 访问返回结果
+print(result['content'])  # Markdown 内容
+print(result['total_images'])  # 图片总数
 
 # 从 URL 下载并解析（直接调用 API）
 import requests
@@ -138,6 +141,9 @@ response = requests.post(
     }
 )
 result = response.json()
+# 访问返回结果
+print(result['data']['content'])  # Markdown 内容
+print(result['data']['total_images'])  # 图片总数
 
 # 使用二进制数据解析（支持无扩展名文件）
 with open("document.pdf", "rb") as f:
@@ -148,6 +154,8 @@ result = client.document.parse_to_md_binary(
     filename="document.pdf",  # 有扩展名时自动识别
     config={"layout_recognize": "mineru"}
 )
+# 访问返回结果
+print(result['content'])  # Markdown 内容
 
 # 对于无扩展名的文件，使用 input_type='auto' 自动识别
 result = client.document.parse_to_md_binary(
@@ -155,13 +163,17 @@ result = client.document.parse_to_md_binary(
     filename="document_no_extension",  # 无扩展名
     # input_type='auto' 是默认值，会自动从二进制内容检测文件类型
 )
+# 访问返回结果
+print(result['content'])  # Markdown 内容
 
-# 或显式指定文件类型
+# 或显式指定文件类型（使用具体扩展名）
 result = client.document.parse_to_md_binary(
     file_binary=binary_data,
     filename="document",
-    input_type="pdf"  # 强制作为 PDF 处理
+    input_type="pdf"  # 具体扩展名: 'pdf', 'docx', 'html', 'jpg' 等
 )
+# 访问返回结果
+print(result['content'])  # Markdown 内容
 ```
 
 ### 检索
@@ -236,13 +248,20 @@ result = client.document.parse_to_md_binary(
     file_binary=binary_data,
     filename="document_no_extension"  # 无扩展名也可以
 )
+# 访问返回结果
+print(result['content'])  # Markdown 内容
 
 # 或显式指定类型
 result = client.document.parse_to_md_binary(
     file_binary=binary_data,
     filename="document",
-    input_type="pdf"  # 'pdf', 'office', 'html', 'image'
+    input_type="pdf"  # 具体扩展名: 'pdf', 'docx', 'html', 'jpg' 等
 )
+
+# 访问返回结果
+print(result['content'])  # Markdown 内容
+print(result['total_images'])  # 图片总数
+print(result['images'])  # 图片字典
 ```
 
 **配置选项：**
@@ -253,7 +272,7 @@ result = client.document.parse_to_md_binary(
 - `from_page`/`to_page`: PDF 页面范围
 - `input_type`: 文件类型识别模式（默认: `'auto'`）
   - `'auto'`: 优先使用文件扩展名，无扩展名时自动检测（推荐）
-  - `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定文件类型
+  - 具体文件扩展名（显式指定文件类型，跳过自动检测）: `'pdf'`, `'docx'`, `'doc'`, `'xlsx'`, `'xls'`, `'pptx'`, `'ppt'`, `'html'`, `'htm'`, `'jpg'`, `'jpeg'`, `'png'`
 
 ### 结构化信息抽取
 
@@ -410,6 +429,8 @@ result = client.document.wait_for_parse_to_md(task_id, timeout=300)
 
 # 上传并解析为 Markdown（无需知识库）
 result = client.document.parse_to_md_upload("file.pdf", config={...})
+# 访问返回结果
+print(result['content'])  # Markdown 内容
 
 # 使用 file_url 参数从 URL 下载并解析（直接调用 API）
 import requests
@@ -427,6 +448,8 @@ response = requests.post(
     }
 )
 result = response.json()
+# 访问返回结果
+print(result['data']['content'])  # Markdown 内容
 
 # 使用 file_url 并指定文件名
 response = requests.post(
@@ -441,6 +464,9 @@ response = requests.post(
         })
     }
 )
+result = response.json()
+# 访问返回结果
+print(result['data']['content'])  # Markdown 内容
 
 # 使用二进制数据解析为 Markdown
 with open("document.pdf", "rb") as f:
@@ -452,6 +478,8 @@ result = client.document.parse_to_md_binary(
     config={"layout_recognize": "mineru"},
     input_type="auto"  # 默认值，自动识别文件类型
 )
+# 访问返回结果
+print(result['content'])  # Markdown 内容
 
 # 无扩展名文件解析（自动检测文件类型）
 result = client.document.parse_to_md_binary(
@@ -459,6 +487,8 @@ result = client.document.parse_to_md_binary(
     filename="document_no_extension",  # 无扩展名
     # input_type='auto' 会从二进制内容自动检测 PDF/Office/HTML 等
 )
+# 访问返回结果
+print(result['content'])  # Markdown 内容
 
 # 显式指定文件类型（跳过自动检测）
 result = client.document.parse_to_md_binary(
@@ -466,6 +496,8 @@ result = client.document.parse_to_md_binary(
     filename="document",
     input_type="pdf"  # 强制作为 PDF 处理
 )
+# 访问返回结果
+print(result['content'])  # Markdown 内容
 
 # 解析URL文档（同步等待）
 doc = client.document.parse_url(
@@ -1184,11 +1216,13 @@ result = client.document.parse_to_md_binary(
     filename="document_no_extension"
     # input_type='auto' 是默认值，可以省略
 )
+# 访问返回结果
+print(result['content'])  # Markdown 内容
 ```
 
 支持的 `input_type` 值：
 - `'auto'` (默认): 优先使用文件扩展名，无扩展名或不支持时从二进制内容自动检测
-- `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定文件类型
+- 具体文件扩展名（显式指定文件类型，跳过自动检测）: `'pdf'`, `'docx'`, `'doc'`, `'xlsx'`, `'xls'`, `'pptx'`, `'ppt'`, `'html'`, `'htm'`, `'jpg'`, `'jpeg'`, `'png'`
 
 ### Q: 如何从URL直接解析文件？
 
@@ -1212,7 +1246,7 @@ response = requests.post(
     }
 )
 result = response.json()
-print(result['data']['markdown'])
+print(result['data']['content'])
 ```
 
 **方式 2：指定文件名（适用于无扩展名URL）**
@@ -1224,7 +1258,7 @@ response = requests.post(
         "file_url": "https://api.example.com/download?id=123",
         "config": json.dumps({
             "filename": "report.pdf",  # 覆盖文件名
-            "input_type": "pdf",  # 显式指定类型
+            "input_type": "pdf",  # 显式指定文件扩展名
             "layout_recognize": "mineru",
             "enable_table": True
         })
@@ -1249,6 +1283,10 @@ response = requests.post(
         "config": json.dumps({"layout_recognize": "mineru"})
     }
 )
+result = response.json()
+# 访问返回结果
+print(result['data']['content'])  # Markdown 内容
+print(result['data']['total_images'])  # 图片总数
 ```
 
 **配置参数说明：**
@@ -1256,7 +1294,7 @@ response = requests.post(
 - `config.filename` (str): 自定义文件名（可选，不提供则从 URL 提取）
 - `config.input_type` (str): 文件类型检测模式
   - `'auto'` (默认): 优先使用扩展名，无扩展名时自动检测
-  - `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定类型
+  - 具体文件扩展名（显式指定文件类型，跳过自动检测）: `'pdf'`, `'docx'`, `'doc'`, `'xlsx'`, `'xls'`, `'pptx'`, `'ppt'`, `'html'`, `'htm'`, `'jpg'`, `'jpeg'`, `'png'`
 - `config.layout_recognize` (str): 布局识别引擎（`'mineru'` 或 `'dots_ocr'`）
 - 其他解析配置参数同 `parse_to_md` 方法
 
diff --git a/powerrag/sdk/modules/document_manager.py b/powerrag/sdk/modules/document_manager.py
index 08b58fea4..8f3ac6bcd 100644
--- a/powerrag/sdk/modules/document_manager.py
+++ b/powerrag/sdk/modules/document_manager.py
@@ -695,9 +695,7 @@ def parse_to_md_upload(
         
         Returns:
             解析结果字典，包含以下字段：
-            - filename: 文件名
-            - markdown: Markdown 内容
-            - markdown_length: Markdown 长度
+            - content: Markdown 内容
             - images: 图片字典
             - total_images: 图片总数
         
@@ -710,14 +708,14 @@ def parse_to_md_upload(
             >>> result = doc_manager.parse_to_md_upload(
             ...     file_path="document.pdf"
             ... )
-            >>> print(result['markdown'])
+            >>> print(result['content'])
             >>> 
             >>> # 对于无扩展名文件，input_type='auto' 会自动从二进制内容识别
             >>> result = doc_manager.parse_to_md_upload(
             ...     file_path="document_no_ext"
             ...     # input_type='auto' 是默认值，可以省略
             ... )
-            >>> print(result['markdown'])
+            >>> print(result['content'])
             >>> 
             >>> # 显式指定文件类型（跳过自动识别）
             >>> result = doc_manager.parse_to_md_upload(
@@ -769,9 +767,7 @@ def parse_to_md_binary(
         
         Returns:
             解析结果字典，包含以下字段：
-            - filename: 文件名
-            - markdown: Markdown 内容
-            - markdown_length: Markdown 长度
+            - content: Markdown 内容
             - images: 图片字典 (base64)
             - total_images: 图片总数
         
@@ -787,7 +783,7 @@ def parse_to_md_binary(
             ...     file_binary=file_binary,
             ...     filename="document.pdf"
             ... )
-            >>> print(result['markdown'])
+            >>> print(result['content'])
             >>> 
             >>> # 对于无扩展名的二进制数据，input_type='auto' 会自动识别
             >>> result = doc_manager.parse_to_md_binary(
@@ -795,7 +791,7 @@ def parse_to_md_binary(
             ...     filename="document"  # 无扩展名
             ...     # input_type='auto' 是默认值
             ... )
-            >>> print(result['markdown'])
+            >>> print(result['content'])
             >>> 
             >>> # 显式指定文件类型（跳过自动识别）
             >>> result = doc_manager.parse_to_md_binary(
diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py
index d882c4cda..173b1ddde 100644
--- a/powerrag/sdk/tests/test_document.py
+++ b/powerrag/sdk/tests/test_document.py
@@ -396,13 +396,11 @@ def test_parse_to_md_upload_json_response(self, client: PowerRAGClient, test_fil
         result = client.document.parse_to_md_upload(test_file_path)
         
         # 验证返回结果
-        assert "filename" in result
-        assert "markdown" in result
-        assert "markdown_length" in result
+        assert "content" in result
         assert "images" in result
         assert "total_images" in result
-        assert isinstance(result["markdown"], str)
-        assert result["markdown_length"] > 0
+        assert isinstance(result["content"], str)
+        assert len(result["content"]) > 0
     
     def test_parse_to_md_upload_with_config(self, client: PowerRAGClient, test_file_path: str):
         """测试带配置参数上传并解析"""
@@ -412,8 +410,8 @@ def test_parse_to_md_upload_with_config(self, client: PowerRAGClient, test_file_
         }
         result = client.document.parse_to_md_upload(test_file_path, config=config)
         
-        assert "markdown" in result
-        assert len(result["markdown"]) > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
     
     def test_parse_to_md_upload_nonexistent_file(self, client: PowerRAGClient):
         """测试上传不存在的文件"""
@@ -426,8 +424,8 @@ def test_parse_to_md_upload_different_formats(self, client: PowerRAGClient, test
         # 这里我们只测试 txt 文件，实际使用时可以添加更多格式
         result = client.document.parse_to_md_upload(test_file_path)
         
-        assert "markdown" in result
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
 
 
 class TestDocumentParseToMDBinary:
@@ -446,13 +444,11 @@ def test_parse_to_md_binary_basic(self, client: PowerRAGClient, test_file_path:
         )
         
         # 验证返回结果
-        assert "filename" in result
-        assert "markdown" in result
-        assert "markdown_length" in result
+        assert "content" in result
         assert "images" in result
         assert "total_images" in result
-        assert isinstance(result["markdown"], str)
-        assert result["markdown_length"] > 0
+        assert isinstance(result["content"], str)
+        assert len(result["content"]) > 0
     
     def test_parse_to_md_binary_with_config(self, client: PowerRAGClient, test_file_path: str):
         """测试带配置参数的二进制文件解析"""
@@ -471,9 +467,8 @@ def test_parse_to_md_binary_with_config(self, client: PowerRAGClient, test_file_
             config=config
         )
         
-        assert "markdown" in result
-        assert len(result["markdown"]) > 0
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
     
     def test_parse_to_md_binary_empty_content(self, client: PowerRAGClient):
         """测试空的二进制内容"""
@@ -510,8 +505,8 @@ def test_parse_to_md_binary_different_file_types(self, client: PowerRAGClient, t
             filename="test.html"
         )
         
-        assert "markdown" in result
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
     
     def test_parse_to_md_binary_with_images(self, client: PowerRAGClient, test_file_path: str):
         """测试解析带图片的文档（二进制）"""
@@ -541,8 +536,7 @@ def test_parse_to_md_binary_filename_with_extension(self, client: PowerRAGClient
             filename="document.html"
         )
         
-        assert result["filename"] == "document.html"
-        assert "markdown" in result
+        assert "content" in result
     
     def test_parse_to_md_binary_large_file(self, client: PowerRAGClient, tmp_path):
         """测试较大文件的二进制解析"""
@@ -559,10 +553,10 @@ def test_parse_to_md_binary_large_file(self, client: PowerRAGClient, tmp_path):
             filename="large_test.html"
         )
         
-        assert "markdown" in result
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
         # 验证内容长度合理
-        assert len(result["markdown"]) > 1000
+        assert len(result["content"]) > 1000
     
     def test_parse_to_md_binary_utf8_content(self, client: PowerRAGClient, tmp_path):
         """测试包含UTF-8字符的文件"""
@@ -588,13 +582,111 @@ def test_parse_to_md_binary_utf8_content(self, client: PowerRAGClient, tmp_path)
             filename="utf8_test.html"
         )
         
-        assert "markdown" in result
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
 
 
 class TestDocumentInputTypeAutoDetection:
     """测试 input_type 自动检测功能"""
     
+    @staticmethod
+    def _create_valid_pdf_content(text_lines=None):
+        """
+        Build the bytes of a small single-page PDF used as parser test input.
+        
+        Args:
+            text_lines: Optional list of strings drawn on the page (reportlab path only).
+        
+        Returns:
+            bytes: Binary content of the PDF file.
+        """
+        if text_lines is None:
+            text_lines = ["Test PDF File"]
+        
+        try:
+            from reportlab.pdfgen import canvas
+            from io import BytesIO
+            
+            # Draw the lines top-down on one page, stepping y down by 20 per line
+            buffer = BytesIO()
+            c = canvas.Canvas(buffer)
+            y_pos = 750
+            for line in text_lines:
+                c.drawString(100, y_pos, line)
+                y_pos -= 20
+            c.showPage()
+            c.save()
+            pdf_content = buffer.getvalue()
+            buffer.close()
+            return pdf_content
+        except ImportError:
+            # reportlab is unavailable: fall back to a hand-written one-page PDF 1.4.
+            # NOTE: this fallback ignores text_lines and always contains "Test PDF".
+            return (
+                b"%PDF-1.4\n"
+                b"1 0 obj\n"
+                b"<<\n"
+                b"/Type /Catalog\n"
+                b"/Pages 2 0 R\n"
+                b">>\n"
+                b"endobj\n"
+                b"2 0 obj\n"
+                b"<<\n"
+                b"/Type /Pages\n"
+                b"/Kids [3 0 R]\n"
+                b"/Count 1\n"
+                b">>\n"
+                b"endobj\n"
+                b"3 0 obj\n"
+                b"<<\n"
+                b"/Type /Page\n"
+                b"/Parent 2 0 R\n"
+                b"/MediaBox [0 0 612 792]\n"
+                b"/Contents 4 0 R\n"
+                b"/Resources <<\n"
+                b"/Font <<\n"
+                b"/F1 5 0 R\n"
+                b">>\n"
+                b">>\n"
+                b">>\n"
+                b"endobj\n"
+                b"4 0 obj\n"
+                b"<<\n"
+                b"/Length 40\n"  # stream data only; the newline before endstream is not counted
+                b">>\n"
+                b"stream\n"
+                b"BT\n"
+                b"/F1 12 Tf\n"
+                b"100 700 Td\n"
+                b"(Test PDF) Tj\n"
+                b"ET\n"
+                b"endstream\n"
+                b"endobj\n"
+                b"5 0 obj\n"
+                b"<<\n"
+                b"/Type /Font\n"
+                b"/Subtype /Type1\n"
+                b"/BaseFont /Helvetica\n"
+                b">>\n"
+                b"endobj\n"
+                b"xref\n"
+                b"0 6\n"
+                b"0000000000 65535 f \n"
+                b"0000000009 00000 n \n"  # byte offset of "1 0 obj"
+                b"0000000058 00000 n \n"  # byte offset of "2 0 obj"
+                b"0000000115 00000 n \n"  # byte offset of "3 0 obj"
+                b"0000000241 00000 n \n"  # byte offset of "4 0 obj"
+                b"0000000331 00000 n \n"  # byte offset of "5 0 obj"
+                b"trailer\n"
+                b"<<\n"
+                b"/Size 6\n"
+                b"/Root 1 0 R\n"
+                b">>\n"
+                b"startxref\n"
+                b"401\n"  # byte offset of the "xref" keyword above
+                b"%%EOF\n"
+            )
+    
     def test_auto_detection_with_valid_extension(self, client: PowerRAGClient, tmp_path):
         """测试有有效扩展名时，input_type='auto' 优先使用扩展名"""
         # 创建一个 HTML 文件
@@ -612,28 +704,29 @@ def test_auto_detection_with_valid_extension(self, client: PowerRAGClient, tmp_p
             # input_type='auto' 是默认值，可以省略
         )
         
-        assert "markdown" in result
-        assert result["filename"] == "test.html"
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
     
     def test_auto_detection_without_extension_pdf(self, client: PowerRAGClient, tmp_path):
         """测试无扩展名 PDF 文件，input_type='auto' 会自动从二进制检测"""
-        # 创建一个简单的 PDF 文件头（实际测试可能需要真实的 PDF）
-        # 这里我们创建一个有 PDF 魔术数的文件
-        pdf_header = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n"
-        pdf_content = pdf_header + b"1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n"
-        
-        # 使用没有扩展名的文件名
+        # 创建一个有效的 PDF 文件
+        pdf_content = self._create_valid_pdf_content([
+            "Test PDF File",
+            "This is a test PDF document for auto-detection testing.",
+            "The file has no extension, so binary detection should be used."
+        ])
+        
+        # 使用没有扩展名的文件名，input_type='auto' 会从二进制内容检测出 PDF
         result = client.document.parse_to_md_binary(
             file_binary=pdf_content,
             filename="document_no_extension"
-            # input_type='auto' 会从二进制内容检测出 PDF
+            # input_type='auto' 是默认值，会从二进制内容检测出 PDF
         )
         
-        # 注意：这个测试可能会因为 PDF 内容不完整而失败
-        # 实际环境中需要使用真实的 PDF 文件
-        assert "filename" in result
-        assert result["filename"] == "document_no_extension"
+        # 验证解析结果
+        assert "content" in result, "Result should contain 'content' field"
+        assert len(result["content"]) > 0, "Content should not be empty (PDF should be successfully parsed)"
+        assert isinstance(result["content"], str), "Content should be a string"
     
     def test_auto_detection_without_extension_html(self, client: PowerRAGClient):
         """测试无扩展名 HTML 文件，input_type='auto' 会自动从二进制检测"""
@@ -646,15 +739,16 @@ def test_auto_detection_without_extension_html(self, client: PowerRAGClient):
             # input_type='auto' 会从二进制内容检测出 HTML
         )
         
-        assert "markdown" in result
-        assert result["filename"] == "document_without_ext"
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
     
     def test_explicit_input_type_pdf(self, client: PowerRAGClient, tmp_path):
         """测试显式指定 input_type='pdf'"""
-        # 创建一个简单的 PDF 内容
-        pdf_header = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n"
-        pdf_content = pdf_header + b"1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n"
+        # 创建一个有效的 PDF 文件
+        pdf_content = self._create_valid_pdf_content([
+            "Test PDF File",
+            "This is a test PDF document for PowerRAG SDK testing."
+        ])
         
         # 显式指定为 PDF 类型，即使文件名没有扩展名
         result = client.document.parse_to_md_binary(
@@ -663,7 +757,10 @@ def test_explicit_input_type_pdf(self, client: PowerRAGClient, tmp_path):
             input_type="pdf"  # 显式指定类型
         )
         
-        assert "filename" in result
+        # 验证解析结果
+        assert "content" in result, "Result should contain 'content' field"
+        assert len(result["content"]) > 0, "Content should not be empty"
+        assert isinstance(result["content"], str), "Content should be a string"
     
     def test_explicit_input_type_html(self, client: PowerRAGClient):
         """测试显式指定 input_type='html'"""
@@ -676,8 +773,8 @@ def test_explicit_input_type_html(self, client: PowerRAGClient):
             input_type="html"  # 显式指定类型
         )
         
-        assert "markdown" in result
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
     
     def test_parse_to_md_upload_with_auto_detection(self, client: PowerRAGClient, tmp_path):
         """测试 parse_to_md_upload 方法的自动检测功能"""
@@ -689,9 +786,8 @@ def test_parse_to_md_upload_with_auto_detection(self, client: PowerRAGClient, tm
         # 使用默认的 input_type='auto'
         result = client.document.parse_to_md_upload(str(html_file))
         
-        assert "markdown" in result
-        assert "filename" in result
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
     
     def test_parse_to_md_upload_with_explicit_type(self, client: PowerRAGClient, tmp_path):
         """测试 parse_to_md_upload 显式指定类型"""
@@ -705,8 +801,8 @@ def test_parse_to_md_upload_with_explicit_type(self, client: PowerRAGClient, tmp
             input_type="html"
         )
         
-        assert "markdown" in result
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
     
     def test_auto_detection_priority_extension_over_binary(self, client: PowerRAGClient, tmp_path):
         """测试 input_type='auto' 优先使用扩展名而非二进制检测"""
@@ -725,10 +821,9 @@ def test_auto_detection_priority_extension_over_binary(self, client: PowerRAGCli
             # input_type='auto' 默认值
         )
         
-        assert "markdown" in result
-        assert result["filename"] == "priority_test.html"
+        assert "content" in result
         # 验证确实解析成功（说明使用了正确的类型）
-        assert "Priority Test" in result["markdown"] or result["markdown_length"] > 0
+        assert "Priority Test" in result["content"] or len(result["content"]) > 0
     
     def test_auto_detection_fallback_to_binary(self, client: PowerRAGClient):
         """测试扩展名不支持时，fallback 到二进制检测"""
@@ -742,8 +837,8 @@ def test_auto_detection_fallback_to_binary(self, client: PowerRAGClient):
         )
         
         # 应该能够通过二进制检测识别为 HTML
-        assert "markdown" in result
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
     
     def test_config_with_input_type(self, client: PowerRAGClient, tmp_path):
         """测试 config 中包含 input_type 参数"""
@@ -765,8 +860,8 @@ def test_config_with_input_type(self, client: PowerRAGClient, tmp_path):
             input_type="html"
         )
         
-        assert "markdown" in result
-        assert result["markdown_length"] > 0
+        assert "content" in result
+        assert len(result["content"]) > 0
 
 
 class TestDocumentFileUrl:
@@ -793,8 +888,8 @@ def test_parse_from_url_basic(self, client: PowerRAGClient):
         assert response.status_code == 200
         result = response.json()
         assert result["code"] == 0
-        assert "markdown" in result["data"]
-        assert result["data"]["markdown_length"] > 0
+        assert "content" in result["data"]
+        assert len(result["data"]["content"]) > 0
     
     def test_parse_from_url_with_filename(self, client: PowerRAGClient):
         """测试从URL下载并指定文件名"""
@@ -819,7 +914,6 @@ def test_parse_from_url_with_filename(self, client: PowerRAGClient):
         assert response.status_code == 200
         result = response.json()
         assert result["code"] == 0
-        assert result["data"]["filename"] == custom_filename
     
     def test_parse_from_url_with_auto_detection(self, client: PowerRAGClient):
         """测试从URL下载，使用 input_type='auto' 自动检测"""
@@ -842,7 +936,7 @@ def test_parse_from_url_with_auto_detection(self, client: PowerRAGClient):
         assert response.status_code == 200
         result = response.json()
         assert result["code"] == 0
-        assert "markdown" in result["data"]
+        assert "content" in result["data"]
     
     def test_parse_from_invalid_url(self, client: PowerRAGClient):
         """测试无效URL应返回错误"""
@@ -935,8 +1029,7 @@ def test_parse_from_url_with_config(self, client: PowerRAGClient):
         assert response.status_code == 200
         result = response.json()
         assert result["code"] == 0
-        assert result["data"]["filename"] == "complete_config.html"
-        assert "markdown" in result["data"]
+        assert "content" in result["data"]
     
     def test_parse_from_url_empty_file(self, client: PowerRAGClient):
         """测试从URL下载空文件应返回错误"""
diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py
index 6a3dd6b1d..b7d7f3221 100644
--- a/powerrag/server/routes/powerrag_routes.py
+++ b/powerrag/server/routes/powerrag_routes.py
@@ -17,15 +17,20 @@
 """PowerRAG Unified API Routes"""
 
 import os
+import json
 import logging
+from pathlib import Path
+from urllib.parse import urlparse
 from quart import Blueprint, request, jsonify, Response
 from powerrag.server.services.parse_service import PowerRAGParseService
 from powerrag.server.services.convert_service import PowerRAGConvertService
 from powerrag.server.services.split_service import PowerRAGSplitService
 from powerrag.server.services.extract_service import PowerRAGExtractService
 from powerrag.utils.api_utils import get_data_error_result
+from powerrag.utils.file_utils import detect_file_type
 from api.utils.api_utils import apikey_required
 import langextract as lx
+import requests
 
 # Import RAGFlow services for task queue integration
 from api.db.services.document_service import DocumentService
@@ -44,6 +49,114 @@
 gotenberg_config = get_base_config("gotenberg", {})
 GOTENBERG_URL = gotenberg_config.get("url", os.environ.get("GOTENBERG_URL", "http://localhost:3000"))
 
+# File download timeout settings (in seconds)
+DEFAULT_DOWNLOAD_TIMEOUT = int(os.environ.get("FILE_DOWNLOAD_TIMEOUT", "300"))  # 5 minutes default
+DEFAULT_HEAD_REQUEST_TIMEOUT = int(os.environ.get("FILE_HEAD_REQUEST_TIMEOUT", "30"))  # 30 seconds default
+
+# File extension to format type mapping (used across multiple endpoints)
+FILE_EXTENSION_TO_FORMAT_TYPE = {
+    'pdf': 'pdf',
+    'docx': 'office', 'doc': 'office',
+    'xlsx': 'office', 'xls': 'office',
+    'pptx': 'office', 'ppt': 'office',
+    'html': 'html', 'htm': 'html',
+    'jpg': 'image', 'jpeg': 'image',
+    'png': 'image'
+}
+
+
+def download_file_with_validation(
+    file_url: str,
+    max_file_size: int,
+    download_timeout: int = DEFAULT_DOWNLOAD_TIMEOUT,
+    head_timeout: int = DEFAULT_HEAD_REQUEST_TIMEOUT
+) -> tuple[bytes, str | None]:
+    """
+    Download file from URL with size validation and intelligent download strategy.
+    
+    This function implements a two-layer defense strategy:
+    1. HEAD request pre-check: Fast rejection for oversized files (saves bandwidth)
+    2. Intelligent download: Direct or streaming based on Content-Length availability
+    
+    Args:
+        file_url: URL of the file to download
+        max_file_size: Maximum allowed file size in bytes
+        download_timeout: Timeout for GET request in seconds
+        head_timeout: Timeout for HEAD request in seconds
+        
+    Returns:
+        tuple: (binary_content, error_message)
+            - binary_content: Downloaded file content as bytes (None if error)
+            - error_message: Error message if download failed (None if success)
+            
+    Raises:
+        requests.exceptions.RequestException: For network or HTTP errors during the GET download (HEAD request failures are logged and ignored)
+    """
+    max_size_mb = max_file_size / (1024 * 1024)
+    
+    # First, make a HEAD request to check Content-Length before downloading
+    content_length_known = False
+    verified_content_length = None
+    
+    try:
+        head_response = requests.head(file_url, timeout=head_timeout, allow_redirects=True)
+        content_length = head_response.headers.get('Content-Length')
+        
+        if content_length:
+            content_length = int(content_length)
+            if content_length > max_file_size:
+                logger.warning(f"File size {content_length} bytes exceeds limit {max_file_size} bytes")
+                return None, f"File size ({content_length / (1024 * 1024):.2f}MB) exceeds maximum allowed size ({max_size_mb:.2f}MB)"
+            logger.info(f"Content-Length check passed: {content_length} bytes")
+            content_length_known = True
+            verified_content_length = content_length
+    except requests.exceptions.RequestException as e:
+        # HEAD request failed, continue with streaming download with size checks
+        logger.info(f"HEAD request failed, will use streaming download with size checks: {e}")
+    
+    # Choose download strategy based on whether Content-Length is known
+    if content_length_known:
+        # Direct download: the HEAD-reported Content-Length is within the limit (the actual body size is only re-checked after download)
+        logger.info(f"Using direct download (Content-Length verified: {verified_content_length} bytes)")
+        response = requests.get(file_url, timeout=download_timeout)
+        response.raise_for_status()
+        binary = response.content
+        
+        # Re-check the actual size after download: a mismatch with Content-Length is only logged; the file is rejected only if it exceeds max_file_size
+        actual_size = len(binary)
+        if actual_size != verified_content_length:
+            logger.warning(f"Size mismatch: Content-Length={verified_content_length}, actual={actual_size}")
+            if actual_size > max_file_size:
+                return None, f"File size ({actual_size / (1024 * 1024):.2f}MB) exceeds maximum allowed size ({max_size_mb:.2f}MB)"
+        
+        logger.info(f"Successfully downloaded {actual_size} bytes")
+        return binary, None
+    else:
+        # Streaming download with size limit enforcement
+        logger.info(f"Using streaming download (Content-Length unknown, will enforce size limit during download)")
+        response = requests.get(file_url, timeout=download_timeout, stream=True)
+        response.raise_for_status()
+        
+        # Download in chunks and enforce size limit
+        downloaded_size = 0
+        chunks = []
+        chunk_size = 8192  # 8KB chunks
+        
+        for chunk in response.iter_content(chunk_size=chunk_size):
+            if chunk:
+                downloaded_size += len(chunk)
+                
+                # Check if size limit exceeded during download
+                if downloaded_size > max_file_size:
+                    logger.warning(f"Download aborted: size exceeded {max_file_size} bytes during streaming")
+                    return None, f"File size exceeds maximum allowed size ({max_size_mb:.2f}MB). Download aborted at {downloaded_size / (1024 * 1024):.2f}MB."
+                
+                chunks.append(chunk)
+        
+        binary = b''.join(chunks)
+        logger.info(f"Successfully downloaded {downloaded_size} bytes")
+        return binary, None
+
 
 # ============================================================================
 # 文档解析接口
@@ -727,22 +840,11 @@ async def parse_to_md(tenant_id):
         service = PowerRAGParseService(gotenberg_url=gotenberg_url)
         
         # Parse document to markdown (no chunking)
-        from pathlib import Path
         file_ext = Path(doc.name).suffix.lstrip('.').lower()
         
         # Determine format type
         # Supported: PDF, Office (doc/docx/ppt/pptx), HTML, Images (jpg/png)
-        format_type_map = {
-            'pdf': 'pdf',
-            'docx': 'office', 'doc': 'office',
-            'xlsx': 'office', 'xls': 'office',
-            'pptx': 'office', 'ppt': 'office',
-            'html': 'html', 'htm': 'html',
-            'jpg': 'image', 'jpeg': 'image',
-            'png': 'image'
-        }
-        
-        format_type = format_type_map.get(file_ext)
+        format_type = FILE_EXTENSION_TO_FORMAT_TYPE.get(file_ext)
         if not format_type:
             return jsonify({
                 "code": 400,
@@ -852,15 +954,8 @@ async def parse_to_md_async(tenant_id):
             }), 404
         
         # Determine format type
-        from pathlib import Path
         file_ext = Path(doc.name).suffix.lstrip('.').lower()
-        format_type_map = {
-            'pdf': 'pdf', 'docx': 'office', 'doc': 'office',
-            'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office',
-            'html': 'html', 'htm': 'html',
-            'jpg': 'image', 'jpeg': 'image', 'png': 'image'
-        }
-        format_type = format_type_map.get(file_ext, 'pdf')
+        format_type = FILE_EXTENSION_TO_FORMAT_TYPE.get(file_ext, 'pdf')
         
         # Get task manager and service
         from powerrag.server.services.parse_to_md_task_manager import get_task_manager
@@ -974,6 +1069,8 @@ async def parse_to_md_upload(tenant_id):
     - file: File to parse (optional, required if file_url not provided)
     - file_url: URL of file to download and parse (optional, required if file not provided)
     - config: JSON string of parser config (optional)
+    - download_timeout: Timeout in seconds for file download (optional, default: 300)
+    - head_request_timeout: Timeout in seconds for HEAD request (optional, default: 30; read from the config JSON, not from a form field)
     
     Config parameters:
     - layout_recognize (str): mineru or dots_ocr (default: mineru)
@@ -984,16 +1081,16 @@ async def parse_to_md_upload(tenant_id):
     - to_page (int): End page number (default: 100000)
     - input_type (str): File type detection mode (default: 'auto'). Options:
         * 'auto': Try filename extension first, then auto-detect from binary if no extension (default)
-        * 'pdf', 'office', 'html', 'image': Explicit file type (bypass detection)
+        * Specific file extension: 'pdf', 'docx', 'doc', 'xlsx', 'xls', 'pptx', 'ppt', 'html', 'htm', 'jpg', 'jpeg', 'png' (bypass detection)
     - filename (str): Override filename (optional, useful with file_url)
+    - max_file_size (int): Maximum file size in bytes for URL downloads (optional, 
+        default: uses DOC_MAXIMUM_SIZE from settings, typically 128MB)
     
     Response JSON:
     {
         "code": 0,
         "data": {
-            "filename": "document.pdf",
-            "markdown": "# Title\n\nContent...",
-            "markdown_length": 5000,
+            "content": "# Title\n\nContent...",
             "images": {...},
             "total_images": 2
         },
@@ -1002,20 +1099,23 @@ async def parse_to_md_upload(tenant_id):
     """
     try:
         # Parse config from JSON string if provided
-        import json
         form = await request.form
         config_str = form.get('config', '{}')
         try:
             config = json.loads(config_str)
-        except json.JSONDecodeError:
+        except json.JSONDecodeError as e:
+            logger.warning(f"Invalid JSON in config parameter: {e}")
             return jsonify({
                 "code": 400,
-                "message": "Invalid JSON in config parameter"
+                "message": f"Invalid JSON in config parameter: {str(e)}"
             }), 400
         
         # Get file_url parameter
         file_url = form.get('file_url')
         
+        # Get the optional download_timeout form field (head_request_timeout is taken from config only)
+        download_timeout_str = form.get('download_timeout')
+        
         # Check if file or file_url is provided
         files = await request.files
         has_file = 'file' in files and files['file'].filename != ''
@@ -1036,16 +1136,12 @@ async def parse_to_md_upload(tenant_id):
         # Handle file upload or URL download
         if has_file:
             # Direct file upload
+            # Note: file.read() is synchronous in Quart but runs in async context.
+            # For large files, consider using streaming or async file reading in future.
             file = files['file']
             filename = file.filename
             logger.info(f"Received file upload: filename={filename}")
             
-            if not filename:
-                return jsonify({
-                    "code": 400,
-                    "message": "Filename is required"
-                }), 400
-            
             binary = file.read()
             if not binary:
                 return jsonify({
@@ -1054,22 +1150,66 @@ async def parse_to_md_upload(tenant_id):
                 }), 400
         else:
             # Download from URL
-            import requests
-            from urllib.parse import urlparse
-            from pathlib import Path
+            # Get maximum file size: config['max_file_size'] if provided, otherwise settings.DOC_MAXIMUM_SIZE
+            max_file_size = config.get('max_file_size', settings.DOC_MAXIMUM_SIZE)
+            max_size_mb = max_file_size / (1024 * 1024)
+            
+            # Get timeout settings from form parameters with fallback to config or environment defaults
+            try:
+                download_timeout = int(download_timeout_str) if download_timeout_str else config.get('download_timeout', DEFAULT_DOWNLOAD_TIMEOUT)
+            except (ValueError, TypeError):
+                download_timeout = config.get('download_timeout', DEFAULT_DOWNLOAD_TIMEOUT)
+            
+            head_timeout = config.get('head_request_timeout', DEFAULT_HEAD_REQUEST_TIMEOUT)
             
-            logger.info(f"Downloading file from URL: {file_url}")
+            logger.info(f"Downloading file from URL: {file_url} (max size: {max_size_mb:.2f}MB, timeout: {download_timeout}s)")
             
             try:
-                response = requests.get(file_url, timeout=60)
-                response.raise_for_status()
-                binary = response.content
+                # Use the download_file_with_validation function
+                binary, error_msg = download_file_with_validation(
+                    file_url=file_url,
+                    max_file_size=max_file_size,
+                    download_timeout=download_timeout,
+                    head_timeout=head_timeout
+                )
+                
+                if error_msg:
+                    # Download failed due to size limit or other validation error
+                    return jsonify({
+                        "code": 400,
+                        "message": error_msg
+                    }), 400
+                    
+            except requests.exceptions.Timeout as e:
+                logger.error(f"Timeout downloading file from URL: {file_url}. Error: {e}")
+                return jsonify({
+                    "code": 408,
+                    "message": f"Request timeout while downloading file from URL. Please try again or increase timeout."
+                }), 408
+            except requests.exceptions.ConnectionError as e:
+                logger.error(f"Connection error downloading file from URL: {file_url}. Error: {e}")
+                return jsonify({
+                    "code": 503,
+                    "message": f"Failed to connect to file URL. Please check the URL and try again."
+                }), 503
+            except requests.exceptions.HTTPError as e:
+                logger.error(f"HTTP error downloading file from URL: {file_url}. Error: {e}")
+                return jsonify({
+                    "code": 502,
+                    "message": f"HTTP error while downloading file: {str(e)}"
+                }), 502
             except requests.exceptions.RequestException as e:
-                logger.error(f"Failed to download file from URL: {file_url}. Error: {e}")
+                logger.error(f"Request error downloading file from URL: {file_url}. Error: {e}")
                 return jsonify({
                     "code": 400,
                     "message": f"Failed to download file from URL: {str(e)}"
                 }), 400
+            except Exception as e:
+                logger.error(f"Unexpected error downloading file from URL: {file_url}. Error: {e}", exc_info=True)
+                return jsonify({
+                    "code": 500,
+                    "message": f"Unexpected error while downloading file: {str(e)}"
+                }), 500
             
             if not binary:
                 return jsonify({
@@ -1096,28 +1236,17 @@ async def parse_to_md_upload(tenant_id):
         # Determine format type based on input_type
         if input_type == 'auto':
             # Auto mode: Try extension first, then binary detection
-            from pathlib import Path
             file_ext = Path(filename).suffix.lstrip('.').lower()
             
             if file_ext:
                 # Has extension, try to use it
-                format_type_map = {
-                    'pdf': 'pdf',
-                    'docx': 'office', 'doc': 'office',
-                    'xlsx': 'office', 'xls': 'office',
-                    'pptx': 'office', 'ppt': 'office',
-                    'html': 'html', 'htm': 'html',
-                    'jpg': 'image', 'jpeg': 'image',
-                    'png': 'image'
-                }
-                format_type = format_type_map.get(file_ext)
+                format_type = FILE_EXTENSION_TO_FORMAT_TYPE.get(file_ext)
                 
                 if format_type:
                     # Valid extension found
                     logger.info(f"Using filename extension: {format_type} (.{file_ext}) for file: {filename}")
                 else:
                     # Unsupported extension, try auto-detect from binary
-                    from powerrag.utils.file_utils import detect_file_type
                     format_type = detect_file_type(binary)
                     logger.info(f"Extension '{file_ext}' not supported, auto-detected from binary: {format_type} for file: {filename}")
                     
@@ -1128,7 +1257,6 @@ async def parse_to_md_upload(tenant_id):
                         }), 400
             else:
                 # No extension, auto-detect from binary content
-                from powerrag.utils.file_utils import detect_file_type
                 format_type = detect_file_type(binary)
                 logger.info(f"No extension found, auto-detected file type from binary: {format_type} for file: {filename}")
                 
@@ -1137,15 +1265,23 @@ async def parse_to_md_upload(tenant_id):
                         "code": 400,
                         "message": f"Unable to determine file type for {filename}. File has no extension and binary auto-detection failed. Please provide a file with a valid extension or specify input_type explicitly."
                     }), 400
-        elif input_type in ['pdf', 'office', 'html', 'image']:
-            # Use explicitly specified input_type
-            format_type = input_type
-            logger.info(f"Using explicit input_type: {format_type} for file: {filename}")
         else:
-            return jsonify({
-                "code": 400,
-                "message": f"Invalid input_type: {input_type}. Must be 'auto' (default), 'pdf', 'office', 'html', or 'image'."
-            }), 400
+            # input_type is a specific file extension (e.g., 'pdf', 'docx', 'html', 'jpg')
+            # Normalize to lowercase and remove leading dot if present
+            input_ext = input_type.lstrip('.').lower()
+            
+            # Map extension to format type
+            format_type = FILE_EXTENSION_TO_FORMAT_TYPE.get(input_ext)
+            
+            if format_type:
+                logger.info(f"Using explicit input_type extension: {format_type} (.{input_ext}) for file: {filename}")
+            else:
+                # Invalid extension specified
+                supported_extensions = ', '.join(sorted(set(FILE_EXTENSION_TO_FORMAT_TYPE.keys())))
+                return jsonify({
+                    "code": 400,
+                    "message": f"Invalid input_type: '{input_type}'. Must be 'auto' (default) or a specific file extension: {supported_extensions}"
+                }), 400
         
         # Create service and parse
         gotenberg_url = config.get("gotenberg_url", GOTENBERG_URL)
@@ -1163,15 +1299,25 @@ async def parse_to_md_upload(tenant_id):
         return jsonify({
             "code": 0,
             "data": {
-                "filename": filename,
-                "markdown": md_content,
-                "markdown_length": len(md_content),
+                "content": md_content,
                 "images": images,
                 "total_images": len(images)
             },
             "message": "success"
         }), 200
         
+    except json.JSONDecodeError as e:
+        logger.error(f"JSON decode error in parse_to_md_upload: {e}", exc_info=True)
+        return jsonify({
+            "code": 400,
+            "message": f"Invalid JSON in request: {str(e)}"
+        }), 400
+    except ValueError as e:
+        logger.error(f"Value error in parse_to_md_upload: {e}", exc_info=True)
+        return jsonify({
+            "code": 400,
+            "message": str(e)
+        }), 400
     except Exception as e:
         logger.error(f"Parse to markdown (upload) error: {e}", exc_info=True)
         return jsonify({
diff --git a/powerrag/server/services/parse_service.py b/powerrag/server/services/parse_service.py
index 1c4d24f09..227e779a7 100644
--- a/powerrag/server/services/parse_service.py
+++ b/powerrag/server/services/parse_service.py
@@ -166,6 +166,10 @@ def parse_file_binary(self, binary: bytes, filename: str,
             }
         """
         try:
+            # Normalize input_type: treat None as 'auto'
+            if input_type is None:
+                input_type = 'auto'
+            
             # Determine format type based on input_type parameter
             if input_type == 'auto':
                 # Auto mode: Try extension first, then binary detection
@@ -187,15 +191,27 @@ def parse_file_binary(self, binary: bytes, filename: str,
                             f"File has no extension or unsupported extension '{file_ext}', and binary auto-detection failed. "
                             f"Please provide a valid input_type explicitly."
                         )
-            elif input_type in ['pdf', 'office', 'html', 'image', 'markdown']:
-                # Use explicitly specified input_type
-                format_type = input_type
-                logger.info(f"Using explicit input_type: {format_type} for file: {filename}")
             else:
-                raise ValueError(
-                    f"Invalid input_type: {input_type}. "
-                    f"Must be 'auto', 'pdf', 'office', 'html', 'image', or 'markdown'"
-                )
+                # input_type is a specific file extension (e.g., 'pdf', 'docx', 'html', 'jpg')
+                # Normalize to lowercase and remove leading dot if present
+                input_ext = input_type.lstrip('.').lower()
+                
+                # Map extension to format type
+                format_type = self.SUPPORTED_FORMATS.get(input_ext)
+                
+                if format_type:
+                    logger.info(f"Using explicit input_type extension: {format_type} (.{input_ext}) for file: {filename}")
+                elif input_ext == 'markdown' or input_ext == 'md':
+                    # Special case for markdown files
+                    format_type = 'markdown'
+                    logger.info(f"Using explicit input_type: {format_type} for file: {filename}")
+                else:
+                    # Invalid extension specified
+                    supported_extensions = ', '.join(sorted(set(self.SUPPORTED_FORMATS.keys()) | {'md', 'markdown'}))
+                    raise ValueError(
+                        f"Invalid input_type: '{input_type}'. "
+                        f"Must be 'auto' (default) or a specific file extension: {supported_extensions}"
+                    )
             
             # Parse document to get markdown and images
             md_content, images = self._parse_to_markdown(filename, binary, format_type, config)
@@ -666,17 +682,30 @@ def _parse_to_markdown_for_task(self, doc_id: str = None, filename: str = None,
                 format_type = detect_file_type(binary)
                 logger.info(f"Auto-detected file type: {format_type} for document {doc_id}")
             elif input_type:
-                format_type = input_type
+                # input_type is a specific file extension (e.g., 'pdf', 'docx', 'html', 'jpg')
+                # Normalize to lowercase and remove leading dot if present
+                input_ext = input_type.lstrip('.').lower()
+                
+                # Map extension to format type using SUPPORTED_FORMATS
+                format_type = self.SUPPORTED_FORMATS.get(input_ext)
+                
+                if format_type:
+                    logger.info(f"Using explicit input_type extension: {format_type} (.{input_ext}) for document {doc_id}")
+                elif input_ext == 'markdown' or input_ext == 'md':
+                    # Special case for markdown files
+                    format_type = 'markdown'
+                    logger.info(f"Using explicit input_type: {format_type} for document {doc_id}")
+                else:
+                    # Invalid extension specified
+                    supported_extensions = ', '.join(sorted(set(self.SUPPORTED_FORMATS.keys()) | {'md', 'markdown'}))
+                    raise ValueError(
+                        f"Invalid input_type: '{input_type}'. "
+                        f"Must be 'auto' (default) or a specific file extension: {supported_extensions}"
+                    )
             else:
                 # Auto-detect from file extension
                 file_ext = Path(doc.name).suffix.lstrip('.').lower()
-                format_type_map = {
-                    'pdf': 'pdf', 'docx': 'office', 'doc': 'office',
-                    'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office',
-                    'html': 'html', 'htm': 'html',
-                    'jpg': 'image', 'jpeg': 'image', 'png': 'image'
-                }
-                format_type = format_type_map.get(file_ext, 'pdf')
+                format_type = self.SUPPORTED_FORMATS.get(file_ext, 'pdf')
             filename = doc.name
         
         # Case 2: Parse from direct binary (filename, binary, format_type provided)
diff --git a/powerrag/utils/file_utils.py b/powerrag/utils/file_utils.py
index 5e96988e8..7c79e527b 100644
--- a/powerrag/utils/file_utils.py
+++ b/powerrag/utils/file_utils.py
@@ -374,6 +374,8 @@ def detect_file_type(binary: bytes) -> str:
     
     # Check HTML (basic detection)
     # Try to decode as text and check for HTML markers
+    # Note: Detection is case-insensitive - works regardless of original HTML tag casing
+    # (e.g., '<HTML>', '<Html>', '<html>' are all detected)
     try:
         text_sample = binary[:1024].decode('utf-8', errors='ignore').lower()
         if '<html' in text_sample or '<!doctype html' in text_sample or '<head' in text_sample:

From 15d378b32a6358a867bfb329d44fe96deef004e9 Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Wed, 28 Jan 2026 16:22:49 +0800
Subject: [PATCH 17/19] feat(sdk): implement file splitting functionality and
 enhance documentation

---
 api/apps/sdk/powerrag_proxy.py            |  45 ++++-
 powerrag/sdk/README.md                    | 195 ++++++++++++++++++++-
 powerrag/sdk/modules/chunk_manager.py     | 168 +++++++++++++++++-
 powerrag/sdk/tests/test_chunk.py          |  57 +++++-
 powerrag/sdk/tests/test_document.py       |   5 +
 powerrag/server/routes/powerrag_routes.py | 203 +++++++++++++++++++++-
 powerrag/server/services/split_service.py | 177 ++++++++++++++++++-
 7 files changed, 834 insertions(+), 16 deletions(-)

diff --git a/api/apps/sdk/powerrag_proxy.py b/api/apps/sdk/powerrag_proxy.py
index 3bafdf867..f0fffb811 100644
--- a/api/apps/sdk/powerrag_proxy.py
+++ b/api/apps/sdk/powerrag_proxy.py
@@ -91,12 +91,19 @@ async def _forward_request(method: str, endpoint: str, tenant_id: str = None):
                 if files_dict:
                     # 保留文件名信息！重要：不能直接 dict(files_dict)
                     # 因为会丢失文件名。需要构造 httpx 期望的格式
+                    import asyncio
+                    from io import BytesIO
                     files = {}
                     for field_name, file_storage in files_dict.items():
-                        # httpx 期望格式: (filename, content, content_type)
+                        # 在线程中读取文件内容（避免阻塞事件循环）
+                        # httpx 期望文件对象或元组格式
+                        # 使用 BytesIO 将 bytes 包装成文件对象
+                        file_content = await asyncio.to_thread(file_storage.read)
+                        # httpx 期望格式: (filename, file_object, content_type) 或 (filename, file_object)
+                        file_obj = BytesIO(file_content)
                         files[field_name] = (
                             file_storage.filename,
-                            file_storage.read(),
+                            file_obj,
                             file_storage.content_type or 'application/octet-stream'
                         )
             except Exception:
@@ -592,3 +599,37 @@ async def parse_to_md_upload_proxy(tenant_id):
     """
     return await _forward_request("POST", "/parse_to_md/upload", tenant_id)
 
+
+@manager.route("/powerrag/split/file", methods=["POST"])  # noqa: F821
+@token_required
+async def split_file_proxy(tenant_id):
+    """
+    代理 split/file API 请求到 PowerRAG server
+    
+    支持所有ParserType方法对文件进行切片（使用文件路径或URL）
+    
+    ---
+    tags:
+      - PowerRAG Proxy
+    security:
+      - ApiKeyAuth: []
+    """
+    return await _forward_request("POST", "/split/file", tenant_id)
+
+
+@manager.route("/powerrag/split/file/upload", methods=["POST"])  # noqa: F821
+@token_required
+async def split_file_upload_proxy(tenant_id):
+    """
+    代理 split/file/upload API 请求到 PowerRAG server
+    
+    上传文件并切片，支持所有ParserType方法
+    
+    ---
+    tags:
+      - PowerRAG Proxy
+    security:
+      - ApiKeyAuth: []
+    """
+    return await _forward_request("POST", "/split/file/upload", tenant_id)
+
diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md
index f59868396..6f4e37a4b 100644
--- a/powerrag/sdk/README.md
+++ b/powerrag/sdk/README.md
@@ -290,14 +290,123 @@ status = client.extraction.get_struct_extract_status(task['task_id'])
 
 ### 文本切片
 
-无需上传文档即可对文本进行切片：
+无需上传文档即可对文本进行切片。
+
+**注意**: `split_text` 方法仅支持以下三种解析器：
+- `title`: 基于标题的切片
+- `regex`: 基于正则表达式的切片
+- `smart`: 智能切片
+
+对于其他解析器（如 `naive`, `book`, `qa` 等），请使用 `split_file` 或 `split_file_upload` 方法。
 
 ```python
+# 使用 title 解析器
 result = client.chunk.split_text(
     text="# Title\n\nContent...",
     parser_id="title",
     config={"chunk_token_num": 512}
 )
+
+# 使用 regex 解析器
+result = client.chunk.split_text(
+    text="Section 1\n\nContent...",
+    parser_id="regex",
+    config={
+        "chunk_token_num": 256,
+        "regex_pattern": r"Section \d+"
+    }
+)
+
+# 使用 smart 解析器
+result = client.chunk.split_text(
+    text="Long text content...",
+    parser_id="smart",
+    config={"chunk_token_num": 512}
+)
+
+print(f"Total chunks: {result['total_chunks']}")
+for chunk in result['chunks']:
+    print(chunk)
+```
+
+### 文件切片
+
+文件切片支持所有 ParserType 方法，提供三种使用方式：
+
+#### 方式 1: 使用本地文件路径
+
+```python
+result = client.chunk.split_file(
+    file_path="/path/to/document.pdf",
+    parser_id="book",  # 支持所有 ParserType
+    config={
+        "chunk_token_num": 512,
+        "delimiter": "\n。.；;！!？？",
+        "lang": "Chinese",
+        "from_page": 0,
+        "to_page": 100000
+    }
+)
+```
+
+#### 方式 2: 使用文件 URL
+
+```python
+result = client.chunk.split_file(
+    file_url="https://example.com/doc.pdf",
+    parser_id="naive",
+    config={
+        "chunk_token_num": 256,
+        "max_file_size": 128 * 1024 * 1024,  # 128MB
+        "download_timeout": 300,  # 5分钟
+        "head_request_timeout": 30  # 30秒
+    }
+)
+```
+
+#### 方式 3: 上传文件并切片
+
+```python
+result = client.chunk.split_file_upload(
+    file_path="/path/to/document.pdf",
+    parser_id="book",
+    config={
+        "chunk_token_num": 512,
+        "delimiter": "\n。.；;！!？？",
+        "lang": "Chinese"
+    }
+)
+
+print(f"Total chunks: {result['total_chunks']}")
+print(f"Filename: {result['filename']}")
+for chunk in result['chunks']:
+    print(chunk)
+```
+
+**支持的 ParserType 方法：**
+- 基础方法: `naive`, `title`, `regex`, `smart`
+- 专业方法: `qa`, `book`, `laws`, `paper`, `manual`, `presentation`
+- 特殊格式: `table`, `resume`, `picture`, `one`, `email`
+- 高级方法: `knowledge_graph`
+
+**配置参数说明：**
+- `chunk_token_num` (int): 目标分块大小（tokens），默认 512
+- `delimiter` (str): 分隔符字符串，默认 `"\n。.；;！!？？"`
+- `lang` (str): 语言，默认 `"Chinese"`
+- `from_page` (int): PDF 起始页码，默认 0
+- `to_page` (int): PDF 结束页码，默认 100000
+- `max_file_size` (int): URL 下载的最大文件大小（字节），仅用于 `file_url` 方式
+- `download_timeout` (int): 下载超时时间（秒），仅用于 `file_url` 方式
+- `head_request_timeout` (int): HEAD 请求超时时间（秒），仅用于 `file_url` 方式
+
+**返回值结构：**
+```python
+{
+    "parser_id": "book",
+    "chunks": ["chunk1", "chunk2", ...],  # 字符串列表
+    "total_chunks": 10,
+    "filename": "document.pdf"
+}
 ```
 
 ## 核心模块
@@ -558,15 +667,49 @@ client.chunk.delete(kb_id, doc_id, [chunk_id])
 # 删除文档的所有切片
 client.chunk.delete(kb_id, doc_id, None)
 
-# 文本切片（无需上传文档）
+# 文本切片（仅支持 title, regex, smart）
 result = client.chunk.split_text(
     text="# Title\n\nLong text to be chunked...",
-    parser_id="title",  # 解析器ID
-    config={"chunk_token_num": 512}  # 自定义配置
+    parser_id="title",  # 仅支持: title, regex, smart
+    config={"chunk_token_num": 512}
 )
 print(f"Total chunks: {result['total_chunks']}")
 for chunk in result['chunks']:
-    print(chunk['content'])
+    print(chunk)
+
+# 文件切片（支持所有ParserType方法）
+# 方式1: 使用本地文件路径
+result = client.chunk.split_file(
+    file_path="/path/to/document.pdf",
+    parser_id="book",  # 支持所有 ParserType
+    config={
+        "chunk_token_num": 512,
+        "delimiter": "\n。.；;！!？？",
+        "lang": "Chinese"
+    }
+)
+
+# 方式2: 使用文件URL
+result = client.chunk.split_file(
+    file_url="https://example.com/doc.pdf",
+    parser_id="naive",
+    config={
+        "chunk_token_num": 256,
+        "max_file_size": 128 * 1024 * 1024,  # 128MB
+        "download_timeout": 300
+    }
+)
+
+# 方式3: 上传文件并切片
+result = client.chunk.split_file_upload(
+    file_path="/path/to/document.pdf",
+    parser_id="book",
+    config={"chunk_token_num": 512}
+)
+print(f"Total chunks: {result['total_chunks']}")
+print(f"Filename: {result['filename']}")
+for chunk in result['chunks']:
+    print(chunk)
 ```
 
 ### 4. 信息抽取 (Extraction)
@@ -894,6 +1037,7 @@ SDK 包含完整的测试套件，覆盖所有功能模块。
 # 设置环境变量
 export HOST_ADDRESS="http://127.0.0.1:9380"
 export POWERRAG_API_KEY="your-api-key"
+export PYTHONPATH="$(pwd)"
 
 # 运行测试
 pytest powerrag/sdk/tests/
@@ -1203,6 +1347,47 @@ for result in results:
         # 重新解析或删除
 ```
 
+### Q: 文本切片和文件切片有什么区别？应该使用哪个？
+
+A: 
+- **`split_text`**: 仅支持 `title`, `regex`, `smart` 三种解析器，适用于纯文本内容（Markdown格式）
+- **`split_file`**: 支持所有 ParserType 方法，适用于文件（通过路径或URL）
+- **`split_file_upload`**: 支持所有 ParserType 方法，适用于文件上传
+
+**使用建议：**
+- 如果只有文本内容且需要使用 `title`/`regex`/`smart`，使用 `split_text`
+- 如果有文件且需要使用其他解析器（如 `book`, `qa`, `naive` 等），使用 `split_file` 或 `split_file_upload`
+- 如果文件在本地，使用 `split_file(file_path=...)` 或 `split_file_upload`
+- 如果文件在远程URL，使用 `split_file(file_url=...)`
+
+**示例：**
+```python
+# 文本切片（仅支持 title, regex, smart）
+result = client.chunk.split_text(
+    text="# Title\n\nContent...",
+    parser_id="title"
+)
+
+# 文件切片（支持所有解析器）
+# 本地文件
+result = client.chunk.split_file(
+    file_path="/path/to/doc.pdf",
+    parser_id="book"  # 可以使用任何解析器
+)
+
+# 远程文件
+result = client.chunk.split_file(
+    file_url="https://example.com/doc.pdf",
+    parser_id="naive"
+)
+
+# 文件上传
+result = client.chunk.split_file_upload(
+    file_path="/path/to/doc.pdf",
+    parser_id="qa"
+)
+```
+
 ### Q: 如何解析无扩展名的文件？
 
 A: 使用 `parse_to_md_binary` 方法并使用 `input_type='auto'`（默认值）：
diff --git a/powerrag/sdk/modules/chunk_manager.py b/powerrag/sdk/modules/chunk_manager.py
index 61acccd1b..84bbdf643 100644
--- a/powerrag/sdk/modules/chunk_manager.py
+++ b/powerrag/sdk/modules/chunk_manager.py
@@ -14,7 +14,8 @@
 #  limitations under the License.
 #
 
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Union
+from pathlib import Path
 from .chunk import ChunkInfo
 
 
@@ -268,4 +269,169 @@ def split_text(
             raise Exception(res_json.get("message", "Split text failed"))
         
         return res_json.get("data", {})
+    
+    def split_file(
+        self,
+        file_path: Optional[str] = None,
+        file_url: Optional[str] = None,
+        parser_id: str = "naive",
+        config: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """
+        文件切片（支持所有ParserType方法）
+        
+        支持三种方式：
+        1. 本地文件路径：file_path
+        2. 文件URL：file_url
+        3. 文件上传：使用 split_file_upload 方法
+        
+        Args:
+            file_path: 本地文件路径（可选，与file_url二选一）
+            file_url: 文件URL（可选，与file_path二选一）
+            parser_id: 解析器ID，支持所有ParserType：
+                - naive, qa, book, laws, paper, manual, presentation
+                - table, resume, picture, one, email, tag
+                - knowledge_graph, title, regex, smart
+                默认"naive"
+            config: 解析配置（可选）
+                - chunk_token_num: 目标分块大小（tokens），默认512
+                - delimiter: 分隔符字符串，默认"\n。.；;！!？？"
+                - lang: 语言，默认"Chinese"
+                - from_page: 起始页码，默认0
+                - to_page: 结束页码，默认100000
+                - max_file_size: URL下载的最大文件大小（字节）
+                - download_timeout: 下载超时时间（秒）
+                - head_request_timeout: HEAD请求超时时间（秒）
+        
+        Returns:
+            切片结果，包含chunks列表、total_chunks数量和filename
+        
+        Raises:
+            Exception: API调用失败
+            ValueError: file_path和file_url都未提供
+        
+        Example:
+            ```python
+            # 使用本地文件路径
+            result = client.chunk.split_file(
+                file_path="/path/to/document.pdf",
+                parser_id="book",
+                config={"chunk_token_num": 512}
+            )
+            
+            # 使用文件URL
+            result = client.chunk.split_file(
+                file_url="https://example.com/doc.pdf",
+                parser_id="naive",
+                config={"chunk_token_num": 256}
+            )
+            ```
+        """
+        if not file_path and not file_url:
+            raise ValueError("Either file_path or file_url must be provided")
+        
+        payload = {
+            "parser_id": parser_id,
+        }
+        
+        if file_path:
+            payload["file_path"] = file_path
+        if file_url:
+            payload["file_url"] = file_url
+        
+        if config:
+            payload["config"] = config
+        
+        url = "/powerrag/split/file"
+        res = self.client.post(url, json=payload)
+        
+        # 检查响应状态码
+        if res.status_code != 200:
+            try:
+                error_json = res.json()
+                error_msg = error_json.get("message", f"HTTP {res.status_code}")
+            except Exception:
+                error_msg = f"HTTP {res.status_code}: {res.text[:200]}"
+            raise Exception(error_msg)
+        
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Split file failed"))
+        
+        return res_json.get("data", {})
+    
+    def split_file_upload(
+        self,
+        file_path: Union[str, Path],
+        parser_id: str = "naive",
+        config: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """
+        上传文件并切片（支持所有ParserType方法）
+        
+        Args:
+            file_path: 本地文件路径
+            parser_id: 解析器ID，支持所有ParserType，默认"naive"
+            config: 解析配置（可选）
+                - chunk_token_num: 目标分块大小（tokens），默认512
+                - delimiter: 分隔符字符串，默认"\n。.；;！!？？"
+                - lang: 语言，默认"Chinese"
+                - from_page: 起始页码，默认0
+                - to_page: 结束页码，默认100000
+        
+        Returns:
+            切片结果，包含chunks列表、total_chunks数量和filename
+        
+        Raises:
+            Exception: API调用失败
+            FileNotFoundError: 文件不存在
+        
+        Example:
+            ```python
+            result = client.chunk.split_file_upload(
+                file_path="/path/to/document.pdf",
+                parser_id="book",
+                config={"chunk_token_num": 512}
+            )
+            print(f"Total chunks: {result['total_chunks']}")
+            for chunk in result['chunks']:
+                print(chunk)
+            ```
+        """
+        path = Path(file_path)
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+        
+        # 准备文件
+        with open(path, "rb") as f:
+            files = [("file", (path.name, f.read()))]
+        
+        # 准备表单数据
+        form_data = {
+            "parser_id": parser_id,
+        }
+        
+        if config:
+            import json
+            form_data["config"] = json.dumps(config)
+        
+        url = "/powerrag/split/file/upload"
+        res = self.client.post(url, json=None, files=files, data=form_data)
+        
+        # 检查响应状态码
+        if res.status_code != 200:
+            try:
+                error_json = res.json()
+                error_msg = error_json.get("message", f"HTTP {res.status_code}")
+            except Exception:
+                error_msg = f"HTTP {res.status_code}: {res.text[:200]}"
+            raise Exception(error_msg)
+        
+        res_json = res.json()
+        
+        if res_json.get("code") != 0:
+            raise Exception(res_json.get("message", "Split file upload failed"))
+        
+        return res_json.get("data", {})
 
diff --git a/powerrag/sdk/tests/test_chunk.py b/powerrag/sdk/tests/test_chunk.py
index 940460201..84f649d6f 100644
--- a/powerrag/sdk/tests/test_chunk.py
+++ b/powerrag/sdk/tests/test_chunk.py
@@ -156,8 +156,63 @@ def test_split_text_with_config(self, client: PowerRAGClient):
         text = "This is a test document with multiple paragraphs."
         result = client.chunk.split_text(
             text=text,
-            parser_id="naive",
+            parser_id="regex",
             config={"chunk_token_num": 128}
         )
         assert "chunks" in result or "total_chunks" in result
+    
+    def test_split_text_unsupported_parser(self, client: PowerRAGClient):
+        """测试不支持的parser_id应该抛出错误"""
+        text = "Test text"
+        # 使用一个真正不支持的 parser_id（如 "paper"）
+        # 注意：naive 实际上是被支持的（通过 RAGFlow 代理）
+        with pytest.raises(Exception) as exc_info:
+            client.chunk.split_text(
+                text=text,
+                parser_id="paper",  # paper 不支持纯文本切片，需要文件处理
+                config={"chunk_token_num": 128}
+            )
+        assert "not supported" in str(exc_info.value).lower() or "unknown" in str(exc_info.value).lower() or "failed" in str(exc_info.value).lower()
+
+
+class TestChunkSplitFile:
+    """测试文件切片"""
+    
+    def test_split_file_upload(self, client: PowerRAGClient, test_file_path: str):
+        """测试上传文件并切片"""
+        result = client.chunk.split_file_upload(
+            file_path=test_file_path,
+            parser_id="naive",
+            config={"chunk_token_num": 512}
+        )
+        assert "chunks" in result
+        assert "total_chunks" in result
+        assert "filename" in result
+        assert isinstance(result["chunks"], list)
+        assert result["total_chunks"] >= 0
+    
+    def test_split_file_upload_with_different_parsers(self, client: PowerRAGClient, test_file_path: str):
+        """测试使用不同parser_id的文件切片"""
+        parsers = ["naive", "book", "title"]
+        for parser_id in parsers:
+            try:
+                result = client.chunk.split_file_upload(
+                    file_path=test_file_path,
+                    parser_id=parser_id,
+                    config={"chunk_token_num": 256}
+                )
+                assert "chunks" in result
+                assert result["total_chunks"] >= 0
+            except Exception as e:
+                # 某些parser可能不支持特定文件类型，这是正常的
+                if "not supported" not in str(e).lower():
+                    raise
+    
+    def test_split_file_upload_nonexistent_file(self, client: PowerRAGClient):
+        """测试不存在的文件应该抛出错误"""
+        with pytest.raises(FileNotFoundError):
+            client.chunk.split_file_upload(
+                file_path="/nonexistent/file.pdf",
+                parser_id="naive"
+            )
 
diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py
index 173b1ddde..947c385d4 100644
--- a/powerrag/sdk/tests/test_document.py
+++ b/powerrag/sdk/tests/test_document.py
@@ -217,13 +217,18 @@ def test_parse_to_chunk_async(self, client: PowerRAGClient, kb_id: str, test_fil
     
     def test_cancel_parse(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
         """测试取消解析"""
+        import time
         uploaded_docs = client.document.upload(kb_id, test_file_path)
         doc_id = uploaded_docs[0]["id"]
         
         try:
             client.document.parse_to_chunk(kb_id, [doc_id], wait=False)
+            # Wait a bit for parsing to start
+            time.sleep(0.5)
             client.document.cancel_parse(kb_id, [doc_id])
             
+            # Wait a bit for status update
+            time.sleep(0.5)
             doc = client.document.get(kb_id, doc_id)
             assert doc["run"] in ["CANCEL", "UNSTART"]
         finally:
diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py
index b7d7f3221..03c7d08fb 100644
--- a/powerrag/server/routes/powerrag_routes.py
+++ b/powerrag/server/routes/powerrag_routes.py
@@ -1187,11 +1187,13 @@ async def parse_to_md_upload(tenant_id):
                     "message": f"Request timeout while downloading file from URL. Please try again or increase timeout."
                 }), 408
             except requests.exceptions.ConnectionError as e:
+                # ConnectionError includes DNS resolution failures, invalid URLs, etc.
+                # Return 400 (Bad Request) instead of 503 (Service Unavailable) for invalid URLs
                 logger.error(f"Connection error downloading file from URL: {file_url}. Error: {e}")
                 return jsonify({
-                    "code": 503,
-                    "message": f"Failed to connect to file URL. Please check the URL and try again."
-                }), 503
+                    "code": 400,
+                    "message": f"Failed to download file from URL: {str(e)}"
+                }), 400
             except requests.exceptions.HTTPError as e:
                 logger.error(f"HTTP error downloading file from URL: {file_url}. Error: {e}")
                 return jsonify({
@@ -1402,6 +1404,201 @@ async def split_text(tenant_id):
         }), 500
 
 
+@powerrag_bp.route("/split/file", methods=["POST"])
+@apikey_required
+async def split_file(tenant_id):
+    """
+    Split file into chunks using rag/app chunking methods
+    
+    Supports all ParserType methods: naive, qa, book, laws, paper, manual, 
+    presentation, table, resume, picture, one, audio, email, tag, knowledge_graph,
+    title, regex, smart
+    
+    Request JSON:
+    {
+        "file_path": "/path/to/document.pdf",  # or use file_url
+        "file_url": "https://example.com/doc.pdf",  # optional
+        "parser_id": "naive",
+        "config": {
+            "chunk_token_num": 512,
+            "delimiter": "\n。.；;！!？？",
+            "lang": "Chinese",
+            "from_page": 0,
+            "to_page": 100000
+        }
+    }
+    
+    Response:
+    {
+        "code": 0,
+        "data": {
+            "parser_id": "naive",
+            "chunks": ["chunk1", "chunk2", ...],
+            "total_chunks": 10,
+            "filename": "document.pdf"
+        }
+    }
+    """
+    try:
+        data = await request.get_json()
+        
+        if not data:
+            return jsonify({
+                "code": 400,
+                "message": "No JSON data provided"
+            }), 400
+        
+        file_path = data.get("file_path")
+        file_url = data.get("file_url")
+        parser_id = data.get("parser_id", "naive")
+        config = data.get("config", {})
+        
+        if not file_path and not file_url:
+            return jsonify({
+                "code": 400,
+                "message": "Either file_path or file_url is required"
+            }), 400
+        
+        # Handle file URL download
+        if file_url:
+            max_file_size = config.get('max_file_size', settings.DOC_MAXIMUM_SIZE)
+            download_timeout = config.get('download_timeout', DEFAULT_DOWNLOAD_TIMEOUT)
+            head_timeout = config.get('head_request_timeout', DEFAULT_HEAD_REQUEST_TIMEOUT)
+            
+            logger.info(f"Downloading file from URL: {file_url}")
+            try:
+                binary, error_msg = download_file_with_validation(
+                    file_url, max_file_size, download_timeout, head_timeout
+                )
+                if error_msg:
+                    return jsonify({
+                        "code": 400,
+                        "message": f"Failed to download file: {error_msg}"
+                    }), 400
+                
+                # Extract filename from URL or use provided filename
+                filename = config.get('filename') or file_url.split('/')[-1].split('?')[0]
+                if not filename:
+                    filename = "downloaded_file"
+            except Exception as e:
+                logger.error(f"Error downloading file from URL: {e}", exc_info=True)
+                return jsonify({
+                    "code": 500,
+                    "message": f"Failed to download file: {str(e)}"
+                }), 500
+        else:
+            # Use file path
+            filename = file_path
+            binary = None
+        
+        service = PowerRAGSplitService()
+        result = service.split_file(filename=filename, binary=binary, parser_id=parser_id, config=config)
+        
+        return jsonify({
+            "code": 0,
+            "data": result,
+            "message": "success"
+        }), 200
+        
+    except Exception as e:
+        logger.error(f"Split file error: {e}", exc_info=True)
+        return jsonify({
+            "code": 500,
+            "message": str(e)
+        }), 500
+
+
+@powerrag_bp.route("/split/file/upload", methods=["POST"])
+@apikey_required
+async def split_file_upload(tenant_id):
+    """
+    Split uploaded file into chunks using rag/app chunking methods
+    
+    Supports all ParserType methods: naive, qa, book, laws, paper, manual, 
+    presentation, table, resume, picture, one, audio, email, tag, knowledge_graph,
+    title, regex, smart
+    
+    Request (multipart/form-data):
+    - file: File to split (required)
+    - parser_id: Parser ID (optional, default: "naive")
+    - config: JSON string of parser config (optional)
+    
+    Config parameters:
+    - chunk_token_num (int): Target chunk size in tokens (default: 512)
+    - delimiter (str): Delimiter string for splitting (default: "\n。.；;！!？？")
+    - lang (str): Language (default: "Chinese")
+    - from_page (int): Start page number (default: 0)
+    - to_page (int): End page number (default: 100000)
+    
+    Response:
+    {
+        "code": 0,
+        "data": {
+            "parser_id": "naive",
+            "chunks": ["chunk1", "chunk2", ...],
+            "total_chunks": 10,
+            "filename": "document.pdf"
+        }
+    }
+    """
+    try:
+        # Check if file is present
+        files = await request.files
+        if 'file' not in files:
+            return jsonify({
+                "code": 400,
+                "message": "No file provided"
+            }), 400
+        
+        file = files['file']
+        if file.filename == '':
+            return jsonify({
+                "code": 400,
+                "message": "No file selected"
+            }), 400
+        
+        # Get parameters
+        form = await request.form
+        parser_id = form.get('parser_id', 'naive')
+        
+        # Parse config from JSON string if provided
+        import json
+        config_str = form.get('config', '{}')
+        try:
+            config = json.loads(config_str)
+        except json.JSONDecodeError:
+            return jsonify({
+                "code": 400,
+                "message": "Invalid JSON in config parameter"
+            }), 400
+        
+        filename = file.filename
+        
+        # Read file binary (file.read() is synchronous in Quart)
+        binary = file.read()
+        if not binary:
+            return jsonify({
+                "code": 400,
+                "message": "File is empty"
+            }), 400
+        
+        service = PowerRAGSplitService()
+        result = service.split_file(filename=filename, binary=binary, parser_id=parser_id, config=config)
+        
+        return jsonify({
+            "code": 0,
+            "data": result,
+            "message": "success"
+        }), 200
+        
+    except Exception as e:
+        logger.error(f"Split file upload error: {e}", exc_info=True)
+        return jsonify({
+            "code": 500,
+            "message": str(e)
+        }), 500
+
+
 # ============================================================================
 # 信息抽取接口
 # ============================================================================
diff --git a/powerrag/server/services/split_service.py b/powerrag/server/services/split_service.py
index e4b535938..76b8b6ef4 100644
--- a/powerrag/server/services/split_service.py
+++ b/powerrag/server/services/split_service.py
@@ -50,12 +50,13 @@ class PowerRAGSplitService:
     def __init__(self):
         # 初始化时动态导入chunker，避免循环导入
         self._init_chunker_factory()
+        self._init_file_chunker_factory()
 
     def _init_chunker_factory(self):
         """动态导入chunker模块，避免循环导入"""
         global CHUNKER_FACTORY
         if not CHUNKER_FACTORY:
-            # 直接引用同一模块中定义的函数
+            # PowerRAG 专门的 chunker（仅支持文本切分）
             CHUNKER_FACTORY.update({
                 ParserType.TITLE.value: title_based_chunking,  # PowerRAG Title Chunker
                 ParserType.REGEX.value: regex_based_chunking,  # PowerRAG regex Chunker
@@ -158,9 +159,14 @@ def dummy(prog=None, msg=""):
                 # Smart chunking returns a list of chunks directly
                 chunks = chunker(text, parser_config=parser_config)
             else:
-                # Use config as-is for other parsers
-                chunks=[]
-                raise ValueError(f"Chunker not found for parser_id: {parser_id}")
+                # Other parser types (naive, qa, book, laws, etc.) are not supported for text splitting
+                # Use split_file method instead for file-based chunking
+                raise ValueError(
+                    f"Parser '{parser_id}' is not supported for text splitting. "
+                    f"Supported parsers for text splitting are: {ParserType.TITLE.value}, "
+                    f"{ParserType.REGEX.value}, {ParserType.SMART.value}. "
+                    f"For other parser types, please use split_file() method instead."
+                )
 
             # Ensure all chunks are strings and handle encoding
             processed_chunks = []
@@ -196,6 +202,169 @@ def dummy(prog=None, msg=""):
             logger.error(f"Error splitting text with parser '{parser_id}': {e}", exc_info=True)
             raise
 
+    def _init_file_chunker_factory(self):
+        """初始化文件 chunker factory，映射 ParserType 到 rag/app 模块"""
+        # 延迟导入，避免循环导入
+        if not hasattr(self, '_file_chunker_factory'):
+            self._file_chunker_factory = {}
+            try:
+                # 导入 rag/app 模块
+                from rag.app import (
+                    laws, paper, presentation, manual, qa, table, book, resume,
+                    picture, naive, one, audio, email, tag
+                )
+                # 导入 powerrag/app 模块
+                from powerrag.app import title as powerrag_title, regex as powerrag_regex, smart as powerrag_smart
+                
+                # 映射 ParserType 到对应的 chunk 模块
+                self._file_chunker_factory = {
+                    ParserType.NAIVE.value: naive,
+                    ParserType.PAPER.value: paper,
+                    ParserType.BOOK.value: book,
+                    ParserType.PRESENTATION.value: presentation,
+                    ParserType.MANUAL.value: manual,
+                    ParserType.LAWS.value: laws,
+                    ParserType.QA.value: qa,
+                    ParserType.TABLE.value: table,
+                    ParserType.RESUME.value: resume,
+                    ParserType.PICTURE.value: picture,
+                    ParserType.ONE.value: one,
+                    ParserType.EMAIL.value: email,
+                    ParserType.KG.value: naive,  # knowledge_graph 使用 naive
+                    ParserType.TAG.value: tag,
+                    ParserType.TITLE.value: powerrag_title,  # PowerRAG Title Parser
+                    ParserType.REGEX.value: powerrag_regex,  # PowerRAG Regex Parser
+                    ParserType.SMART.value: powerrag_smart,  # PowerRAG Smart Parser
+                }
+            except ImportError as e:
+                logger.warning(f"Failed to import some rag/app modules: {e}")
+                # 如果导入失败，至少提供基本的 naive chunker
+                try:
+                    from rag.app import naive
+                    self._file_chunker_factory = {ParserType.NAIVE.value: naive}
+                except ImportError:
+                    logger.error("Failed to import naive chunker, file splitting will not work")
+                    self._file_chunker_factory = {}
+
+    def split_file(self, filename: str = None, binary: bytes = None, parser_id: str = "naive", 
+                   config: Dict[str, Any] = None) -> Dict[str, Any]:
+        """
+        Split file into chunks using rag/app chunking methods
+        
+        Args:
+            filename: File path (optional if binary is provided)
+            binary: File binary content (optional if filename is provided)
+            parser_id: Parser/chunker ID (e.g., "naive", "book", "title")
+            config: Chunking configuration (optional)
+            
+        Returns:
+            Dict containing chunks and metadata
+            
+        Example:
+            ```python
+            service = PowerRAGSplitService()
+            
+            # Using file path
+            result = service.split_file(
+                filename="/path/to/document.pdf",
+                parser_id="book",
+                config={"chunk_token_num": 512}
+            )
+            
+            # Using binary
+            with open("document.pdf", "rb") as f:
+                binary = f.read()
+            result = service.split_file(
+                filename="document.pdf",
+                binary=binary,
+                parser_id="naive",
+                config={"chunk_token_num": 256}
+            )
+            ```
+        """
+        if not filename and not binary:
+            raise ValueError("Either filename or binary must be provided")
+        
+        if filename and not binary:
+            # Read file from path
+            with open(filename, "rb") as f:
+                binary = f.read()
+        
+        if not filename:
+            # Generate a temporary filename from binary or use default
+            filename = "temp_file"
+        
+        if config is None:
+            config = {}
+        
+        # Get chunker module
+        chunker_module = self._file_chunker_factory.get(parser_id.lower())
+        if not chunker_module:
+            logger.warning(f"Chunker '{parser_id}' not found in file chunker factory, using naive")
+            chunker_module = self._file_chunker_factory.get(ParserType.NAIVE.value)
+            if not chunker_module:
+                raise ValueError(f"Chunker '{parser_id}' not found and naive chunker not available")
+        
+        # Prepare callback
+        def dummy(prog=None, msg=""):
+            """Dummy callback for progress"""
+            pass
+        
+        # Build parser_config from config
+        parser_config = config.copy()
+        parser_config.setdefault("chunk_token_num", 512)
+        parser_config.setdefault("delimiter", "\n。.；;！!？？")
+        
+        # Build kwargs
+        kwargs = {
+            "lang": config.get("lang", "Chinese"),
+            "callback": dummy,
+            "parser_config": parser_config,
+            "from_page": config.get("from_page", 0),
+            "to_page": config.get("to_page", 100000),
+        }
+        
+        # Add optional fields
+        if config.get("tenant_id"):
+            kwargs["tenant_id"] = config["tenant_id"]
+        if config.get("kb_id"):
+            kwargs["kb_id"] = config["kb_id"]
+        if config.get("doc_id"):
+            kwargs["doc_id"] = config["doc_id"]
+        
+        try:
+            # Call chunk function
+            logger.info(f"Calling chunk function for parser '{parser_id}' on file '{filename}'")
+            tokenized_chunks = chunker_module.chunk(filename, binary=binary, **kwargs)
+            
+            # Extract text content from tokenized chunks
+            chunks = []
+            for chunk_dict in tokenized_chunks:
+                if isinstance(chunk_dict, dict):
+                    # Extract content_with_weight or content field
+                    content = chunk_dict.get("content_with_weight") or chunk_dict.get("content", "")
+                    if content:
+                        chunks.append(content)
+                elif isinstance(chunk_dict, str):
+                    chunks.append(chunk_dict)
+            
+            logger.info(f"Split file '{filename}' with parser '{parser_id}': {len(chunks)} chunks")
+            
+            return {
+                "parser_id": parser_id,
+                "chunks": chunks,
+                "total_chunks": len(chunks),
+                "filename": filename,
+                "metadata": {
+                    "chunker": "rag/app",
+                    "config": config
+                }
+            }
+            
+        except Exception as e:
+            logger.error(f"Error splitting file '{filename}' with parser '{parser_id}': {e}", exc_info=True)
+            raise
+
 
 # ==============================================
 # Shared utility functions for chunking

From 537e40acdbcbd4966b860a0d935297852c5aa92c Mon Sep 17 00:00:00 2001
From: zhanggan7723 <suiyu.zg@oceanbase.com>
Date: Tue, 24 Feb 2026 20:15:11 +0800
Subject: [PATCH 18/19] Update powerrag/server/services/split_service.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 powerrag/server/services/split_service.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/powerrag/server/services/split_service.py b/powerrag/server/services/split_service.py
index 76b8b6ef4..832287546 100644
--- a/powerrag/server/services/split_service.py
+++ b/powerrag/server/services/split_service.py
@@ -287,8 +287,13 @@ def split_file(self, filename: str = None, binary: bytes = None, parser_id: str
         
         if filename and not binary:
             # Read file from path
-            with open(filename, "rb") as f:
-                binary = f.read()
+            try:
+                with open(filename, "rb") as f:
+                    binary = f.read()
+            except FileNotFoundError as e:
+                raise FileNotFoundError(
+                    f"Failed to open file '{filename}' for splitting"
+                ) from e
         
         if not filename:
             # Generate a temporary filename from binary or use default

From 0cb7666c8369a49b32ee0d683dd0d1ed6f6994a4 Mon Sep 17 00:00:00 2001
From: "suiyu.zg" <suiyu.zg@oceanbase.com>
Date: Tue, 24 Feb 2026 20:50:29 +0800
Subject: [PATCH 19/19] feat(sdk): enhance file splitting functionality and
 improve error handling

---
 api/apps/sdk/powerrag_proxy.py            |  7 ++-
 powerrag/sdk/README.md                    |  1 -
 powerrag/sdk/modules/chunk_manager.py     |  4 +-
 powerrag/sdk/tests/test_chunk.py          | 73 +++++++++++++++++++++--
 powerrag/sdk/tests/test_document.py       | 41 +++++++++----
 powerrag/server/routes/powerrag_routes.py | 53 +++++++++-------
 powerrag/server/services/split_service.py | 32 +++++-----
 7 files changed, 151 insertions(+), 60 deletions(-)

diff --git a/api/apps/sdk/powerrag_proxy.py b/api/apps/sdk/powerrag_proxy.py
index f0fffb811..065071c34 100644
--- a/api/apps/sdk/powerrag_proxy.py
+++ b/api/apps/sdk/powerrag_proxy.py
@@ -21,8 +21,11 @@
 这样 SDK 可以通过主 RAGFlow 服务访问 PowerRAG 功能，无需直接连接到 PowerRAG server
 """
 
-import os
+import asyncio
 import logging
+import os
+from io import BytesIO
+
 import httpx
 from quart import request, jsonify
 from api.utils.api_utils import token_required, get_error_data_result
@@ -91,8 +94,6 @@ async def _forward_request(method: str, endpoint: str, tenant_id: str = None):
                 if files_dict:
                     # 保留文件名信息！重要：不能直接 dict(files_dict)
                     # 因为会丢失文件名。需要构造 httpx 期望的格式
-                    import asyncio
-                    from io import BytesIO
                     files = {}
                     for field_name, file_storage in files_dict.items():
                         # 在线程中读取文件内容（避免阻塞事件循环）
diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md
index 6f4e37a4b..825a134f5 100644
--- a/powerrag/sdk/README.md
+++ b/powerrag/sdk/README.md
@@ -387,7 +387,6 @@ for chunk in result['chunks']:
 - 基础方法: `naive`, `title`, `regex`, `smart`
 - 专业方法: `qa`, `book`, `laws`, `paper`, `manual`, `presentation`
 - 特殊格式: `table`, `resume`, `picture`, `one`, `email`
-- 高级方法: `knowledge_graph`
 
 **配置参数说明：**
 - `chunk_token_num` (int): 目标分块大小（tokens），默认 512
diff --git a/powerrag/sdk/modules/chunk_manager.py b/powerrag/sdk/modules/chunk_manager.py
index 84bbdf643..45c41b64e 100644
--- a/powerrag/sdk/modules/chunk_manager.py
+++ b/powerrag/sdk/modules/chunk_manager.py
@@ -14,8 +14,9 @@
 #  limitations under the License.
 #
 
-from typing import Optional, List, Dict, Any, Union
+import json
 from pathlib import Path
+from typing import Optional, List, Dict, Any, Union
 from .chunk import ChunkInfo
 
 
@@ -413,7 +414,6 @@ def split_file_upload(
         }
         
         if config:
-            import json
             form_data["config"] = json.dumps(config)
         
         url = "/powerrag/split/file/upload"
diff --git a/powerrag/sdk/tests/test_chunk.py b/powerrag/sdk/tests/test_chunk.py
index 84f649d6f..7b7861e90 100644
--- a/powerrag/sdk/tests/test_chunk.py
+++ b/powerrag/sdk/tests/test_chunk.py
@@ -165,19 +165,81 @@ def test_split_text_unsupported_parser(self, client: PowerRAGClient):
         """测试不支持的parser_id应该抛出错误"""
         text = "Test text"
         # 使用一个真正不支持的 parser_id（如 "paper"）
-        # 注意：naive 实际上是被支持的（通过 RAGFlow 代理）
         with pytest.raises(Exception) as exc_info:
             client.chunk.split_text(
                 text=text,
                 parser_id="paper",  # paper 不支持纯文本切片，需要文件处理
                 config={"chunk_token_num": 128}
             )
-        assert "not supported" in str(exc_info.value).lower() or "unknown" in str(exc_info.value).lower() or "failed" in str(exc_info.value).lower()
+        error_msg = str(exc_info.value).lower()
+        # PowerRAG: "not supported for text splitting" + "split_file"
+        # RAGFlow proxy: "unknown chunker" + the rejected parser name ("paper")
+        assert (
+            ("not supported for text splitting" in error_msg and "split_file" in error_msg)
+            or ("unknown chunker" in error_msg and "paper" in error_msg)
+        )
 
 
 class TestChunkSplitFile:
     """测试文件切片"""
-    
+
+    def test_split_file_with_file_path(self, client: PowerRAGClient, test_file_path: str):
+        """测试使用 file_path 参数的文件切片（服务器需能访问该路径）"""
+        result = client.chunk.split_file(
+            file_path=test_file_path,
+            parser_id="naive",
+            config={"chunk_token_num": 512},
+        )
+        assert "chunks" in result
+        assert "total_chunks" in result
+        assert "filename" in result
+        assert isinstance(result["chunks"], list)
+        assert result["total_chunks"] >= 0
+
+    def test_split_file_with_file_url(self, client: PowerRAGClient):
+        """测试使用 file_url 参数的文件切片"""
+        # 使用 httpbin.org（HTTP 避免 SSL 证书问题，返回 HTML）
+        result = client.chunk.split_file(
+            file_url="http://httpbin.org/html",
+            parser_id="naive",
+            config={"chunk_token_num": 512, "filename": "example.html"},
+        )
+        assert "chunks" in result
+        assert "total_chunks" in result
+        assert "filename" in result
+        assert isinstance(result["chunks"], list)
+        assert result["total_chunks"] >= 0
+
+    def test_split_file_missing_both(self, client: PowerRAGClient):
+        """测试 file_path 和 file_url 都未提供时应抛出错误"""
+        with pytest.raises(ValueError) as exc_info:
+            client.chunk.split_file(parser_id="naive")
+        assert "file_path" in str(exc_info.value).lower() or "file_url" in str(exc_info.value).lower()
+        assert "must be provided" in str(exc_info.value).lower()
+
+    def test_split_file_invalid_url(self, client: PowerRAGClient):
+        """测试无效 URL 应抛出错误"""
+        with pytest.raises(Exception) as exc_info:
+            client.chunk.split_file(
+                file_url="http://invalid-domain-does-not-exist-12345.example.com/file.pdf",
+                parser_id="naive",
+                config={"download_timeout": 5},
+            )
+        error_msg = str(exc_info.value).lower()
+        assert "download" in error_msg or "failed" in error_msg or "connection" in error_msg
+
+    def test_split_file_size_limit_exceeded(self, client: PowerRAGClient):
+        """测试超过大小限制的 URL 应抛出错误"""
+        # httpbin.org/bytes/N caps the response at 100 KB, which still exceeds max_file_size=1024 and should trigger the limit
+        with pytest.raises(Exception) as exc_info:
+            client.chunk.split_file(
+                file_url="https://httpbin.org/bytes/1000000",
+                parser_id="naive",
+                config={"max_file_size": 1024},
+            )
+        error_msg = str(exc_info.value).lower()
+        assert "exceeds" in error_msg or "size" in error_msg or "limit" in error_msg
+
     def test_split_file_upload(self, client: PowerRAGClient, test_file_path: str):
         """测试上传文件并切片"""
         result = client.chunk.split_file_upload(
@@ -210,9 +272,12 @@ def test_split_file_upload_with_different_parsers(self, client: PowerRAGClient,
     
     def test_split_file_upload_nonexistent_file(self, client: PowerRAGClient):
         """测试不存在的文件应该抛出错误"""
-        with pytest.raises(FileNotFoundError):
+        with pytest.raises((FileNotFoundError, Exception)) as exc_info:
             client.chunk.split_file_upload(
                 file_path="/nonexistent/file.pdf",
                 parser_id="naive"
             )
+        # SDK raises FileNotFoundError locally; API may wrap in generic Exception
+        error_msg = str(exc_info.value).lower()
+        assert "not found" in error_msg or "no such file" in error_msg
 
diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py
index 947c385d4..69dac99ff 100644
--- a/powerrag/sdk/tests/test_document.py
+++ b/powerrag/sdk/tests/test_document.py
@@ -14,6 +14,8 @@
 #  limitations under the License.
 #
 
+import time
+
 import pytest
 from powerrag.sdk import PowerRAGClient
 
@@ -217,20 +219,33 @@ def test_parse_to_chunk_async(self, client: PowerRAGClient, kb_id: str, test_fil
     
     def test_cancel_parse(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
         """测试取消解析"""
-        import time
         uploaded_docs = client.document.upload(kb_id, test_file_path)
         doc_id = uploaded_docs[0]["id"]
-        
+
         try:
             client.document.parse_to_chunk(kb_id, [doc_id], wait=False)
-            # Wait a bit for parsing to start
-            time.sleep(0.5)
+
+            # Poll for parsing to start (RUNNING or SCHEDULE), timeout 10s
+            for _ in range(100):
+                doc = client.document.get(kb_id, doc_id)
+                if doc["run"] in ["RUNNING", "1", "SCHEDULE"]:
+                    break
+                time.sleep(0.1)
+            else:
+                pytest.fail("Parsing did not start within 10s")
+
             client.document.cancel_parse(kb_id, [doc_id])
-            
-            # Wait a bit for status update
-            time.sleep(0.5)
-            doc = client.document.get(kb_id, doc_id)
-            assert doc["run"] in ["CANCEL", "UNSTART"]
+
+            # Poll for cancel to propagate, timeout 5s
+            for _ in range(50):
+                doc = client.document.get(kb_id, doc_id)
+                if doc["run"] in ["CANCEL", "UNSTART", "2", "0"]:
+                    break
+                time.sleep(0.1)
+            else:
+                pytest.fail("Cancel did not propagate within 5s")
+
+            assert doc["run"] in ["CANCEL", "UNSTART", "2", "0"]
         finally:
             client.document.delete(kb_id, [doc_id])
 
@@ -959,12 +974,12 @@ def test_parse_from_invalid_url(self, client: PowerRAGClient):
             }
         )
         
-        # 应该返回 400 错误
-        assert response.status_code == 400
+        # 无效 URL（连接失败）返回 502 (Bad Gateway)
+        assert response.status_code == 502
         result = response.json()
-        assert result["code"] == 400
+        assert result["code"] == 502
         assert "Failed to download" in result["message"]
-    
+
     def test_parse_cannot_provide_both_file_and_url(self, client: PowerRAGClient, tmp_path):
         """测试不能同时提供 file 和 file_url"""
         import requests
diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py
index 03c7d08fb..c6acc82cb 100644
--- a/powerrag/server/routes/powerrag_routes.py
+++ b/powerrag/server/routes/powerrag_routes.py
@@ -1187,13 +1187,13 @@ async def parse_to_md_upload(tenant_id):
                     "message": f"Request timeout while downloading file from URL. Please try again or increase timeout."
                 }), 408
             except requests.exceptions.ConnectionError as e:
-                # ConnectionError includes DNS resolution failures, invalid URLs, etc.
-                # Return 400 (Bad Request) instead of 503 (Service Unavailable) for invalid URLs
+                # ConnectionError: DNS failure (unresolvable host), network unreachable, connection refused.
+                # 502 (Bad Gateway) indicates we could not reach the upstream URL.
                 logger.error(f"Connection error downloading file from URL: {file_url}. Error: {e}")
                 return jsonify({
-                    "code": 400,
+                    "code": 502,
                     "message": f"Failed to download file from URL: {str(e)}"
-                }), 400
+                }), 502
             except requests.exceptions.HTTPError as e:
                 logger.error(f"HTTP error downloading file from URL: {file_url}. Error: {e}")
                 return jsonify({
@@ -1201,11 +1201,12 @@ async def parse_to_md_upload(tenant_id):
                     "message": f"HTTP error while downloading file: {str(e)}"
                 }), 502
             except requests.exceptions.RequestException as e:
+                # Catch-all for other request errors (e.g. TooManyRedirects)
                 logger.error(f"Request error downloading file from URL: {file_url}. Error: {e}")
                 return jsonify({
-                    "code": 400,
+                    "code": 502,
                     "message": f"Failed to download file from URL: {str(e)}"
-                }), 400
+                }), 502
             except Exception as e:
                 logger.error(f"Unexpected error downloading file from URL: {file_url}. Error: {e}", exc_info=True)
                 return jsonify({
@@ -1409,11 +1410,11 @@ async def split_text(tenant_id):
 async def split_file(tenant_id):
     """
     Split file into chunks using rag/app chunking methods
-    
-    Supports all ParserType methods: naive, qa, book, laws, paper, manual, 
+
+    Supports all ParserType methods: naive, qa, book, laws, paper, manual,
     presentation, table, resume, picture, one, audio, email, tag, knowledge_graph,
-    title, regex, smart
-    
+    title, regex, smart.
+
     Request JSON:
     {
         "file_path": "/path/to/document.pdf",  # or use file_url
@@ -1492,14 +1493,20 @@ async def split_file(tenant_id):
             binary = None
         
         service = PowerRAGSplitService()
-        result = service.split_file(filename=filename, binary=binary, parser_id=parser_id, config=config)
-        
+        result = service.split_file(
+            filename=filename,
+            binary=binary,
+            parser_id=parser_id,
+            config=config,
+            tenant_id=tenant_id,
+        )
+
         return jsonify({
             "code": 0,
             "data": result,
             "message": "success"
         }), 200
-        
+
     except Exception as e:
         logger.error(f"Split file error: {e}", exc_info=True)
         return jsonify({
@@ -1513,11 +1520,11 @@ async def split_file(tenant_id):
 async def split_file_upload(tenant_id):
     """
     Split uploaded file into chunks using rag/app chunking methods
-    
-    Supports all ParserType methods: naive, qa, book, laws, paper, manual, 
+
+    Supports all ParserType methods: naive, qa, book, laws, paper, manual,
     presentation, table, resume, picture, one, audio, email, tag, knowledge_graph,
-    title, regex, smart
-    
+    title, regex, smart.
+
     Request (multipart/form-data):
     - file: File to split (required)
     - parser_id: Parser ID (optional, default: "naive")
@@ -1583,14 +1590,20 @@ async def split_file_upload(tenant_id):
             }), 400
         
         service = PowerRAGSplitService()
-        result = service.split_file(filename=filename, binary=binary, parser_id=parser_id, config=config)
-        
+        result = service.split_file(
+            filename=filename,
+            binary=binary,
+            parser_id=parser_id,
+            config=config,
+            tenant_id=tenant_id,
+        )
+
         return jsonify({
             "code": 0,
             "data": result,
             "message": "success"
         }), 200
-        
+
     except Exception as e:
         logger.error(f"Split file upload error: {e}", exc_info=True)
         return jsonify({
diff --git a/powerrag/server/services/split_service.py b/powerrag/server/services/split_service.py
index 76b8b6ef4..5b32f707b 100644
--- a/powerrag/server/services/split_service.py
+++ b/powerrag/server/services/split_service.py
@@ -34,6 +34,12 @@
 
 logger = logging.getLogger(__name__)
 
+
+def _dummy_callback(prog=None, msg=""):
+    """No-op callback for parser progress; used when progress reporting is not needed."""
+    pass
+
+
 # Chunker Factory - mapping parser_id to chunking module
 CHUNKER_FACTORY = {}
 
@@ -118,11 +124,6 @@ def split_text(self, text: str, parser_id: str = "title", config: Dict[str, Any]
             chunker = self._get_chunker(parser_id)
             logger.info(f"Using chunker: {parser_id} for text splitting")
 
-            # Prepare callback
-            def dummy(prog=None, msg=""):
-                """Dummy callback for progress"""
-                pass
-
             # Build parser_config based on parser_id
             if parser_id == ParserType.TITLE.value:
                 # Title parser specific config
@@ -246,16 +247,17 @@ def _init_file_chunker_factory(self):
                     logger.error("Failed to import naive chunker, file splitting will not work")
                     self._file_chunker_factory = {}
 
-    def split_file(self, filename: str = None, binary: bytes = None, parser_id: str = "naive", 
-                   config: Dict[str, Any] = None) -> Dict[str, Any]:
+    def split_file(self, filename: str = None, binary: bytes = None, parser_id: str = "naive",
+                   config: Dict[str, Any] = None, tenant_id: str = None) -> Dict[str, Any]:
         """
         Split file into chunks using rag/app chunking methods
-        
+
         Args:
             filename: File path (optional if binary is provided)
             binary: File binary content (optional if filename is provided)
             parser_id: Parser/chunker ID (e.g., "naive", "book", "title")
             config: Chunking configuration (optional)
+            tenant_id: Tenant ID (required for audio and picture parsers; used for LLM model lookup)
             
         Returns:
             Dict containing chunks and metadata
@@ -305,11 +307,6 @@ def split_file(self, filename: str = None, binary: bytes = None, parser_id: str
             if not chunker_module:
                 raise ValueError(f"Chunker '{parser_id}' not found and naive chunker not available")
         
-        # Prepare callback
-        def dummy(prog=None, msg=""):
-            """Dummy callback for progress"""
-            pass
-        
         # Build parser_config from config
         parser_config = config.copy()
         parser_config.setdefault("chunk_token_num", 512)
@@ -318,15 +315,16 @@ def dummy(prog=None, msg=""):
         # Build kwargs
         kwargs = {
             "lang": config.get("lang", "Chinese"),
-            "callback": dummy,
+            "callback": _dummy_callback,
             "parser_config": parser_config,
             "from_page": config.get("from_page", 0),
             "to_page": config.get("to_page", 100000),
         }
-        
+
+        if tenant_id:
+            kwargs["tenant_id"] = tenant_id
+
         # Add optional fields
-        if config.get("tenant_id"):
-            kwargs["tenant_id"] = config["tenant_id"]
         if config.get("kb_id"):
             kwargs["kb_id"] = config["kb_id"]
         if config.get("doc_id"):