From 352fa8fbc4ba9669cbc651ff8c1eca0378cd51fe Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Wed, 31 Dec 2025 16:55:48 +0800 Subject: [PATCH 01/19] feat:add PowerRAG SDK and API Proxy --- api/apps/sdk/powerrag_proxy.py | 594 +++++++++ api/apps/system_app.py | 2 +- powerrag/sdk/README.md | 1148 +++++++++++++++++ powerrag/sdk/__init__.py | 29 + powerrag/sdk/client.py | 155 +++ powerrag/sdk/modules/chunk.py | 32 + powerrag/sdk/modules/chunk_manager.py | 271 ++++ powerrag/sdk/modules/document.py | 40 + powerrag/sdk/modules/document_manager.py | 789 +++++++++++ powerrag/sdk/modules/extraction.py | 50 + powerrag/sdk/modules/extraction_manager.py | 239 ++++ powerrag/sdk/modules/knowledge_base.py | 34 + .../sdk/modules/knowledge_base_manager.py | 232 ++++ powerrag/sdk/modules/knowledge_graph.py | 52 + .../sdk/modules/knowledge_graph_manager.py | 102 ++ powerrag/sdk/modules/raptor.py | 29 + powerrag/sdk/modules/raptor_manager.py | 83 ++ powerrag/sdk/modules/retrieval.py | 41 + powerrag/sdk/modules/retrieval_manager.py | 158 +++ powerrag/sdk/tests/conftest.py | 231 ++++ powerrag/sdk/tests/pytest.ini | 20 + powerrag/sdk/tests/test_chunk.py | 163 +++ powerrag/sdk/tests/test_document.py | 430 ++++++ powerrag/sdk/tests/test_extraction.py | 137 ++ powerrag/sdk/tests/test_knowledge_base.py | 193 +++ powerrag/sdk/tests/test_knowledge_graph.py | 90 ++ powerrag/sdk/tests/test_raptor.py | 69 + powerrag/sdk/tests/test_retrieval.py | 105 ++ powerrag/server/app.py | 41 +- powerrag/server/powerrag_server.py | 15 +- powerrag/server/routes/powerrag_routes.py | 511 +++++++- powerrag/server/routes/task_routes.py | 12 +- powerrag/server/services/convert_service.py | 4 +- powerrag/server/services/extract_service.py | 22 +- powerrag/server/services/parse_service.py | 97 ++ .../services/parse_to_md_task_manager.py | 237 ++++ powerrag/server/services/split_service.py | 2 +- 37 files changed, 6394 insertions(+), 65 deletions(-) create mode 100644 api/apps/sdk/powerrag_proxy.py create mode 100644 
powerrag/sdk/README.md create mode 100644 powerrag/sdk/__init__.py create mode 100644 powerrag/sdk/client.py create mode 100644 powerrag/sdk/modules/chunk.py create mode 100644 powerrag/sdk/modules/chunk_manager.py create mode 100644 powerrag/sdk/modules/document.py create mode 100644 powerrag/sdk/modules/document_manager.py create mode 100644 powerrag/sdk/modules/extraction.py create mode 100644 powerrag/sdk/modules/extraction_manager.py create mode 100644 powerrag/sdk/modules/knowledge_base.py create mode 100644 powerrag/sdk/modules/knowledge_base_manager.py create mode 100644 powerrag/sdk/modules/knowledge_graph.py create mode 100644 powerrag/sdk/modules/knowledge_graph_manager.py create mode 100644 powerrag/sdk/modules/raptor.py create mode 100644 powerrag/sdk/modules/raptor_manager.py create mode 100644 powerrag/sdk/modules/retrieval.py create mode 100644 powerrag/sdk/modules/retrieval_manager.py create mode 100644 powerrag/sdk/tests/conftest.py create mode 100644 powerrag/sdk/tests/pytest.ini create mode 100644 powerrag/sdk/tests/test_chunk.py create mode 100644 powerrag/sdk/tests/test_document.py create mode 100644 powerrag/sdk/tests/test_extraction.py create mode 100644 powerrag/sdk/tests/test_knowledge_base.py create mode 100644 powerrag/sdk/tests/test_knowledge_graph.py create mode 100644 powerrag/sdk/tests/test_raptor.py create mode 100644 powerrag/sdk/tests/test_retrieval.py create mode 100644 powerrag/server/services/parse_to_md_task_manager.py diff --git a/api/apps/sdk/powerrag_proxy.py b/api/apps/sdk/powerrag_proxy.py new file mode 100644 index 000000000..3bafdf867 --- /dev/null +++ b/api/apps/sdk/powerrag_proxy.py @@ -0,0 +1,594 @@ +# +# Copyright 2025 The OceanBase Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +PowerRAG API Proxy + +将 PowerRAG API 请求代理转发到独立的 PowerRAG server(端口6000) +这样 SDK 可以通过主 RAGFlow 服务访问 PowerRAG 功能,无需直接连接到 PowerRAG server +""" + +import os +import logging +import httpx +from quart import request, jsonify +from api.utils.api_utils import token_required, get_error_data_result + +logger = logging.getLogger(__name__) + +# manager 变量由 api/apps/__init__.py 的 register_page 函数自动注入 +# 这里使用 # noqa: F821 来忽略未定义的警告 + +# PowerRAG server 地址配置 +# 可以通过环境变量 POWERRAG_SERVER_URL 配置,默认为 http://localhost:6000 +POWERRAG_SERVER_URL = os.environ.get("POWERRAG_SERVER_URL", "http://localhost:6000") +POWERRAG_API_PREFIX = f"{POWERRAG_SERVER_URL}/api/v1/powerrag" + +# 创建异步 HTTP 客户端(使用连接池提高性能) +_http_client = httpx.AsyncClient( + timeout=httpx.Timeout(300.0, connect=10.0), # 5分钟总超时,10秒连接超时 + limits=httpx.Limits(max_keepalive_connections=20, max_connections=100), + follow_redirects=True, +) + + +async def _forward_request(method: str, endpoint: str, tenant_id: str = None): + """ + 将请求转发到 PowerRAG server(使用异步 HTTP 客户端) + + Args: + method: HTTP 方法 (GET, POST, PUT, DELETE) + endpoint: PowerRAG API 端点(不包含 /api/v1/powerrag 前缀) + tenant_id: 租户ID(可选,用于日志) + + Returns: + PowerRAG server 的响应 + """ + url = f"{POWERRAG_API_PREFIX}{endpoint}" + + # 获取请求数据 + if method == "GET": + params = dict(request.args) + json_data = None + files = None + data = None + else: + params = None + json_data = None + files = None + data = None + + # 尝试获取 JSON 数据 + try: + json_data = await request.get_json(silent=True) + except Exception: + pass + + # 如果没有 JSON 数据,尝试获取表单数据或文件 + if json_data is 
None: + try: + form = await request.form + if form: + data = dict(form) + except Exception: + pass + + try: + files_dict = await request.files + if files_dict: + # 保留文件名信息!重要:不能直接 dict(files_dict) + # 因为会丢失文件名。需要构造 httpx 期望的格式 + files = {} + for field_name, file_storage in files_dict.items(): + # httpx 期望格式: (filename, content, content_type) + files[field_name] = ( + file_storage.filename, + file_storage.read(), + file_storage.content_type or 'application/octet-stream' + ) + except Exception: + pass + + # 获取请求头(传递 Authorization) + headers = {} + if "Authorization" in request.headers: + headers["Authorization"] = request.headers["Authorization"] + + try: + logger.info(f"Forwarding {method} {endpoint} to PowerRAG server: {url}") + + # 使用异步 HTTP 客户端发送请求 + response = await _http_client.request( + method=method, + url=url, + params=params, + json=json_data, + data=data, + files=files, + headers=headers, + ) + + logger.info(f"PowerRAG server response status: {response.status_code}") + + # 返回响应 + try: + response_json = response.json() + logger.debug(f"Response JSON: {response_json}") + return jsonify(response_json), response.status_code + except Exception as e: + # 如果不是 JSON 响应,记录错误并返回错误信息 + logger.error(f"Failed to parse response as JSON: {e}", exc_info=True) + logger.error(f"Response status: {response.status_code}") + + # 尝试读取响应文本 + try: + response_text = response.text + error_msg = response_text[:200] if response_text else "(empty response body)" + except Exception: + error_msg = "(unable to read response body)" + + logger.error(f"Response content (first 200 chars): {error_msg}") + logger.error(f"Response headers: {dict(response.headers)}") + + # 返回错误响应 + return get_error_data_result( + message=f"PowerRAG server returned invalid JSON response: {error_msg}" + ), response.status_code if response.status_code >= 400 else 500 + + except httpx.ConnectError as e: + logger.error(f"Failed to connect to PowerRAG server at {POWERRAG_SERVER_URL}: {e}", exc_info=True) + return 
get_error_data_result( + message=f"PowerRAG server is not available at {POWERRAG_SERVER_URL}. " + f"Please ensure the PowerRAG server is running on port 6000." + ), 503 + + except httpx.TimeoutException as e: + logger.error(f"Request to PowerRAG server timed out: {url}", exc_info=True) + return get_error_data_result( + message="Request to PowerRAG server timed out" + ), 504 + + except httpx.HTTPStatusError as e: + logger.error(f"PowerRAG server returned error status {e.response.status_code}: {e}", exc_info=True) + try: + error_json = e.response.json() + return jsonify(error_json), e.response.status_code + except Exception: + return get_error_data_result( + message=f"PowerRAG server error: {e.response.status_code} {e.response.text[:200]}" + ), e.response.status_code + + except Exception as e: + logger.error(f"Error forwarding request to PowerRAG server: {e}", exc_info=True) + error_msg = str(e) + if hasattr(e, '__class__'): + error_msg = f"{e.__class__.__name__}: {error_msg}" + return get_error_data_result( + message=f"Error forwarding request to PowerRAG server: {error_msg}" + ), 500 + + +@manager.route("/powerrag/split", methods=["POST"]) # noqa: F821 +@token_required +async def split_text_proxy(tenant_id): + """ + 智能路由文本切片请求: + - PowerRAG chunkers (title, regex, smart) -> 转发到 PowerRAG server + - RAGFlow chunkers (naive, paper, book 等) -> 使用 RAGFlow 的实现 + + --- + tags: + - PowerRAG Proxy + security: + - ApiKeyAuth: [] + parameters: + - in: body + name: body + description: Split text parameters. + required: true + schema: + type: object + properties: + text: + type: string + description: Text to split. + parser_id: + type: string + description: Parser ID. + config: + type: object + description: Parser configuration. + - in: header + name: Authorization + type: string + required: true + description: Bearer token for authentication. + responses: + 200: + description: Split result. 
+ """ + try: + # 获取请求数据 + data = await request.get_json() + if not data: + return get_error_data_result(message="No JSON data provided"), 400 + + text = data.get("text") + parser_id = data.get("parser_id", "title") + config = data.get("config", {}) + + if not text: + return get_error_data_result(message="text is required"), 400 + + # PowerRAG 支持的纯文本 chunker(专为文本切片设计) + POWERRAG_CHUNKERS = {"title", "regex", "smart"} + + # RAGFlow 支持纯文本切片的 chunker(使用 naive_merge) + RAGFLOW_TEXT_CHUNKERS = {"naive", "general"} + + # RAGFlow 需要文件处理的 chunker(不支持纯文本切片) + # RAGFLOW_DOCUMENT_CHUNKERS = { + # "paper", "book", "laws", "presentation", "manual", + # "qa", "table", "resume", "picture", "one", + # "knowledge_graph", "email", "tag" + # } + + if parser_id.lower() in POWERRAG_CHUNKERS: + # 转发到 PowerRAG server + logger.info(f"Forwarding '{parser_id}' chunker to PowerRAG server") + return await _forward_request("POST", "/split", tenant_id) + + elif parser_id.lower() in RAGFLOW_TEXT_CHUNKERS: + # 使用 RAGFlow 的 naive_merge 处理纯文本 + logger.info(f"Using RAGFlow naive_merge for '{parser_id}' chunker") + from rag.nlp import naive_merge + + # 默认配置 + chunk_token_num = config.get("chunk_token_num", 128) + delimiter = config.get("delimiter", "\n!?。;!?") + + # naive_merge 需要 sections 参数,格式为 [(text, position), ...] 
+ sections = [(text, "")] + + # 调用 RAGFlow 的 naive_merge + chunks = naive_merge(sections, chunk_token_num=chunk_token_num, delimiter=delimiter) + + # 过滤掉空白块 + chunks = [chunk.strip() for chunk in chunks if chunk.strip()] + + # 返回结果 + return jsonify({ + "code": 0, + "data": { + "parser_id": parser_id, + "chunks": chunks, + "total_chunks": len(chunks), + "text_length": len(text), + "metadata": { + "chunker": "ragflow", + "config": config + } + }, + "message": "success" + }), 200 + # elif parser_id.lower() in RAGFLOW_DOCUMENT_CHUNKERS: + # # 这些 chunker 需要文档文件,不支持纯文本切片 + # return get_error_data_result( + # message=f"Chunker '{parser_id}' requires document file processing and does not support pure text splitting. " + # f"Supported text chunkers are: {', '.join(sorted(POWERRAG_CHUNKERS | RAGFLOW_TEXT_CHUNKERS))}" + # ), 400 + else: + # 未知的 chunker + return get_error_data_result( + message=f"Unknown chunker '{parser_id}'. " + f"Supported text chunkers are: {', '.join(sorted(POWERRAG_CHUNKERS | RAGFLOW_TEXT_CHUNKERS))}" + ), 400 + + except Exception as e: + logger.error(f"Error in split_text_proxy: {e}", exc_info=True) + return get_error_data_result(message=f"Failed to split text: {str(e)}"), 500 + + +@manager.route("/powerrag/extract", methods=["POST"]) # noqa: F821 +@token_required +async def extract_from_document_proxy(tenant_id): + """ + 代理 extract_from_document API 请求到 PowerRAG server + """ + return await _forward_request("POST", "/extract", tenant_id) + + +@manager.route("/powerrag/extract/text", methods=["POST"]) # noqa: F821 +@token_required +async def extract_from_text_proxy(tenant_id): + """ + 代理 extract_from_text API 请求到 PowerRAG server + """ + return await _forward_request("POST", "/extract/text", tenant_id) + + +@manager.route("/powerrag/extract/batch", methods=["POST"]) # noqa: F821 +@token_required +async def extract_batch_proxy(tenant_id): + """ + 代理 extract_batch API 请求到 PowerRAG server + """ + return await _forward_request("POST", "/extract/batch", 
tenant_id) + + +@manager.route("/powerrag/struct_extract/submit", methods=["POST"]) # noqa: F821 +@token_required +async def struct_extract_submit_proxy(tenant_id): + """ + 代理 struct_extract/submit API 请求到 PowerRAG server + """ + return await _forward_request("POST", "/struct_extract/submit", tenant_id) + + +@manager.route("/powerrag/struct_extract/status/", methods=["GET"]) # noqa: F821 +@token_required +async def struct_extract_status_proxy(tenant_id, task_id): + """ + 代理 struct_extract/status API 请求到 PowerRAG server + """ + return await _forward_request("GET", f"/struct_extract/status/{task_id}", tenant_id) + + +@manager.route("/powerrag/parse", methods=["POST"]) # noqa: F821 +@token_required +async def parse_document_proxy(tenant_id): + """ + 代理 parse API 请求到 PowerRAG server + """ + return await _forward_request("POST", "/parse", tenant_id) + + +@manager.route("/powerrag/parse/batch", methods=["POST"]) # noqa: F821 +@token_required +async def parse_batch_proxy(tenant_id): + """ + 代理 parse/batch API 请求到 PowerRAG server + """ + return await _forward_request("POST", "/parse/batch", tenant_id) + + +@manager.route("/powerrag/parse/upload", methods=["POST"]) # noqa: F821 +@token_required +async def parse_upload_proxy(tenant_id): + """ + 代理 parse/upload API 请求到 PowerRAG server + """ + return await _forward_request("POST", "/parse/upload", tenant_id) + + +@manager.route("/powerrag/convert", methods=["POST"]) # noqa: F821 +@token_required +async def convert_document_proxy(tenant_id): + """ + 代理 convert API 请求到 PowerRAG server + """ + return await _forward_request("POST", "/convert", tenant_id) + + +@manager.route("/powerrag/convert/upload", methods=["POST"]) # noqa: F821 +@token_required +async def convert_upload_proxy(tenant_id): + """ + 代理 convert/upload API 请求到 PowerRAG server + """ + return await _forward_request("POST", "/convert/upload", tenant_id) + + +@manager.route("/powerrag/parse_to_md", methods=["POST"]) # noqa: F821 +@token_required +async def 
parse_to_md_proxy(tenant_id): + """ + 代理 parse_to_md API 请求到 PowerRAG server + + 将文档解析为 Markdown 格式,但不进行切分。 + 适用于需要完整文档内容或外部系统自行处理切分的场景。 + + 支持的文件格式: + - PDF (.pdf) + - Office 文档 (.doc, .docx, .ppt, .pptx) + - 图片 (.jpg, .png) + - HTML (.html, .htm) + - Markdown (.md) + + --- + tags: + - PowerRAG Proxy + security: + - ApiKeyAuth: [] + parameters: + - in: body + name: body + description: Parse to markdown parameters. + required: true + schema: + type: object + properties: + doc_id: + type: string + required: true + description: RAGFlow document ID. + config: + type: object + description: Parser configuration. + properties: + layout_recognize: + type: string + description: Layout recognition engine (mineru or dots_ocr). + enable_ocr: + type: boolean + description: Enable OCR. + enable_formula: + type: boolean + description: Enable formula recognition. + enable_table: + type: boolean + description: Enable table recognition. + from_page: + type: integer + description: Start page number (for PDF). + to_page: + type: integer + description: End page number (for PDF). + - in: header + name: Authorization + type: string + required: true + description: Bearer token for authentication. + responses: + 200: + description: Parse to markdown result. 
+ schema: + type: object + properties: + code: + type: integer + data: + type: object + properties: + doc_id: + type: string + doc_name: + type: string + markdown: + type: string + markdown_length: + type: integer + images: + type: object + total_images: + type: integer + message: + type: string + """ + return await _forward_request("POST", "/parse_to_md", tenant_id) + + +@manager.route("/powerrag/parse_to_md/async", methods=["POST"]) # noqa: F821 +@token_required +async def parse_to_md_async_proxy(tenant_id): + """ + 代理 parse_to_md/async API 请求到 PowerRAG server (异步提交任务) + + 异步解析文档为 Markdown,返回任务 ID。 + 适用于大文档或需要长时间处理的场景。 + + --- + tags: + - PowerRAG Proxy + security: + - ApiKeyAuth: [] + parameters: + - in: body + name: body + required: true + schema: + type: object + properties: + doc_id: + type: string + description: Document ID + config: + type: object + description: Parser configuration + - in: header + name: Authorization + type: string + required: true + description: Bearer token for authentication. + responses: + 200: + description: Task submitted successfully, returns task_id. + """ + return await _forward_request("POST", "/parse_to_md/async", tenant_id) + + +@manager.route("/powerrag/parse_to_md/status/", methods=["GET"]) # noqa: F821 +@token_required +async def parse_to_md_status_proxy(tenant_id, task_id): + """ + 代理 parse_to_md/status API 请求到 PowerRAG server (查询任务状态) + + 查询异步解析任务的状态和结果。 + + --- + tags: + - PowerRAG Proxy + security: + - ApiKeyAuth: [] + parameters: + - in: path + name: task_id + type: string + required: true + description: Task ID returned from async submission + - in: header + name: Authorization + type: string + required: true + description: Bearer token for authentication. + responses: + 200: + description: Task status and result. 
+ """ + return await _forward_request("GET", f"/parse_to_md/status/{task_id}", tenant_id) + + +@manager.route("/powerrag/parse_to_md/upload", methods=["POST"]) # noqa: F821 +@token_required +async def parse_to_md_upload_proxy(tenant_id): + """ + 代理 parse_to_md/upload API 请求到 PowerRAG server + + 直接上传文件并解析为 Markdown,不进行切分。 + + 支持的文件格式: + - PDF (.pdf) + - Office 文档 (.doc, .docx, .ppt, .pptx) + - 图片 (.jpg, .png) + - HTML (.html, .htm) + - Markdown (.md) + + --- + tags: + - PowerRAG Proxy + security: + - ApiKeyAuth: [] + parameters: + - in: formData + name: file + type: file + required: true + description: File to parse (PDF, Office (doc/docx/ppt/pptx), Images (jpg/png), HTML, Markdown). + - in: formData + name: config + type: string + description: JSON string of parser configuration. + - in: header + name: Authorization + type: string + required: true + description: Bearer token for authentication. + responses: + 200: + description: Parse to markdown result. + """ + return await _forward_request("POST", "/parse_to_md/upload", tenant_id) + diff --git a/api/apps/system_app.py b/api/apps/system_app.py index 7e646927e..640fbe37a 100644 --- a/api/apps/system_app.py +++ b/api/apps/system_app.py @@ -217,7 +217,7 @@ def new_token(): obj = { "tenant_id": tenant_id, "token": generate_confirmation_token(), - "beta": generate_confirmation_token().replace("ragflow-", "")[:32], + "beta": generate_confirmation_token().replace("powerrag-", "")[:32], "create_time": current_timestamp(), "create_date": datetime_format(datetime.now()), "update_time": None, diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md new file mode 100644 index 000000000..81b69be76 --- /dev/null +++ b/powerrag/sdk/README.md @@ -0,0 +1,1148 @@ +# PowerRAG SDK + +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![Python](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) + +PowerRAG SDK 是一个功能强大的 Python 
SDK,为 PowerRAG API 提供了简单易用的接口,支持知识库管理、文档处理、Markdown 解析、文本切片、信息抽取、RAPTOR 构建、知识图谱和检索等功能。 + +## 特性 + +- 🚀 **简单易用**: 面向对象的 API 设计,直观的方法调用 +- 📚 **完整功能**: 支持 PowerRAG 所有核心功能模块 +- 🔄 **异步支持**: 支持异步任务的状态查询和轮询等待 +- 📦 **批量操作**: 支持批量上传、删除、抽取等操作 +- 📝 **Markdown 解析**: 支持文档解析为 Markdown 格式(同步/异步) +- 🎯 **类型提示**: 完整的类型注解,IDE 友好 +- ✅ **全面测试**: 包含完整的测试用例 + +## 安装 + +### 方式 1: 使用 pip(推荐) + +```bash +pip install powerrag-sdk +``` + +### 方式 2: 从源码安装 + +```bash +git clone https://github.com/oceanbase/powerrag.git +cd powerrag +pip install -e . +``` + +### 方式 3: 仅安装 SDK 模块 + +如果你只需要 SDK 功能: + +```bash +git clone https://github.com/oceanbase/powerrag.git +cd powerrag/powerrag/sdk +pip install -e . +``` + +### 依赖要求 + +- Python 3.8+ +- requests >= 2.28.0 +- typing-extensions (Python < 3.11) + +### 验证安装 + +```python +from powerrag.sdk import PowerRAGClient + +print(f"PowerRAG SDK installed successfully!") +``` + +## 快速开始 + +### 初始化客户端 + +```python +from powerrag.sdk import PowerRAGClient + +# 创建客户端 +client = PowerRAGClient( + api_key="your-api-key", + base_url="http://localhost:9380" +) +``` + +### 创建知识库 + +```python +# 创建知识库 +kb = client.knowledge_base.create( + name="my_knowledge_base", + description="My first knowledge base", + chunk_method="naive" +) +print(f"Knowledge Base ID: {kb['id']}") +``` + +### 上传文档 + +```python +# 上传单个文档 +docs = client.document.upload(kb['id'], "document.pdf") + +# 上传多个文档 +docs = client.document.upload(kb['id'], ["doc1.pdf", "doc2.pdf", "doc3.pdf"]) +``` + +### 解析文档为切片 + +```python +# 异步解析文档为切片 +task_id = client.document.parse_to_chunk( + kb['id'], + [docs[0]['id']], + wait=False +) + +# 同步解析并等待完成 +results = client.document.parse_to_chunk( + kb['id'], + [docs[0]['id']], + wait=True, + delete_existing=False +) +``` + +### 解析文档为 Markdown + +```python +# 同步解析为 Markdown(不切分) +result = client.document.parse_to_md( + doc_id=docs[0]['id'], + config={ + "layout_recognize": "mineru", # 或 "dots_ocr" + "enable_ocr": False, + "enable_formula": False, + "enable_table": True + 
} +) +print(f"Markdown: {result['markdown']}") +print(f"Total images: {result['total_images']}") + +# 异步解析为 Markdown +task_id = client.document.parse_to_md_async( + doc_id=docs[0]['id'], + config={"layout_recognize": "mineru"} +) + +# 查询异步任务状态 +status = client.document.get_parse_to_md_status(task_id) +print(f"Status: {status['status']}") + +# 等待任务完成 +result = client.document.wait_for_parse_to_md(task_id, timeout=300) + +# 直接上传文件并解析为 Markdown +result = client.document.parse_to_md_upload( + "document.pdf", + config={"layout_recognize": "mineru"} +) +``` + +### 检索 + +```python +# 执行检索 +result = client.retrieval.search( + kb_ids=[kb['id']], + question="什么是 PowerRAG?", + page_size=10, + similarity_threshold=0.2 +) + +# 打印结果 +for chunk in result['chunks']: + print(f"Content: {chunk['content']}") + print(f"Score: {chunk['similarity']}") + print(f"Document: {chunk['document_name']}") +``` + +## 核心功能亮点 + +### 文档解析为 Markdown + +PowerRAG SDK 提供了强大的文档解析为 Markdown 的功能,支持多种文档格式: + +**支持的格式:** +- PDF (.pdf) +- Office 文档 (.doc, .docx, .ppt, .pptx) +- 图片 (.jpg, .png) +- HTML (.html, .htm) +- Markdown (.md) + +**三种使用方式:** + +1. **同步解析**(适合小文档): +```python +result = client.document.parse_to_md(doc_id, config={...}) +``` + +2. **异步解析**(适合大文档): +```python +task_id = client.document.parse_to_md_async(doc_id, config={...}) +status = client.document.get_parse_to_md_status(task_id) +# 或等待完成 +result = client.document.wait_for_parse_to_md(task_id, timeout=300) +``` + +3. 
**直接上传解析**(无需知识库): +```python +result = client.document.parse_to_md_upload("file.pdf", config={...}) +``` + +**配置选项:** +- `layout_recognize`: 布局识别引擎 (`"mineru"` 或 `"dots_ocr"`) +- `enable_ocr`: 是否启用 OCR +- `enable_formula`: 是否识别公式 +- `enable_table`: 是否识别表格 +- `from_page`/`to_page`: PDF 页面范围 + +### 结构化信息抽取 + +支持使用 LangExtract 进行结构化信息抽取: + +```python +task = client.extraction.struct_extract( + text_or_documents="...", + prompt_description="Extract person information", + examples=[...], + temperature=0.0 +) +status = client.extraction.get_struct_extract_status(task['task_id']) +``` + +### 文本切片 + +无需上传文档即可对文本进行切片: + +```python +result = client.chunk.split_text( + text="# Title\n\nContent...", + parser_id="title", + config={"chunk_token_num": 512} +) +``` + +## 核心模块 + +PowerRAG SDK 包含以下 7 个核心模块: + +### 1. 知识库管理 (Knowledge Base) + +管理知识库的创建、查询、更新和删除。 + +```python +# 创建知识库 +kb = client.knowledge_base.create( + name="test_kb", + description="Test knowledge base", + embedding_model="BAAI/bge-small-en-v1.5@Builtin", + permission="me", + chunk_method="naive" +) + +# 获取知识库 +kb_info = client.knowledge_base.get(kb['id']) + +# 列出知识库 +kbs, total = client.knowledge_base.list( + name="test", + page=1, + page_size=10 +) + +# 更新知识库 +updated_kb = client.knowledge_base.update( + kb['id'], + description="Updated description", + pagerank=True +) + +# 删除知识库 +client.knowledge_base.delete([kb['id']]) +``` + +### 2. 
文档管理 (Document) + +处理文档的上传、列表、查询、更新、删除、下载和解析。 + +```python +# 上传文档 +docs = client.document.upload(kb_id, ["file1.pdf", "file2.docx"]) + +# 从URL上传文档 +success = client.document.upload_from_url( + kb_id, + url="https://example.com/doc.pdf", + name="document.pdf" +) + +# 列出文档 +docs, total = client.document.list( + kb_id, + name="report", + page=1, + page_size=20, + keywords="机器学习", # 关键词搜索 + suffix=["pdf", "docx"], # 按后缀过滤 + run=["DONE", "FAIL"] # 按状态过滤 +) + +# 获取文档详情 +doc = client.document.get(kb_id, doc_id) + +# 更新文档 +updated_doc = client.document.update( + kb_id, + doc_id, + name="new_name.pdf", + meta_fields={"author": "John", "category": "AI"}, + enabled=True +) + +# 快捷方法:重命名文档 +client.document.rename(kb_id, doc_id, "renamed.pdf") + +# 快捷方法:设置元数据 +client.document.set_meta(kb_id, doc_id, {"version": "1.0"}) + +# 下载文档 +# 下载为字节流 +file_bytes = client.document.download(kb_id, doc_id) + +# 下载到文件 +saved_path = client.document.download(kb_id, doc_id, save_path="downloaded.pdf") + +# 解析文档为切片(异步) +task_id = client.document.parse_to_chunk(kb_id, [doc_id], wait=False) + +# 解析文档为切片(同步等待) +results = client.document.parse_to_chunk( + kb_id, + [doc_id], + wait=True, + delete_existing=False, # 是否删除已有切片 + config={"max_token": 512} # 自定义配置 +) + +# 解析文档为 Markdown(同步) +result = client.document.parse_to_md( + doc_id, + config={ + "layout_recognize": "mineru", # mineru 或 dots_ocr + "enable_ocr": False, + "enable_formula": False, + "enable_table": True, + "from_page": 0, # PDF起始页 + "to_page": 100 # PDF结束页 + } +) +print(result['markdown']) + +# 解析文档为 Markdown(异步) +task_id = client.document.parse_to_md_async(doc_id, config={...}) + +# 查询 parse_to_md 任务状态 +status = client.document.get_parse_to_md_status(task_id) +if status["status"] == "success": + print(status["result"]["markdown"]) + +# 等待 parse_to_md 任务完成 +result = client.document.wait_for_parse_to_md(task_id, timeout=300) + +# 上传并解析为 Markdown(无需知识库) +result = client.document.parse_to_md_upload("file.pdf", config={...}) + +# 
解析URL文档(同步等待) +doc = client.document.parse_url( + kb_id, + url="https://example.com/doc.pdf", + name="web_doc.pdf", + wait=True +) + +# 取消解析任务 +client.document.cancel_parse(kb_id, [doc_id]) + +# 删除文档 +client.document.delete(kb_id, [doc_id]) +``` + +### 3. 切片管理 (Chunk) + +管理文档切片的查询、创建、更新、删除和文本切片。 + +```python +# 列出文档的切片 +chunks, total, doc_info = client.chunk.list( + kb_id, + doc_id, + keywords="机器学习", + page=1, + page_size=30 +) + +# 获取切片详情 +chunk = client.chunk.get(kb_id, doc_id, chunk_id) + +# 创建切片 +chunk = client.chunk.create( + kb_id, + doc_id, + content="This is a chunk content", + important_keywords=["keyword1", "keyword2"], + questions=["What is this about?"] +) + +# 更新切片 +updated_chunk = client.chunk.update( + kb_id, + doc_id, + chunk_id, + content="Updated content", + important_keywords=["new_keyword"], + questions=["Updated question?"], + available=True, + positions=[[0, 100]] +) + +# 删除切片 +client.chunk.delete(kb_id, doc_id, [chunk_id]) + +# 删除文档的所有切片 +client.chunk.delete(kb_id, doc_id, None) + +# 文本切片(无需上传文档) +result = client.chunk.split_text( + text="# Title\n\nLong text to be chunked...", + parser_id="title", # 解析器ID + config={"chunk_token_num": 512} # 自定义配置 +) +print(f"Total chunks: {result['total_chunks']}") +for chunk in result['chunks']: + print(chunk['content']) +``` + +### 4. 
信息抽取 (Extraction) + +从文档或文本中抽取实体、关键词、摘要等信息。 + +```python +# 从文档抽取 +result = client.extraction.extract_from_document( + doc_id=doc_id, + extractor_type="entity", # entity, keyword, summary + config={ + "entity_types": ["PERSON", "ORG", "LOC"], + "use_regex": True, + "use_llm": False + } +) +print(result['entities']) + +# 从文本抽取 +result = client.extraction.extract_from_text( + text="PowerRAG is an advanced RAG framework developed by OceanBase", + extractor_type="entity", + config={"entity_types": ["ORG", "PRODUCT"]} +) + +# 抽取关键词 +result = client.extraction.extract_from_document( + doc_id=doc_id, + extractor_type="keyword", + config={ + "max_keywords": 20, + "min_word_length": 3 + } +) + +# 抽取摘要 +result = client.extraction.extract_from_document( + doc_id=doc_id, + extractor_type="summary", + config={ + "max_length": 200, + "min_length": 50 + } +) + +# 批量抽取 +results = client.extraction.extract_batch( + doc_ids=[doc_id1, doc_id2, doc_id3], + extractor_type="keyword", + config={"max_keywords": 15} +) +for result in results: + if result['success']: + print(f"Doc {result['doc_id']}: {result['data']}") + +# 结构化抽取 (LangExtract) +task = client.extraction.struct_extract( + text_or_documents="John Doe is 30 years old. His email is john@example.com", + prompt_description="Extract person information including name, age, and email", + examples=[ + { + "text": "Jane Smith is 25 years old. Email: jane@example.com", + "extractions": [ + {"name": "Jane Smith", "age": 25, "email": "jane@example.com"} + ] + } + ], + fetch_urls=False, + max_char_buffer=1000, + temperature=0.0, + extraction_passes=1 +) +print(f"Task ID: {task['task_id']}") + +# 获取结构化抽取状态 +status = client.extraction.get_struct_extract_status(task['task_id']) +print(f"Status: {status['status']}") +if status['status'] == 'completed': + print(f"Result: {status['result']}") +``` + +### 5. 
RAPTOR + +构建和管理 RAPTOR(Recursive Abstractive Processing for Tree-Organized Retrieval)。 + +**注意**: RAPTOR 的配置参数需要在创建或更新知识库时通过 `parser_config.raptor` 设置。 + +```python +# 创建知识库时配置 RAPTOR 参数 +kb = client.knowledge_base.create( + name="raptor_kb", + chunk_method="naive", + parser_config={ + "raptor": { + "max_cluster": 64, + "random_seed": 224, + "llm_model": "deepseek-chat" + } + } +) + +# 构建 RAPTOR(异步) +task = client.raptor.build(kb_id) +print(f"RAPTOR Task ID: {task['raptor_task_id']}") + +# 获取 RAPTOR 构建状态 +status = client.raptor.get_status(kb_id) +if status: + print(f"Status: {status['status']}") + print(f"Progress: {status['progress']}") +else: + print("No RAPTOR task found") +``` + +### 6. 知识图谱 (Knowledge Graph) + +构建和管理知识图谱。 + +**注意**: 知识图谱的配置参数需要在创建或更新知识库时通过 `parser_config.graphrag` 设置。 + +```python +# 创建知识库时配置知识图谱参数 +kb = client.knowledge_base.create( + name="kg_kb", + chunk_method="naive", + parser_config={ + "graphrag": { + "entity_types": ["PERSON", "ORG", "LOC", "EVENT"], + "llm_model": "deepseek-chat" + } + } +) + +# 构建知识图谱(异步) +task = client.knowledge_graph.build(kb_id) +print(f"Knowledge Graph Task ID: {task['graphrag_task_id']}") + +# 获取知识图谱数据 +kg = client.knowledge_graph.get(kb_id) +print(f"Graph nodes: {len(kg['graph'].get('nodes', []))}") +print(f"Graph edges: {len(kg['graph'].get('edges', []))}") +print(f"Mind map: {kg['mind_map']}") + +# 获取构建状态 +status = client.knowledge_graph.get_status(kb_id) +if status: + print(f"Status: {status['status']}") + print(f"Progress: {status['progress']}") +else: + print("No knowledge graph task found") +``` + +### 7. 
检索 (Retrieval) + +执行语义检索和混合检索。 + +```python +# 基本检索 +result = client.retrieval.search( + kb_ids=[kb_id], + question="What is PowerRAG?", + page=1, + page_size=10 +) + +# 打印结果 +print(f"Total results: {result['total']}") +for chunk in result['chunks']: + print(f"Content: {chunk['content']}") + print(f"Similarity: {chunk['similarity']}") + print(f"Document: {chunk['document_name']}") + +# 高级检索 +result = client.retrieval.search( + kb_ids=[kb_id1, kb_id2], + question="机器学习的应用", + document_ids=[doc_id], # 限定文档范围 + page=1, + page_size=30, + similarity_threshold=0.3, # 相似度阈值 + vector_similarity_weight=0.3, # 向量相似度权重(混合检索) + top_k=1024, # 最大返回数量 + keyword=True, # 启用关键词增强 + use_kg=True, # 使用知识图谱检索 + rerank_id="bge-reranker-v2-m3", # 重排序模型 + highlight=True, # 高亮匹配内容 + cross_languages=["en", "zh"], # 跨语言检索 + metadata_condition={"status": "published", "year": 2024} # 元数据过滤 +) + +# 检索测试(与 search 功能相同,用于测试场景) +test_result = client.retrieval.test( + kb_ids=[kb_id], + question="测试查询", + page=1, + page_size=50, + similarity_threshold=0.2, + keyword=True, + use_kg=False +) +``` + +## 完整示例 + +以下是一个完整的工作流程示例: + +```python +from powerrag.sdk import PowerRAGClient +import time + +# 初始化客户端 +client = PowerRAGClient( + api_key="your-api-key", + base_url="http://localhost:9380" +) + +# 1. 创建知识库 +kb = client.knowledge_base.create( + name="research_papers", + description="Collection of AI research papers", + chunk_method="naive" +) +print(f"Created knowledge base: {kb['id']}") + +# 2. 上传文档 +docs = client.document.upload( + kb['id'], + ["paper1.pdf", "paper2.pdf", "paper3.pdf"] +) +print(f"Uploaded {len(docs)} documents") + +# 3. 解析文档为切片(同步等待) +doc_ids = [doc['id'] for doc in docs] +results = client.document.parse_to_chunk(kb['id'], doc_ids, wait=True) +print(f"Parsed {len(results)} documents") +for result in results: + print(f"Doc {result['doc_id']}: {result['status']}, {result['chunk_count']} chunks") + +# 4. 
构建知识图谱(可选) +kg_task = client.knowledge_graph.build(kb['id']) +print(f"Building knowledge graph: {kg_task['graphrag_task_id']}") + +# 等待知识图谱构建完成 +while True: + status = client.knowledge_graph.get_status(kb['id']) + if not status: + break + print(f"KG status: {status['status']}, progress: {status.get('progress', 0)}") + if status['status'] in ['DONE', 'FAIL']: + break + time.sleep(5) + +# 5. 执行检索 +result = client.retrieval.search( + kb_ids=[kb['id']], + question="What are the latest advances in transformer models?", + page_size=5, + similarity_threshold=0.2, + use_kg=True, + highlight=True +) + +# 打印检索结果 +print(f"\nFound {result['total']} results:") +for i, chunk in enumerate(result['chunks'], 1): + print(f"\n{i}. Score: {chunk['similarity']:.3f}") + print(f" Content: {chunk['content'][:200]}...") + print(f" Document: {chunk['document_name']}") + +# 6. 从文档抽取关键信息 +for doc_id in doc_ids[:3]: # 抽取前3个文档 + extraction = client.extraction.extract_from_document( + doc_id=doc_id, + extractor_type="keyword", + config={"max_keywords": 10} + ) + print(f"\nExtracted keywords from {doc_id}: {extraction.get('keywords', [])}") + +# 7. 解析文档为 Markdown(可选) +md_result = client.document.parse_to_md( + doc_id=doc_ids[0], + config={"layout_recognize": "mineru"} +) +print(f"\nMarkdown length: {md_result['markdown_length']}") +print(f"Total images: {md_result['total_images']}") + +# 8. 
清理(如需要) +# 删除特定文档 +# client.document.delete(kb['id'], [doc_ids[0]]) + +# 删除整个知识库(包括所有文档) +# client.knowledge_base.delete([kb['id']]) +``` + +## 环境配置 + +使用 SDK 前需要配置以下环境变量(可选): + +```bash +# PowerRAG 服务地址 +export HOST_ADDRESS="http://127.0.0.1:9380" + +# API 密钥 +export POWERRAG_API_KEY="your-api-key" +``` + +或在代码中直接指定: + +```python +client = PowerRAGClient( + api_key="your-api-key", + base_url="http://127.0.0.1:9380", + version="v1" # API 版本,默认为 v1 +) +``` + +## 测试 + +SDK 包含完整的测试套件,覆盖所有功能模块。 + +### 运行所有测试 + +```bash +# 设置环境变量 +export HOST_ADDRESS="http://127.0.0.1:9380" +export POWERRAG_API_KEY="your-api-key" + +# 运行测试 +pytest powerrag/sdk/tests/ +``` + +### 运行特定模块测试 + +```bash +# 测试知识库模块 +pytest powerrag/sdk/tests/test_knowledge_base.py + +# 测试文档模块 +pytest powerrag/sdk/tests/test_document.py + +# 测试检索模块 +pytest powerrag/sdk/tests/test_retrieval.py +``` + +更多测试说明请参考 [tests/README.md](tests/README.md)。 + +## 项目结构 + +``` +powerrag/sdk/ +├── __init__.py # SDK 入口,导出 PowerRAGClient +├── client.py # 主客户端类,提供 HTTP 请求方法 +├── README.md # SDK 文档(本文件) +├── modules/ # 功能模块 +│ ├── knowledge_base.py # 知识库数据模型 (TypedDict) +│ ├── knowledge_base_manager.py # 知识库管理器 +│ ├── document.py # 文档数据模型 (TypedDict) +│ ├── document_manager.py # 文档管理器 +│ ├── chunk.py # 切片数据模型 (TypedDict) +│ ├── chunk_manager.py # 切片管理器 +│ ├── extraction.py # 抽取数据模型 (TypedDict) +│ ├── extraction_manager.py # 抽取管理器 +│ ├── raptor.py # RAPTOR 数据模型 (TypedDict) +│ ├── raptor_manager.py # RAPTOR 管理器 +│ ├── knowledge_graph.py # 知识图谱数据模型 (TypedDict) +│ ├── knowledge_graph_manager.py # 知识图谱管理器 +│ ├── retrieval.py # 检索数据模型 (TypedDict) +│ └── retrieval_manager.py # 检索管理器 +└── tests/ # 完整的测试套件 + ├── README.md # 测试文档 + ├── conftest.py # pytest 配置和 fixtures + ├── pytest.ini # pytest 配置文件 + ├── test_knowledge_base.py # 知识库测试 + ├── test_document.py # 文档测试 + ├── test_chunk.py # 切片测试 + ├── test_extraction.py # 抽取测试 + ├── test_raptor.py # RAPTOR 测试 + ├── test_knowledge_graph.py # 知识图谱测试 + └── test_retrieval.py # 检索测试 +``` + +## API 
参考 + +### PowerRAGClient + +主客户端类,提供对所有功能模块的访问。 + +**初始化参数:** +- `api_key` (str): API 密钥,必填 +- `base_url` (str): 服务地址,默认 `"http://localhost:9380"` +- `version` (str): API 版本,默认 `"v1"` + +**属性:** +- `knowledge_base` (KnowledgeBaseManager): 知识库管理器 +- `document` (DocumentManager): 文档管理器 +- `chunk` (ChunkManager): 切片管理器 +- `extraction` (ExtractionManager): 抽取管理器 +- `raptor` (RAPTORManager): RAPTOR 管理器 +- `knowledge_graph` (KnowledgeGraphManager): 知识图谱管理器 +- `retrieval` (RetrievalManager): 检索管理器 + +**内部方法:** +- `post(url, json=None, files=None, data=None, stream=False)`: POST 请求 +- `get(url, params=None, stream=False)`: GET 请求 +- `put(url, json=None)`: PUT 请求 +- `delete(url, json=None, params=None)`: DELETE 请求 + +### 数据模型 + +所有数据模型都使用 `TypedDict` 定义,提供完整的类型提示: + +**知识库相关:** +- `KnowledgeBaseInfo`: 知识库信息 + +**文档相关:** +- `DocumentInfo`: 文档信息 + +**切片相关:** +- `ChunkInfo`: 切片信息 + +**抽取相关:** +- `ExtractionResult`: 抽取结果 +- `StructExtractTaskInfo`: 结构化抽取任务信息 + +**RAPTOR 相关:** +- `RAPTORTaskInfo`: RAPTOR 任务信息 + +**知识图谱相关:** +- `KnowledgeGraphData`: 知识图谱数据 +- `KnowledgeGraphTaskInfo`: 知识图谱任务信息 + +**检索相关:** +- `RetrievalResult`: 检索结果 + +## 最佳实践 + +### 1. 错误处理 + +SDK 会抛出异常,建议在生产环境中进行适当的错误处理: + +```python +from requests.exceptions import RequestException, HTTPError + +try: + result = client.retrieval.search( + kb_ids=[kb_id], + question="test query" + ) +except RequestException as e: + print(f"Network error: {e}") +except Exception as e: + print(f"API error: {e}") +``` + +### 2. 异步任务处理 + +对于长时间运行的任务,建议使用异步方式: + +```python +# 提交任务 +task_id = client.document.parse_to_chunk(kb_id, doc_ids, wait=False) + +# 轮询状态 +import time +for doc_id in doc_ids: + while True: + doc = client.document.get(kb_id, doc_id) + if doc['run'] in ['DONE', 'FAIL']: + break + time.sleep(2) +``` + +### 3. 
批量操作 + +充分利用批量操作提高效率: + +```python +# 批量上传 +docs = client.document.upload(kb_id, ["doc1.pdf", "doc2.pdf", "doc3.pdf"]) + +# 批量解析 +doc_ids = [doc['id'] for doc in docs] +results = client.document.parse_to_chunk(kb_id, doc_ids, wait=True) + +# 批量抽取 +results = client.extraction.extract_batch(doc_ids, extractor_type="keyword") +``` + +### 4. 知识库配置 + +在创建知识库时就配置好所需的参数: + +```python +kb = client.knowledge_base.create( + name="my_kb", + chunk_method="naive", + embedding_model="BAAI/bge-large-zh-v1.5@Builtin", + parser_config={ + "chunk_token_num": 512, + "raptor": { + "max_cluster": 64, + "llm_model": "deepseek-chat" + }, + "graphrag": { + "entity_types": ["PERSON", "ORG", "LOC"], + "llm_model": "deepseek-chat" + } + } +) +``` + +### 5. 检索优化 + +根据场景调整检索参数: + +```python +# 精确检索(高阈值) +result = client.retrieval.search( + kb_ids=[kb_id], + question="query", + similarity_threshold=0.5, # 更高的阈值 + page_size=10 +) + +# 召回优化(低阈值 + 大top_k + 重排序) +result = client.retrieval.search( + kb_ids=[kb_id], + question="query", + similarity_threshold=0.1, # 更低的阈值 + top_k=2048, # 更大的候选集 + rerank_id="bge-reranker-v2-m3", # 使用重排序 + keyword=True, # 启用关键词 + use_kg=True # 使用知识图谱 +) +``` + +## 错误处理 + +SDK 会抛出以下类型的异常: + +**常见异常:** +- `FileNotFoundError`: 文件不存在 +- `Exception`: API 调用失败(包含错误消息) +- `TimeoutError`: 任务超时(仅在使用 `wait_for_*` 方法时) +- `RequestException`: 网络请求错误 + +**示例:** + +```python +try: + kb = client.knowledge_base.get("nonexistent-id") +except Exception as e: + print(f"Error: {e}") +``` + +```python +from requests.exceptions import RequestException + +try: + result = client.retrieval.search( + kb_ids=[kb_id], + question="test query" + ) +except RequestException as e: + print(f"Network error: {e}") +except Exception as e: + print(f"API error: {e}") +``` + +## 常见问题 (FAQ) + +### Q: 如何处理大文档的解析? + +A: 对于大文档,建议使用异步解析: +```python +# 使用异步解析 +task_id = client.document.parse_to_md_async(doc_id) +result = client.document.wait_for_parse_to_md(task_id, timeout=600) +``` + +### Q: RAPTOR 和知识图谱的配置在哪里设置? 
+ +A: 需要在创建或更新知识库时通过 `parser_config` 设置: +```python +kb = client.knowledge_base.create( + name="my_kb", + parser_config={ + "raptor": {"max_cluster": 64}, + "graphrag": {"entity_types": ["PERSON", "ORG"]} + } +) +``` + +### Q: 如何查看知识库的 RAPTOR 和知识图谱状态? + +A: 使用对应的 `get_status` 方法: +```python +raptor_status = client.raptor.get_status(kb_id) +kg_status = client.knowledge_graph.get_status(kb_id) +``` +返回 `None` 表示没有运行中的任务。 + +### Q: 如何实现混合检索? + +A: 调整 `vector_similarity_weight` 参数和启用 `keyword`: +```python +result = client.retrieval.search( + kb_ids=[kb_id], + question="query", + vector_similarity_weight=0.3, # 向量权重 + keyword=True, # 启用关键词 + use_kg=True # 使用知识图谱 +) +``` + +### Q: 支持哪些抽取类型? + +A: 支持三种抽取类型: +- `entity`: 实体抽取(人名、地名、组织等) +- `keyword`: 关键词抽取 +- `summary`: 摘要生成 + +还支持结构化抽取 (`struct_extract`),可以自定义抽取模式。 + +### Q: 如何处理解析失败的文档? + +A: 检查文档状态并根据错误信息处理: +```python +results = client.document.parse_to_chunk(kb_id, doc_ids, wait=True) +for result in results: + if result['status'] == 'FAIL': + print(f"Document {result['doc_id']} failed to parse") + # 重新解析或删除 +``` + +### Q: SDK 是否支持流式返回? + +A: 当前版本主要支持标准 REST API 调用。对于下载等操作,SDK 内部使用了流式传输。 + +### Q: 如何设置请求超时? + +A: SDK 默认不会为 HTTP 请求设置超时:`requests` 在未指定 `timeout` 时会一直等待服务端响应。注意 `requests.adapters.DEFAULT_RETRIES` 控制的是重试次数,而不是超时时间,修改它并不能限制请求耗时。对于耗时较长的解析任务,建议使用异步接口,并通过 `wait_for_parse_to_md` 的 `timeout` 参数限制最长等待时间(超时会抛出 `TimeoutError`): +```python +task_id = client.document.parse_to_md_async(doc_id) +result = client.document.wait_for_parse_to_md(task_id, timeout=600) +``` + +## 贡献 + +欢迎贡献代码!请遵循以下步骤: + +1. Fork 本仓库 +2. 创建特性分支 (`git checkout -b feature/amazing-feature`) +3. 提交更改 (`git commit -m 'Add amazing feature'`) +4. 推送到分支 (`git push origin feature/amazing-feature`) +5. 创建 Pull Request + +**贡献指南:** +- 遵循 PEP 8 代码规范 +- 添加适当的类型注解 +- 为新功能添加测试用例 +- 更新相关文档 + +## 许可证 + +Copyright 2025 The OceanBase Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## 链接 + +- [PowerRAG 项目主页](https://github.com/oceanbase/powerrag) +- [API 文档](https://github.com/oceanbase/powerrag/docs) +- [问题反馈](https://github.com/oceanbase/powerrag/issues) +- [更新日志](https://github.com/oceanbase/powerrag/CHANGELOG.md) + +## 支持 + +### 获取帮助 + +如有问题或建议,请: + +1. 📖 查看 [完整文档](https://github.com/oceanbase/powerrag/docs) +2. 🔍 搜索 [已有问题](https://github.com/oceanbase/powerrag/issues) +3. 💬 创建 [新问题](https://github.com/oceanbase/powerrag/issues/new) +4. 📧 联系 OceanBase 团队 + +### 社区 + +- GitHub: [oceanbase/powerrag](https://github.com/oceanbase/powerrag) +- 文档: [PowerRAG Documentation](https://github.com/oceanbase/powerrag/docs) +- 问题跟踪: [GitHub Issues](https://github.com/oceanbase/powerrag/issues) + +### 反馈 + +我们非常重视您的反馈!如果您: +- 发现了 bug +- 有功能建议 +- 需要帮助 +- 想要贡献代码 + +请通过 GitHub Issues 联系我们。 + +--- + +**Made with ❤️ by OceanBase Team** + + diff --git a/powerrag/sdk/__init__.py b/powerrag/sdk/__init__.py new file mode 100644 index 000000000..90d01aaf0 --- /dev/null +++ b/powerrag/sdk/__init__.py @@ -0,0 +1,29 @@ +# +# Copyright 2025 The OceanBase Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +PowerRAG SDK + +A Python SDK for PowerRAG API, providing easy-to-use interfaces for knowledge base management, +document processing, chunking, extraction, RAPTOR, knowledge graph, and retrieval. +""" + +from .client import PowerRAGClient + +__all__ = ["PowerRAGClient"] + +# Alias for convenience +PowerRAG = PowerRAGClient \ No newline at end of file diff --git a/powerrag/sdk/client.py b/powerrag/sdk/client.py new file mode 100644 index 000000000..c9c44810e --- /dev/null +++ b/powerrag/sdk/client.py @@ -0,0 +1,155 @@ +# +# Copyright 2025 The OceanBase Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

import requests
from typing import Optional, Dict, Any, Tuple, Union

from .modules.knowledge_base_manager import KnowledgeBaseManager
from .modules.document_manager import DocumentManager
from .modules.chunk_manager import ChunkManager
from .modules.extraction_manager import ExtractionManager
from .modules.raptor_manager import RAPTORManager
from .modules.knowledge_graph_manager import KnowledgeGraphManager
from .modules.retrieval_manager import RetrievalManager


class PowerRAGClient:
    """Main client of the PowerRAG SDK.

    Holds the connection settings (API key, base URL, API version) and exposes
    one manager object per feature area. The ``post``/``get``/``put``/``delete``
    helpers are thin wrappers around ``requests`` that prefix the API root and
    attach the ``Authorization`` header; the managers build on them.
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = "http://localhost:9380",
        version: str = "v1",
        *,
        timeout: Optional[Union[float, Tuple[float, float]]] = None,
    ):
        """
        Initialise the client.

        Args:
            api_key: API key, sent as ``Authorization: Bearer <api_key>``.
            base_url: Service address; a trailing ``/`` is stripped.
            version: API version, defaults to ``v1``.
            timeout: Timeout forwarded to every ``requests`` call, either a
                number of seconds or a ``(connect, read)`` tuple. The default
                ``None`` keeps the previous behaviour: no timeout, i.e. a call
                waits for the server indefinitely.
        """
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.api_url = f"{self.base_url}/api/{version}"
        self.authorization_header = {"Authorization": f"Bearer {self.api_key}"}
        self.timeout = timeout

        # One manager per feature area; each keeps a reference to this client.
        self.knowledge_base = KnowledgeBaseManager(self)
        self.document = DocumentManager(self)
        self.chunk = ChunkManager(self)
        self.extraction = ExtractionManager(self)
        self.raptor = RAPTORManager(self)
        self.knowledge_graph = KnowledgeGraphManager(self)
        self.retrieval = RetrievalManager(self)

    def post(self, url: str, json=None, files=None, data=None, stream=False):
        """
        Send a POST request.

        Args:
            url: Path relative to the API root (e.g. ``/datasets``).
            json: JSON payload.
            files: Files for a multipart upload.
            data: Form data.
            stream: Whether to stream the response body.

        Returns:
            The ``requests.Response`` object.
        """
        headers = self.authorization_header.copy()

        # For multipart uploads Content-Type must be left to requests so it can
        # add the boundary. Otherwise it is set explicitly only for a non-empty
        # JSON payload (truthiness check kept from the original code so that a
        # form-only request with an empty ``json`` is not mislabelled).
        if json and not files:
            headers["Content-Type"] = "application/json"

        return requests.post(
            url=self.api_url + url,
            json=json,
            files=files,
            data=data,
            headers=headers,
            stream=stream,
            timeout=self.timeout,
        )

    def get(self, url: str, params=None, stream=False):
        """
        Send a GET request.

        Args:
            url: Path relative to the API root.
            params: Query parameters.
            stream: Whether to stream the response body.

        Returns:
            The ``requests.Response`` object.
        """
        return requests.get(
            url=self.api_url + url,
            params=params,
            headers=self.authorization_header,
            stream=stream,
            timeout=self.timeout,
        )

    def put(self, url: str, json=None):
        """
        Send a PUT request with a JSON body.

        Args:
            url: Path relative to the API root.
            json: JSON payload.

        Returns:
            The ``requests.Response`` object.
        """
        headers = self.authorization_header.copy()
        headers["Content-Type"] = "application/json"
        return requests.put(
            url=self.api_url + url,
            json=json,
            headers=headers,
            timeout=self.timeout,
        )

    def delete(self, url: str, json=None, params=None):
        """
        Send a DELETE request.

        Args:
            url: Path relative to the API root.
            json: JSON payload.
            params: Query parameters.

        Returns:
            The ``requests.Response`` object.
        """
        headers = self.authorization_header.copy()
        if json:
            headers["Content-Type"] = "application/json"
        return requests.delete(
            url=self.api_url + url,
            json=json,
            params=params,
            headers=headers,
            timeout=self.timeout,
        )
#

from typing import TypedDict, Optional, List, Dict, Any


class ChunkInfo(TypedDict, total=False):
    """Type definition for a chunk record.

    Declared with ``total=False``, so every key is optional for type checkers.
    """
    id: str
    content: str
    document_id: str
    dataset_id: str  # knowledge base ID
    important_keywords: List[str]
    questions: List[str]
    image_id: Optional[str]
    available: bool
    positions: List[List[int]]  # position info; each inner list holds 5 integers
    docnm_kwd: str  # document name keyword
#

from typing import Optional, List, Dict, Any, Tuple
from .chunk import ChunkInfo


class ChunkManager:
    """Chunk management module: list/get/create/update/delete the chunks of a
    document, and split raw text into chunks without uploading a document."""

    def __init__(self, client):
        """
        Initialise the chunk management module.

        Args:
            client: PowerRAG client instance used to issue HTTP requests.
        """
        self.client = client

    @staticmethod
    def _raise_on_error(res_json: Dict[str, Any], fallback: str) -> None:
        """Raise ``Exception`` with the server message (or *fallback*) unless
        the response envelope carries ``code == 0``."""
        if res_json.get("code") != 0:
            raise Exception(res_json.get("message", fallback))

    def list(
        self,
        kb_id: str,
        doc_id: str,
        id: Optional[str] = None,
        keywords: Optional[str] = None,
        page: int = 1,
        page_size: int = 30,
    ) -> Tuple[List[ChunkInfo], int, Dict[str, Any]]:
        """
        List the chunks of a document.

        Args:
            kb_id: Knowledge base ID.
            doc_id: Document ID.
            id: Chunk ID (optional, for an exact lookup).
            keywords: Keyword search (optional).
            page: Page number, defaults to 1.
            page_size: Page size, defaults to 30.

        Returns:
            ``(chunks, total, doc)``: the chunk list, the total count and the
            document info, each falling back to an empty value when the key is
            missing from the response.

        Raises:
            Exception: The API call failed.
        """
        params = {
            "page": page,
            "page_size": page_size,
        }

        # Optional filters are only sent when they carry a value.
        if id:
            params["id"] = id
        if keywords:
            params["keywords"] = keywords

        url = f"/datasets/{kb_id}/documents/{doc_id}/chunks"
        res_json = self.client.get(url, params=params).json()
        self._raise_on_error(res_json, "List chunks failed")

        data = res_json.get("data", {})
        return data.get("chunks", []), data.get("total", 0), data.get("doc", {})

    def get(self, kb_id: str, doc_id: str, chunk_id: str) -> ChunkInfo:
        """
        Fetch a single chunk.

        Implemented as a ``list`` call filtered by chunk ID.

        Args:
            kb_id: Knowledge base ID.
            doc_id: Document ID.
            chunk_id: Chunk ID.

        Returns:
            The chunk info.

        Raises:
            Exception: The API call failed or the chunk does not exist.
        """
        chunks, _total, _doc = self.list(kb_id, doc_id, id=chunk_id, page_size=1)
        if not chunks:
            raise Exception(f"Chunk '{chunk_id}' not found")
        return chunks[0]

    def create(
        self,
        kb_id: str,
        doc_id: str,
        content: str,
        important_keywords: Optional[List[str]] = None,
        questions: Optional[List[str]] = None,
    ) -> ChunkInfo:
        """
        Create a chunk.

        Args:
            kb_id: Knowledge base ID.
            doc_id: Document ID.
            content: Chunk content.
            important_keywords: Important keywords (optional).
            questions: Questions (optional).

        Returns:
            The created chunk info (``data.chunk`` of the response).

        Raises:
            Exception: The API call failed.
        """
        payload = {
            "content": content,
        }

        if important_keywords is not None:
            payload["important_keywords"] = important_keywords
        if questions is not None:
            payload["questions"] = questions

        url = f"/datasets/{kb_id}/documents/{doc_id}/chunks"
        res_json = self.client.post(url, json=payload).json()
        self._raise_on_error(res_json, "Create chunk failed")

        return res_json.get("data", {}).get("chunk", {})

    def update(
        self,
        kb_id: str,
        doc_id: str,
        chunk_id: str,
        content: Optional[str] = None,
        important_keywords: Optional[List[str]] = None,
        questions: Optional[List[str]] = None,
        available: Optional[bool] = None,
        positions: Optional[List[List[int]]] = None,
    ) -> ChunkInfo:
        """
        Update a chunk.

        Only the arguments that are not ``None`` are sent to the server.

        Args:
            kb_id: Knowledge base ID.
            doc_id: Document ID.
            chunk_id: Chunk ID.
            content: Chunk content (optional).
            important_keywords: Important keywords (optional).
            questions: Questions (optional).
            available: Whether the chunk is available (optional).
            positions: Position info (optional).

        Returns:
            The updated chunk info.

        Raises:
            Exception: No field was given, or the API call failed.
        """
        candidates = {
            "content": content,
            "important_keywords": important_keywords,
            "questions": questions,
            "available": available,
            "positions": positions,
        }
        update_data = {key: value for key, value in candidates.items() if value is not None}

        if not update_data:
            raise Exception("No fields to update")

        url = f"/datasets/{kb_id}/documents/{doc_id}/chunks/{chunk_id}"
        res_json = self.client.put(url, json=update_data).json()
        self._raise_on_error(res_json, "Update chunk failed")

        # The API reports success without returning the chunk, so fetch it again.
        return self.get(kb_id, doc_id, chunk_id)

    def delete(
        self,
        kb_id: str,
        doc_id: str,
        chunk_ids: Optional[List[str]] = None,
    ) -> None:
        """
        Delete chunks.

        Args:
            kb_id: Knowledge base ID.
            doc_id: Document ID.
            chunk_ids: Chunk IDs; when ``None`` an empty payload is sent, which
                is meant to delete all chunks of the document.

        Raises:
            Exception: The API call failed.
        """
        payload = {}
        if chunk_ids is not None:
            payload["chunk_ids"] = chunk_ids

        url = f"/datasets/{kb_id}/documents/{doc_id}/chunks"
        res_json = self.client.delete(url, json=payload).json()
        self._raise_on_error(res_json, "Delete chunks failed")

    def split_text(
        self,
        text: str,
        parser_id: str = "title",
        config: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Split text into chunks (no document upload required).

        Args:
            text: Text to split (Markdown).
            parser_id: Parser ID (defaults to ``"title"``).
            config: Parser configuration (optional).

        Returns:
            The ``data`` object of the response; per the original documentation
            it contains the ``chunks`` list and ``total_chunks``.

        Raises:
            Exception: The API call failed.
        """
        payload = {
            "text": text,
            "parser_id": parser_id,
        }

        if config:
            payload["config"] = config

        url = "/powerrag/split"
        res = self.client.post(url, json=payload)

        # Check the HTTP status first: an error page may not be JSON at all.
        if res.status_code != 200:
            try:
                error_json = res.json()
                error_msg = error_json.get("message", f"HTTP {res.status_code}")
            except Exception:
                error_msg = f"HTTP {res.status_code}: {res.text[:200]}"
            raise Exception(error_msg)

        res_json = res.json()
        self._raise_on_error(res_json, "Split text failed")

        return res_json.get("data", {})
#

from typing import TypedDict, Optional, Dict, Any, List


class DocumentInfo(TypedDict, total=False):
    """Type definition for a document record.

    Declared with ``total=False``, so every key is optional for type checkers.
    """
    id: str
    name: str
    dataset_id: str  # knowledge base ID
    chunk_count: int
    token_count: int
    chunk_method: str
    run: str  # UNSTART, RUNNING, CANCEL, DONE, FAIL
    progress: float  # 0.0-1.0
    progress_msg: Optional[str]
    type: str  # file type
    size: int  # file size in bytes
    suffix: str  # file suffix
    thumbnail: Optional[str]  # thumbnail
    create_time: int  # creation timestamp
    update_time: int  # update timestamp
    meta_fields: Optional[Dict[str, Any]]  # metadata fields
    enabled: bool  # whether the document is enabled
    parser_config: Optional[Dict[str, Any]]  # parser configuration
+# + +from typing import Optional, List, Dict, Any, Union +from pathlib import Path +from .document import DocumentInfo + + +class DocumentManager: + """文档管理模块""" + + def __init__(self, client): + """ + 初始化文档管理模块 + + Args: + client: PowerRAG客户端实例 + """ + self.client = client + + def upload( + self, + kb_id: str, + file_paths: Union[str, List[str]], + parent_path: Optional[str] = None, + ) -> List[DocumentInfo]: + """ + 上传文档到知识库 + + Args: + kb_id: 知识库ID + file_paths: 文件路径(单个文件或文件列表) + parent_path: 父路径(可选,用于嵌套文件夹) + + Returns: + 文档信息列表 + + Raises: + Exception: API调用失败 + """ + if isinstance(file_paths, str): + file_paths = [file_paths] + + files = [] + for file_path in file_paths: + path = Path(file_path) + if not path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + with open(path, "rb") as f: + files.append(("file", (path.name, f.read()))) + + form_data = {} + if parent_path: + form_data["parent_path"] = parent_path + + url = f"/datasets/{kb_id}/documents" + res = self.client.post(url, json=None, files=files, data=form_data) + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", "Upload documents failed")) + + return res_json.get("data", []) + + def upload_from_url( + self, + kb_id: str, + url: str, + name: str, + ) -> bool: + """ + 从URL上传文档到知识库 + + Args: + kb_id: 知识库ID + url: 文档URL + name: 文档名称 + + Returns: + 是否成功 + + Raises: + Exception: API调用失败 + """ + form_data = { + "kb_id": kb_id, + "name": name, + "url": url, + } + + res = self.client.post("/document/web_crawl", json=None, data=form_data) + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", "Upload from URL failed")) + + return res_json.get("data", False) + + def list( + self, + kb_id: str, + id: Optional[str] = None, + name: Optional[str] = None, + keywords: Optional[str] = None, + page: int = 1, + page_size: int = 30, + orderby: str = "create_time", + desc: bool = True, + create_time_from: int 
= 0, + create_time_to: int = 0, + suffix: Optional[List[str]] = None, + run: Optional[List[str]] = None, + ) -> tuple[List[DocumentInfo], int]: + """ + 列出知识库中的文档 + + Args: + kb_id: 知识库ID + id: 文档ID(可选) + name: 文档名称(可选) + keywords: 关键词搜索(可选) + page: 页码,默认1 + page_size: 每页数量,默认30 + orderby: 排序字段,默认create_time + desc: 是否降序,默认True + create_time_from: 创建时间起始(时间戳) + create_time_to: 创建时间结束(时间戳) + suffix: 文件后缀过滤(可选) + run: 运行状态过滤(可选,UNSTART/RUNNING/CANCEL/DONE/FAIL) + + Returns: + (文档列表, 总数) + + Raises: + Exception: API调用失败 + """ + params = { + "page": page, + "page_size": page_size, + "orderby": orderby, + "desc": desc, + } + + if id: + params["id"] = id + if name: + params["name"] = name + if keywords: + params["keywords"] = keywords + if create_time_from: + params["create_time_from"] = create_time_from + if create_time_to: + params["create_time_to"] = create_time_to + if suffix: + params["suffix"] = suffix + if run: + params["run"] = run + + url = f"/datasets/{kb_id}/documents" + res = self.client.get(url, params=params) + res_json = res.json() + + if res_json.get("code") == 0: + data = res_json.get("data", {}) + return data.get("docs", []), data.get("total", 0) + + raise Exception(res_json.get("message", "List documents failed")) + + def get(self, kb_id: str, doc_id: str) -> DocumentInfo: + """ + 获取文档信息 + + Args: + kb_id: 知识库ID + doc_id: 文档ID + + Returns: + 文档信息 + + Raises: + Exception: API调用失败或文档不存在 + """ + docs, _ = self.list(kb_id, id=doc_id, page_size=1) + if not docs: + raise Exception(f"Document '{doc_id}' not found") + return docs[0] + + def update( + self, + kb_id: str, + doc_id: str, + name: Optional[str] = None, + meta_fields: Optional[Dict[str, Any]] = None, + chunk_method: Optional[str] = None, + parser_config: Optional[Dict[str, Any]] = None, + enabled: Optional[bool] = None, + ) -> DocumentInfo: + """ + 更新文档 + + Args: + kb_id: 知识库ID + doc_id: 文档ID + name: 文档名称(可选) + meta_fields: 元数据字段(可选) + chunk_method: 切片方法(可选) + parser_config: 解析器配置(可选) + enabled: 
是否启用(可选) + + Returns: + 更新后的文档信息 + + Raises: + Exception: API调用失败 + """ + update_data = {} + + if name is not None: + update_data["name"] = name + if meta_fields is not None: + update_data["meta_fields"] = meta_fields + if chunk_method is not None: + update_data["chunk_method"] = chunk_method + if parser_config is not None: + update_data["parser_config"] = parser_config + if enabled is not None: + update_data["enabled"] = enabled + + if not update_data: + raise Exception("No fields to update") + + url = f"/datasets/{kb_id}/documents/{doc_id}" + res = self.client.put(url, json=update_data) + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", "Update document failed")) + + return res_json.get("data", {}) + + def rename(self, kb_id: str, doc_id: str, new_name: str) -> DocumentInfo: + """ + 重命名文档 + + Args: + kb_id: 知识库ID + doc_id: 文档ID + new_name: 新名称 + + Returns: + 更新后的文档信息 + + Raises: + Exception: API调用失败 + """ + return self.update(kb_id, doc_id, name=new_name) + + def set_meta(self, kb_id: str, doc_id: str, meta_fields: Dict[str, Any]) -> DocumentInfo: + """ + 设置文档元数据 + + Args: + kb_id: 知识库ID + doc_id: 文档ID + meta_fields: 元数据字段字典 + + Returns: + 更新后的文档信息 + + Raises: + Exception: API调用失败 + """ + return self.update(kb_id, doc_id, meta_fields=meta_fields) + + def delete(self, kb_id: str, doc_ids: Optional[List[str]] = None) -> None: + """ + 删除文档 + + Args: + kb_id: 知识库ID + doc_ids: 文档ID列表,如果为None则删除所有文档 + + Raises: + Exception: API调用失败 + """ + payload = {"ids": doc_ids} + url = f"/datasets/{kb_id}/documents" + res = self.client.delete(url, json=payload) + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", "Delete documents failed")) + + def download(self, kb_id: str, doc_id: str, save_path: Optional[str] = None) -> Union[bytes, str]: + """ + 下载文档 + + Args: + kb_id: 知识库ID + doc_id: 文档ID + save_path: 保存路径(可选),如果提供则保存到文件,否则返回字节流 + + Returns: + 如果提供save_path则返回文件路径,否则返回文件字节流 + + 
Raises: + Exception: API调用失败 + """ + url = f"/datasets/{kb_id}/documents/{doc_id}" + res = self.client.get(url, stream=True) + + if res.status_code != 200: + res_json = res.json() if res.headers.get("content-type", "").startswith("application/json") else {} + raise Exception(res_json.get("message", "Download document failed")) + + file_content = res.content + + if save_path: + Path(save_path).parent.mkdir(parents=True, exist_ok=True) + with open(save_path, "wb") as f: + f.write(file_content) + return save_path + + return file_content + + def parse_to_chunk( + self, + kb_id: str, + doc_ids: List[str], + wait: bool = True, + delete_existing: bool = False, + config: Optional[Dict[str, Any]] = None, + ) -> Union[List[Dict[str, Any]], str]: + """ + 解析文档为切片 + + Args: + kb_id: 知识库ID + doc_ids: 文档ID列表 + wait: 是否等待解析完成(默认True) + delete_existing: 是否删除已存在的切片(默认False) + config: 解析配置(可选) + + Returns: + 如果wait=True,返回解析结果列表;如果wait=False,返回任务ID + + Raises: + Exception: API调用失败 + """ + payload = { + "document_ids": doc_ids, + } + + if delete_existing: + payload["delete_existing"] = True + if config: + payload["config"] = config + + url = f"/datasets/{kb_id}/chunks" + res = self.client.post(url, json=payload) + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", "Parse documents failed")) + + if wait: + return self._wait_for_parse(kb_id, doc_ids) + + return res_json.get("data", {}).get("task_id", "") + + def parse_to_md_async( + self, + doc_id: str, + config: Optional[Dict[str, Any]] = None, + ) -> str: + """ + 异步解析文档为 Markdown(不切分) + + 提交异步解析任务,立即返回任务 ID。 + 适用于大文档或需要长时间处理的场景。 + + 支持的文件格式: + - PDF (.pdf) + - Office 文档 (.doc, .docx, .ppt, .pptx) + - 图片 (.jpg, .png) + - HTML (.html, .htm) + + Args: + doc_id: 文档ID + config: 解析配置(可选) + - layout_recognize: 布局识别引擎 (mineru 或 dots_ocr,默认 mineru) + - enable_ocr: 是否启用 OCR (默认 False) + - enable_formula: 是否识别公式 (默认 False) + - enable_table: 是否识别表格 (默认 True) + - from_page: 起始页(仅 PDF,默认 0) + - 
to_page: 结束页(仅 PDF,默认 100000) + + Returns: + task_id: 任务ID,用于查询任务状态和结果 + + Raises: + Exception: API调用失败 + + Example: + >>> # 提交异步任务 + >>> task_id = client.document.parse_to_md_async( + ... doc_id="doc_123", + ... config={"layout_recognize": "mineru"} + ... ) + >>> print(f"Task ID: {task_id}") + + >>> # 查询任务状态 + >>> status = client.document.get_parse_to_md_status(task_id) + >>> if status["status"] == "success": + ... print(f"Markdown: {status['result']['markdown']}") + """ + payload = { + "doc_id": doc_id, + } + if config: + payload["config"] = config + + url = "/powerrag/parse_to_md/async" + res = self.client.post(url, json=payload) + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", "Submit parse_to_md task failed")) + + return res_json.get("data", {}).get("task_id", "") + + def get_parse_to_md_status( + self, + task_id: str, + ) -> Dict[str, Any]: + """ + 查询 parse_to_md 异步任务状态 + + Args: + task_id: 任务ID(由 parse_to_md_async 返回) + + Returns: + 任务状态字典: + { + "task_id": "...", + "status": "pending|processing|success|failed|not_found", + "created_at": "2025-01-01T00:00:00", + "updated_at": "2025-01-01T00:00:00", + "result": { # 仅当 status="success" 时存在 + "doc_id": "...", + "doc_name": "...", + "markdown": "...", + "markdown_length": 5000, + "images": {...}, + "total_images": 2 + }, + "error": "..." # 仅当 status="failed" 时存在 + } + + Raises: + Exception: API调用失败 + + Example: + >>> status = client.document.get_parse_to_md_status(task_id) + >>> print(f"Status: {status['status']}") + >>> + >>> if status["status"] == "success": + ... result = status["result"] + ... print(f"Markdown length: {result['markdown_length']}") + >>> elif status["status"] == "failed": + ... print(f"Error: {status['error']}") + >>> elif status["status"] in ["pending", "processing"]: + ... 
def wait_for_parse_to_md(
    self,
    task_id: str,
    timeout: int = 300,
    interval: float = 2.0,
) -> Dict[str, Any]:
    """
    Block until an asynchronous parse_to_md task finishes.

    Polls ``get_parse_to_md_status`` until the task succeeds, fails, is
    reported as not found, or the timeout elapses.

    Args:
        task_id: Task ID returned by ``parse_to_md_async``.
        timeout: Maximum time to wait, in seconds (default 300).
        interval: Delay between polls, in seconds (default 2).

    Returns:
        The final status dictionary (same shape as
        ``get_parse_to_md_status``) once the status is ``"success"``.

    Raises:
        TimeoutError: The task did not finish within ``timeout`` seconds.
        Exception: The task failed, was not found, or the API call failed.

    Example:
        >>> task_id = client.document.parse_to_md_async(doc_id)
        >>> result = client.document.wait_for_parse_to_md(task_id, timeout=600)
        >>> print(f"Markdown: {result['result']['markdown']}")
    """
    import time

    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout

    while True:
        status = self.get_parse_to_md_status(task_id)
        # Use .get(): a payload without a "status" field is treated as
        # "still running" instead of crashing with KeyError.
        state = status.get("status")

        if state == "success":
            return status
        if state == "failed":
            raise Exception(f"Task failed: {status.get('error', 'Unknown error')}")
        if state == "not_found":
            raise Exception(f"Task not found: {task_id}")

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")

        # Never sleep past the deadline, so the timeout is honoured even
        # when the polling interval is large.
        time.sleep(max(0.0, min(interval, remaining)))
doc_id: 文档ID + config: 解析配置(可选) + - layout_recognize: 布局识别引擎 (mineru 或 dots_ocr,默认 mineru) + - enable_ocr: 是否启用 OCR (默认 False) + - enable_formula: 是否识别公式 (默认 False) + - enable_table: 是否识别表格 (默认 True) + - from_page: 起始页(仅 PDF,默认 0) + - to_page: 结束页(仅 PDF,默认 100000) + + Returns: + 解析结果字典: + { + "doc_id": "...", + "doc_name": "...", + "markdown": "...", # 完整的 Markdown 内容 + "markdown_length": 5000, # Markdown 长度 + "images": {...}, # 图片字典 (base64) + "total_images": 2 # 图片总数 + } + + Raises: + Exception: API调用失败 + + Example: + >>> result = doc_manager.parse_to_md( + ... doc_id="doc_123", + ... config={"layout_recognize": "mineru", "enable_ocr": False} + ... ) + >>> print(f"Markdown length: {result['markdown_length']}") + >>> print(f"First 200 chars: {result['markdown'][:200]}") + """ + payload = { + "doc_id": doc_id, + } + + if config: + payload["config"] = config + + url = "/powerrag/parse_to_md" + res = self.client.post(url, json=payload) + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", "Parse to markdown failed")) + + return res_json.get("data", {}) + + def parse_to_md_upload( + self, + file_path: str, + config: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """ + 上传文件并解析为Markdown(不切分) + + 直接上传文件并解析为 Markdown 格式,不进行切分。 + 不需要先上传到知识库,适合一次性解析场景。 + + 支持的文件格式: + - PDF (.pdf) + - Office 文档 (.doc, .docx, .ppt, .pptx) + - 图片 (.jpg, .png) + - HTML (.html, .htm) + + Args: + file_path: 文件路径 + config: 解析配置(可选),同 parse_to_md + + Returns: + 解析结果字典,包含以下字段: + - filename: 文件名 + - markdown: Markdown 内容 + - markdown_length: Markdown 长度 + - images: 图片字典 + - total_images: 图片总数 + + Raises: + FileNotFoundError: 文件不存在 + Exception: API调用失败 + + Example: + >>> result = doc_manager.parse_to_md_upload( + ... file_path="document.pdf", + ... config={"layout_recognize": "mineru"} + ... 
def _wait_for_parse(
    self,
    kb_id: str,
    doc_ids: List[str],
    timeout: Optional[float] = None,
    interval: float = 1.0,
    max_consecutive_errors: int = 30,
) -> List[Dict[str, Any]]:
    """
    Poll documents until each one has finished parsing (internal helper).

    A document is finished when its ``run`` field is ``DONE``, ``FAIL`` or
    ``CANCEL``, or when its ``progress`` has reached 1.0 (reported as
    ``DONE``).

    Args:
        kb_id: Knowledge base ID.
        doc_ids: Document IDs to wait for; duplicates are polled once.
        timeout: Overall limit in seconds. ``None`` (default) waits
            indefinitely, matching the previous behaviour.
        interval: Delay between polling rounds, in seconds.
        max_consecutive_errors: How many failed status lookups in a row are
            tolerated per document before giving up. Transient errors are
            still retried; only a persistent failure aborts the wait.

    Returns:
        One entry per document with ``doc_id``, ``status``, ``chunk_count``
        and ``token_count``, in the order the documents completed (input
        order within a polling round).

    Raises:
        TimeoutError: ``timeout`` elapsed with documents still pending.
        RuntimeError: Status lookups for a document kept failing.
    """
    import time

    terminal_states = {"DONE", "FAIL", "CANCEL"}
    # dict.fromkeys de-duplicates while keeping the caller's order, so the
    # result order is deterministic (a plain set is not).
    pending = list(dict.fromkeys(doc_ids))
    error_counts = dict.fromkeys(pending, 0)
    deadline = None if timeout is None else time.monotonic() + timeout
    results = []

    while pending:
        still_pending = []
        for doc_id in pending:
            # Keep the try body minimal: only the remote lookup is retried,
            # so programming errors below are not silently swallowed.
            try:
                doc = self.get(kb_id, doc_id)
            except Exception as exc:
                error_counts[doc_id] += 1
                if error_counts[doc_id] >= max_consecutive_errors:
                    raise RuntimeError(
                        f"Failed to get parse status of document '{doc_id}' "
                        f"after {error_counts[doc_id]} attempts: {exc}"
                    ) from exc
                still_pending.append(doc_id)
                continue

            error_counts[doc_id] = 0
            run_status = doc.get("run", "")

            if run_status not in terminal_states:
                # "progress" may be missing or null; treat both as 0.0.
                if (doc.get("progress") or 0.0) < 1.0:
                    still_pending.append(doc_id)
                    continue
                run_status = "DONE"

            results.append({
                "doc_id": doc_id,
                "status": run_status,
                "chunk_count": doc.get("chunk_count", 0),
                "token_count": doc.get("token_count", 0),
            })

        pending = still_pending
        if not pending:
            break

        if deadline is None:
            time.sleep(interval)
            continue

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(
                f"Documents {pending} did not finish parsing within {timeout} seconds"
            )
        time.sleep(max(0.0, min(interval, remaining)))

    return results
class ExtractionResult(TypedDict, total=False):
    """Type definition for an extraction result (all keys optional: total=False)."""
    doc_id: Optional[str]
    doc_name: Optional[str]
    extractor_type: str
    data: Dict[str, Any]  # Extracted data (entities / keywords / summary, etc.)
    metadata: Dict[str, Any]


class EntityInfo(TypedDict, total=False):
    """Entity information."""
    text: str
    type: str
    start: int
    end: int
    confidence: Optional[float]


class KeywordInfo(TypedDict, total=False):
    """Keyword information."""
    keyword: str
    score: float
    frequency: Optional[int]


class StructExtractTaskInfo(TypedDict, total=False):
    """Structured-extraction task information."""
    task_id: str
    status: str
    result: Optional[Dict[str, Any]]
class ExtractionManager:
    """Extraction management module: thin wrappers over the /powerrag extraction endpoints."""

    def __init__(self, client):
        """
        Initialise the extraction manager.

        Args:
            client: PowerRAG client instance used to issue HTTP requests.
        """
        self.client = client

    @staticmethod
    def _data_or_raise(response, failure_message, default):
        """Return the ``data`` field of a response, raising when ``code`` is not 0."""
        body = response.json()
        if body.get("code") != 0:
            raise Exception(body.get("message", failure_message))
        return body.get("data", default)

    def extract_from_document(
        self,
        doc_id: str,
        extractor_type: str = "entity",
        config: Optional[Dict[str, Any]] = None,
    ) -> ExtractionResult:
        """
        Extract information from a document.

        Args:
            doc_id: Document ID.
            extractor_type: Extraction type: 'entity', 'keyword' or 'summary'.
            config: Optional extraction configuration, for example
                - entity: {"entity_types": ["PERSON", "ORG"], "use_regex": True, "use_llm": False}
                - keyword: {"max_keywords": 20, "min_word_length": 3}
                - summary: {"max_length": 200, "min_length": 50}

        Returns:
            The extraction result.

        Raises:
            Exception: The API call failed.
        """
        request_body = {"doc_id": doc_id, "extractor_type": extractor_type}
        if config:
            request_body["config"] = config

        response = self.client.post("/powerrag/extract", json=request_body)
        return self._data_or_raise(response, "Extract from document failed", {})

    def extract_from_text(
        self,
        text: str,
        extractor_type: str = "entity",
        config: Optional[Dict[str, Any]] = None,
    ) -> ExtractionResult:
        """
        Extract information from raw text.

        Args:
            text: Text content.
            extractor_type: Extraction type: 'entity', 'keyword' or 'summary'.
            config: Optional extraction configuration.

        Returns:
            The extraction result.

        Raises:
            Exception: The API call failed.
        """
        request_body = {"text": text, "extractor_type": extractor_type}
        if config:
            request_body["config"] = config

        response = self.client.post("/powerrag/extract/text", json=request_body)
        return self._data_or_raise(response, "Extract from text failed", {})

    def extract_batch(
        self,
        doc_ids: List[str],
        extractor_type: str = "entity",
        config: Optional[Dict[str, Any]] = None,
    ) -> List[Dict[str, Any]]:
        """
        Extract information from several documents in one request.

        Args:
            doc_ids: Document IDs.
            extractor_type: Extraction type.
            config: Optional extraction configuration.

        Returns:
            List of extraction results as returned by the server.

        Raises:
            Exception: The API call failed.
        """
        request_body = {"doc_ids": doc_ids, "extractor_type": extractor_type}
        if config:
            request_body["config"] = config

        response = self.client.post("/powerrag/extract/batch", json=request_body)
        return self._data_or_raise(response, "Batch extract failed", [])

    def struct_extract(
        self,
        text_or_documents: Union[str, List[Dict[str, str]]],
        prompt_description: str,
        examples: List[Dict[str, Any]],
        fetch_urls: bool = False,
        max_char_buffer: int = 1000,
        temperature: Optional[float] = None,
        extraction_passes: int = 1,
        additional_context: Optional[str] = None,
        prompt_validation_level: str = "WARNING",
        prompt_validation_strict: bool = False,
        resolver_params: Optional[Dict[str, Any]] = None,
        model_parameters: Optional[Dict[str, Any]] = None,
        timeout: Optional[int] = None,
    ) -> StructExtractTaskInfo:
        """
        Submit a structured-extraction (LangExtract) task.

        Args:
            text_or_documents: Text content or a list of documents.
            prompt_description: Description of what to extract.
            examples: Example list; each example holds text and extractions.
            fetch_urls: Whether to fetch URLs (default False).
            max_char_buffer: Maximum character buffer (default 1000).
            temperature: Optional temperature; sent whenever it is not None.
            extraction_passes: Number of extraction passes (default 1).
            additional_context: Optional extra context.
            prompt_validation_level: Prompt validation level (default "WARNING").
            prompt_validation_strict: Whether validation is strict (default False).
            resolver_params: Optional resolver parameters.
            model_parameters: Optional model parameters.
            timeout: Optional timeout.

        Returns:
            Task information as returned by the server.

        Raises:
            Exception: The API call failed.
        """
        request_body = {
            "text_or_documents": text_or_documents,
            "prompt_description": prompt_description,
            "examples": examples,
            "fetch_urls": fetch_urls,
            "max_char_buffer": max_char_buffer,
            "extraction_passes": extraction_passes,
            "prompt_validation_level": prompt_validation_level,
            "prompt_validation_strict": prompt_validation_strict,
        }

        # temperature is sent whenever it is set (0.0 included); the other
        # optional fields are sent only when truthy.
        if temperature is not None:
            request_body["temperature"] = temperature
        truthy_only = (
            ("additional_context", additional_context),
            ("resolver_params", resolver_params),
            ("model_parameters", model_parameters),
            ("timeout", timeout),
        )
        for field, value in truthy_only:
            if value:
                request_body[field] = value

        response = self.client.post("/powerrag/struct_extract/submit", json=request_body)
        return self._data_or_raise(response, "Struct extract failed", {})

    def get_struct_extract_status(self, task_id: str) -> Dict[str, Any]:
        """
        Fetch the status of a structured-extraction task.

        Args:
            task_id: Task ID.

        Returns:
            Task status information.

        Raises:
            Exception: The API call failed.
        """
        response = self.client.get(f"/powerrag/struct_extract/status/{task_id}")
        return self._data_or_raise(response, "Get struct extract status failed", {})
class KnowledgeBaseInfo(TypedDict, total=False):
    """Type definition for knowledge base information (all keys optional: total=False)."""
    id: str
    name: str
    avatar: Optional[str]
    tenant_id: Optional[str]
    description: Optional[str]
    embedding_model: str
    permission: str
    document_count: int
    chunk_count: int
    chunk_method: str
    parser_config: Optional[Dict[str, Any]]
    pagerank: int
class KnowledgeBaseManager:
    """Knowledge base management module: wrappers over the /datasets endpoints."""

    def __init__(self, client):
        """
        Initialise the knowledge base manager.

        Args:
            client: PowerRAG client instance used to issue HTTP requests.
        """
        self.client = client

    @staticmethod
    def _body_or_raise(response, failure_message):
        """Return the decoded response body, raising when ``code`` is not 0."""
        body = response.json()
        if body.get("code") != 0:
            raise Exception(body.get("message", failure_message))
        return body

    def create(
        self,
        name: str,
        description: Optional[str] = None,
        avatar: Optional[str] = None,
        embedding_model: Optional[str] = None,
        permission: str = "me",
        chunk_method: str = "naive",
        parser_config: Optional[Dict[str, Any]] = None,
    ) -> KnowledgeBaseInfo:
        """
        Create a knowledge base.

        Args:
            name: Knowledge base name (required).
            description: Optional description.
            avatar: Optional avatar, base64 encoded.
            embedding_model: Optional embedding model name (the tenant
                default is used when omitted).
            permission: 'me' or 'team' (default 'me').
            chunk_method: Chunking method (default 'naive').
            parser_config: Optional parser configuration.

        Returns:
            The created knowledge base.

        Raises:
            Exception: The API call failed.

        Note:
            ``pagerank`` can only be set through ``update``, not at creation.
        """
        request_body = {"name": name}

        # These three are sent whenever they are set, even if empty.
        for field, value in (
            ("description", description),
            ("avatar", avatar),
            ("embedding_model", embedding_model),
        ):
            if value is not None:
                request_body[field] = value

        # These two are sent only when truthy.
        for field, value in (("permission", permission), ("chunk_method", chunk_method)):
            if value:
                request_body[field] = value

        if parser_config is not None:
            request_body["parser_config"] = parser_config

        response = self.client.post("/datasets", json=request_body)
        return self._body_or_raise(response, "Create knowledge base failed").get("data", {})

    def get(self, kb_id: str) -> KnowledgeBaseInfo:
        """
        Fetch a single knowledge base by ID.

        Args:
            kb_id: Knowledge base ID.

        Returns:
            The knowledge base information.

        Raises:
            Exception: The API call failed or the knowledge base does not exist.
        """
        matches, _ = self.list(id=kb_id, page_size=1)
        if matches:
            return matches[0]
        raise Exception(f"Knowledge base '{kb_id}' not found")

    def list(
        self,
        id: Optional[str] = None,
        name: Optional[str] = None,
        page: int = 1,
        page_size: int = 30,
        orderby: str = "create_time",
        desc: bool = True,
    ) -> tuple[List[KnowledgeBaseInfo], int]:
        """
        List knowledge bases.

        Args:
            id: Optional knowledge base ID (exact match).
            name: Optional knowledge base name filter.
            page: Page number (default 1).
            page_size: Page size (default 30).
            orderby: Sort field (default create_time).
            desc: Sort descending (default True).

        Returns:
            Tuple of (knowledge bases, total count).

        Raises:
            Exception: The API call failed.
        """
        query = {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc}
        if id:
            query["id"] = id
        if name:
            query["name"] = name

        response = self.client.get("/datasets", params=query)
        body = self._body_or_raise(response, "List knowledge bases failed")
        # The total is read from "total_datasets", not "total".
        return body.get("data", []), body.get("total_datasets", 0)

    def update(
        self,
        kb_id: str,
        name: Optional[str] = None,
        description: Optional[str] = None,
        avatar: Optional[str] = None,
        embedding_model: Optional[str] = None,
        permission: Optional[str] = None,
        chunk_method: Optional[str] = None,
        parser_config: Optional[Dict[str, Any]] = None,
        pagerank: Optional[int] = None,
    ) -> KnowledgeBaseInfo:
        """
        Update a knowledge base.

        Args:
            kb_id: Knowledge base ID.
            name: Optional new name.
            description: Optional description.
            avatar: Optional avatar.
            embedding_model: Optional embedding model.
            permission: Optional permission.
            chunk_method: Optional chunking method.
            parser_config: Optional parser configuration.
            pagerank: Optional page rank.

        Returns:
            The updated knowledge base.

        Raises:
            Exception: No field was supplied, or the API call failed.
        """
        # SDK argument -> request field. Two fields are renamed on the way out.
        # NOTE(review): create() sends "embedding_model"/"chunk_method" while
        # this method sends "embd_id"/"parser_id" - confirm the PUT endpoint
        # really expects the renamed keys.
        outgoing = (
            ("name", name),
            ("description", description),
            ("avatar", avatar),
            ("embd_id", embedding_model),
            ("permission", permission),
            ("parser_id", chunk_method),
            ("parser_config", parser_config),
            ("pagerank", pagerank),
        )
        changes = {field: value for field, value in outgoing if value is not None}

        if not changes:
            raise Exception("No fields to update")

        response = self.client.put(f"/datasets/{kb_id}", json=changes)
        return self._body_or_raise(response, "Update knowledge base failed").get("data", {})

    def delete(self, ids: Optional[List[str]] = None) -> None:
        """
        Delete knowledge bases.

        Args:
            ids: Knowledge base IDs; ``None`` deletes all knowledge bases.

        Raises:
            Exception: The API call failed.
        """
        response = self.client.delete("/datasets", json={"ids": ids})
        self._body_or_raise(response, "Delete knowledge bases failed")
class KnowledgeGraphNode(TypedDict, total=False):
    """Knowledge graph node."""
    id: str
    label: str
    pagerank: Optional[float]
    properties: Optional[Dict[str, Any]]


class KnowledgeGraphEdge(TypedDict, total=False):
    """Knowledge graph edge."""
    source: str
    target: str
    weight: Optional[float]
    label: Optional[str]
    properties: Optional[Dict[str, Any]]


class KnowledgeGraphData(TypedDict, total=False):
    """Knowledge graph data."""
    graph: Dict[str, Any]  # Contains nodes and edges
    mind_map: Dict[str, Any]


class KnowledgeGraphTaskInfo(TypedDict, total=False):
    """Knowledge graph task information."""
    graphrag_task_id: str
    status: Optional[str]
    progress: Optional[float]
    progress_msg: Optional[str]
    begin_at: Optional[str]
    create_time: Optional[int]
    update_time: Optional[int]
class KnowledgeGraphManager:
    """Knowledge graph management module."""

    def __init__(self, client):
        """
        Initialise the knowledge graph manager.

        Args:
            client: PowerRAG client instance used to issue HTTP requests.
        """
        self.client = client

    @staticmethod
    def _body_or_raise(response, failure_message):
        """Return the decoded response body, raising when ``code`` is not 0."""
        body = response.json()
        if body.get("code") != 0:
            raise Exception(body.get("message", failure_message))
        return body

    def build(self, kb_id: str) -> KnowledgeGraphTaskInfo:
        """
        Start building the knowledge graph (asynchronous).

        Note: the knowledge graph settings are read from the knowledge
        base's `parser_config.graphrag`; set them when creating or updating
        the knowledge base.

        Args:
            kb_id: Knowledge base ID.

        Returns:
            Task information, including graphrag_task_id.

        Raises:
            Exception: The API call failed.
        """
        response = self.client.post(f"/datasets/{kb_id}/run_graphrag")
        return self._body_or_raise(response, "Build knowledge graph failed").get("data", {})

    def get(self, kb_id: str) -> KnowledgeGraphData:
        """
        Fetch the knowledge graph.

        Args:
            kb_id: Knowledge base ID.

        Returns:
            Knowledge graph data with ``graph`` and ``mind_map``; both are
            empty dicts when the response carries no ``data`` field.

        Raises:
            Exception: The API call failed.
        """
        response = self.client.get(f"/datasets/{kb_id}/knowledge_graph")
        body = self._body_or_raise(response, "Get knowledge graph failed")
        return body.get("data", {"graph": {}, "mind_map": {}})

    def get_status(self, kb_id: str) -> Optional[Dict[str, Any]]:
        """
        Fetch the knowledge graph build status.

        Args:
            kb_id: Knowledge base ID.

        Returns:
            Task status information, or None when the server returns no data.

        Raises:
            Exception: The API call failed.
        """
        response = self.client.get(f"/datasets/{kb_id}/trace_graphrag")
        body = self._body_or_raise(response, "Get knowledge graph status failed")
        # Empty or missing data means there is no task to report.
        return body.get("data", {}) or None
class RAPTORTaskInfo(TypedDict, total=False):
    """RAPTOR task information."""
    raptor_task_id: str
    status: Optional[str]
    progress: Optional[float]
    progress_msg: Optional[str]
    begin_at: Optional[str]
    create_time: Optional[int]
    update_time: Optional[int]
class RAPTORManager:
    """RAPTOR management module."""

    def __init__(self, client):
        """
        Initialise the RAPTOR manager.

        Args:
            client: PowerRAG client instance used to issue HTTP requests.
        """
        self.client = client

    def build(self, kb_id: str) -> RAPTORTaskInfo:
        """
        Start building RAPTOR (asynchronous).

        Note: the RAPTOR settings are read from the knowledge base's
        `parser_config.raptor`; set them when creating or updating the
        knowledge base.

        Args:
            kb_id: Knowledge base ID.

        Returns:
            Task information, including raptor_task_id.

        Raises:
            Exception: The API call failed.
        """
        body = self.client.post(f"/datasets/{kb_id}/run_raptor").json()
        if body.get("code") == 0:
            return body.get("data", {})
        raise Exception(body.get("message", "Build RAPTOR failed"))

    def get_status(self, kb_id: str) -> Optional[Dict[str, Any]]:
        """
        Fetch the RAPTOR build status.

        Args:
            kb_id: Knowledge base ID.

        Returns:
            Task status information, or None when the server returns no data.

        Raises:
            Exception: The API call failed.
        """
        body = self.client.get(f"/datasets/{kb_id}/trace_raptor").json()
        if body.get("code") == 0:
            # Empty or missing data means there is no task to report.
            return body.get("data", {}) or None
        raise Exception(body.get("message", "Get RAPTOR status failed"))
class RetrievalChunk(TypedDict, total=False):
    """A chunk returned by retrieval."""
    id: str
    content: str
    document_id: str
    dataset_id: str
    similarity: float
    important_keywords: List[str]
    questions: List[str]
    docnm_kwd: str
    image_id: Optional[str]
    available: bool
    positions: List[List[int]]


class RetrievalResult(TypedDict, total=False):
    """Retrieval result."""
    total: int
    chunks: List[RetrievalChunk]
    doc_aggs: Dict[str, Any]  # Document aggregation information
    labels: Optional[Dict[str, Any]]  # Label information
from typing import TYPE_CHECKING, Any, Dict, List, Optional

if TYPE_CHECKING:
    # Only needed for return annotations.
    from .retrieval import RetrievalResult


class RetrievalManager:
    """Retrieval management module: run searches against knowledge bases."""

    def __init__(self, client):
        """
        Initialise the retrieval management module.

        Args:
            client: PowerRAG client instance; its ``post`` method is used to
                call the retrieval endpoint.
        """
        self.client = client

    def search(
        self,
        kb_ids: List[str],
        question: str,
        document_ids: Optional[List[str]] = None,
        page: int = 1,
        page_size: int = 30,
        similarity_threshold: float = 0.2,
        vector_similarity_weight: float = 0.3,
        top_k: int = 1024,
        keyword: bool = False,
        use_kg: bool = False,
        rerank_id: Optional[str] = None,
        highlight: bool = True,
        cross_languages: Optional[List[str]] = None,
        metadata_condition: Optional[Dict[str, Any]] = None,
    ) -> "RetrievalResult":
        """
        Retrieve chunks (vector / keyword / hybrid).

        Args:
            kb_ids: Knowledge base IDs (sent as ``dataset_ids``).
            question: Query text.
            document_ids: Optional document IDs used as a filter.
            page: Page number, default 1.
            page_size: Page size, default 30.
            similarity_threshold: Similarity threshold, default 0.2.
            vector_similarity_weight: Weight of vector similarity in hybrid
                retrieval, default 0.3.
            top_k: Maximum number of candidates, default 1024.
            keyword: Whether to use keyword enhancement, default False.
            use_kg: Whether to use knowledge-graph retrieval, default False.
            rerank_id: Optional rerank model ID.
            highlight: Whether to highlight matched content, default True.
            cross_languages: Optional list of languages for cross-language
                retrieval.
            metadata_condition: Optional metadata filter condition.

        Returns:
            The ``data`` object of the response. When the server sends no
            data or an explicit ``null``, ``{"total": 0, "chunks": []}``.

        Raises:
            Exception: The API answered with a non-zero ``code``.
        """
        payload: Dict[str, Any] = {
            "dataset_ids": kb_ids,
            "question": question,
            "page": page,
            "page_size": page_size,
            "similarity_threshold": similarity_threshold,
            "vector_similarity_weight": vector_similarity_weight,
            "top_k": top_k,
            "keyword": keyword,
            "use_kg": use_kg,
            "highlight": highlight,
        }

        # Optional filters are only sent when they carry a non-empty value.
        optional_fields = {
            "document_ids": document_ids,
            "rerank_id": rerank_id,
            "cross_languages": cross_languages,
            "metadata_condition": metadata_condition,
        }
        payload.update({key: value for key, value in optional_fields.items() if value})

        res_json = self.client.post("/retrieval", json=payload).json()

        if res_json.get("code") != 0:
            raise Exception(res_json.get("message", "Search failed"))

        # ``dict.get`` defaults only apply to a missing key; also cover an
        # explicit ``"data": null`` so callers always receive a result dict.
        data = res_json.get("data")
        return data if data is not None else {"total": 0, "chunks": []}

    def test(
        self,
        kb_ids: List[str],
        question: str,
        document_ids: Optional[List[str]] = None,
        page: int = 1,
        page_size: int = 30,
        similarity_threshold: float = 0.2,
        vector_similarity_weight: float = 0.3,
        top_k: int = 1024,
        keyword: bool = False,
        use_kg: bool = False,
        rerank_id: Optional[str] = None,
        highlight: bool = True,
        cross_languages: Optional[List[str]] = None,
        metadata_condition: Optional[Dict[str, Any]] = None,
    ) -> "RetrievalResult":
        """
        Retrieval test; delegates to :meth:`search` with the same arguments.

        Args:
            kb_ids: Knowledge base IDs.
            question: Query text.
            document_ids: Optional document IDs used as a filter.
            page: Page number, default 1.
            page_size: Page size, default 30.
            similarity_threshold: Similarity threshold, default 0.2.
            vector_similarity_weight: Vector similarity weight, default 0.3.
            top_k: Maximum number of candidates, default 1024.
            keyword: Whether to use keyword enhancement, default False.
            use_kg: Whether to use knowledge-graph retrieval, default False.
            rerank_id: Optional rerank model ID.
            highlight: Whether to highlight matched content, default True.
            cross_languages: Optional list of languages for cross-language
                retrieval.
            metadata_condition: Optional metadata filter condition.

        Returns:
            Whatever :meth:`search` returns.

        Raises:
            Exception: The API answered with a non-zero ``code``.
        """
        return self.search(
            kb_ids=kb_ids,
            question=question,
            document_ids=document_ids,
            page=page,
            page_size=page_size,
            similarity_threshold=similarity_threshold,
            vector_similarity_weight=vector_similarity_weight,
            top_k=top_k,
            keyword=keyword,
            use_kg=use_kg,
            rerank_id=rerank_id,
            highlight=highlight,
            cross_languages=cross_languages,
            metadata_condition=metadata_condition,
        )
import contextlib
import os
import time
import uuid
from pathlib import Path

import pytest

from powerrag.sdk import PowerRAGClient

# Connection settings are read from the environment.
HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9222")
# Deliberately no fallback value: an API key must never be committed to the
# repository. Export POWERRAG_API_KEY before running the suite.
API_KEY = os.getenv("POWERRAG_API_KEY")


def _unique_name(prefix: str) -> str:
    """Return ``prefix`` plus the PID and a random suffix.

    The PID alone is identical for every test in a run, so a knowledge base
    left behind by a failed cleanup would share its name with the next one.
    """
    return f"{prefix}_{os.getpid()}_{uuid.uuid4().hex[:8]}"


@pytest.fixture(scope="session")
def client():
    """
    Create the PowerRAG client shared by the whole test session.

    Skips the dependent tests when ``POWERRAG_API_KEY`` is not set.

    Returns:
        A ``PowerRAGClient`` bound to ``HOST_ADDRESS``.
    """
    if not API_KEY:
        pytest.skip("POWERRAG_API_KEY is not set; skipping tests that need a PowerRAG server")
    return PowerRAGClient(api_key=API_KEY, base_url=HOST_ADDRESS)


@pytest.fixture(scope="function")
def kb_id(client: PowerRAGClient):
    """
    Create a knowledge base for a test and delete it afterwards.

    Args:
        client: PowerRAG client instance.

    Yields:
        The knowledge base ID.
    """
    kb = client.knowledge_base.create(name=_unique_name("test_kb"))
    yield kb["id"]
    # Best-effort cleanup: a failure here must not mask the test outcome.
    with contextlib.suppress(Exception):
        client.knowledge_base.delete([kb["id"]])


@pytest.fixture(scope="function")
def doc_id(client: PowerRAGClient, kb_id: str, test_file_path: str):
    """
    Upload one test document and delete it afterwards.

    Args:
        client: PowerRAG client instance.
        kb_id: Knowledge base ID.
        test_file_path: Path of the file to upload.

    Yields:
        The document ID.
    """
    docs = client.document.upload(kb_id, test_file_path)
    yield docs[0]["id"]
    # Best-effort cleanup.
    with contextlib.suppress(Exception):
        client.document.delete(kb_id, [docs[0]["id"]])


@pytest.fixture(scope="function")
def chunk_id(client: PowerRAGClient, kb_id: str, doc_id: str):
    """
    Create one test chunk and delete it afterwards.

    Args:
        client: PowerRAG client instance.
        kb_id: Knowledge base ID.
        doc_id: Document ID.

    Yields:
        The chunk ID.
    """
    chunk = client.chunk.create(
        kb_id,
        doc_id,
        content="Test chunk content for testing"
    )
    yield chunk["id"]
    # Best-effort cleanup.
    with contextlib.suppress(Exception):
        client.chunk.delete(kb_id, doc_id, [chunk["id"]])


@pytest.fixture(scope="function")
def test_file_path(tmp_path):
    """
    Write a small HTML test file (a format parse_to_md supports).

    Args:
        tmp_path: pytest temporary directory.

    Returns:
        Path of the test file, as a string.
    """
    test_file = tmp_path / "test_document.html"
    test_file.write_text("""
<!DOCTYPE html>
<html>
<head>
    <title>Test Document</title>
</head>
<body>
    <h1>Test Document</h1>
    <p>This is a test document for PowerRAG SDK testing.</p>

    <h2>Section 1</h2>
    <p>This is the first section with some content.</p>

    <h2>Section 2</h2>
    <p>This is the second section with more content.</p>
</body>
</html>
""")
    return str(test_file)


@pytest.fixture(scope="function")
def test_files(tmp_path):
    """
    Write three small HTML test files.

    Args:
        tmp_path: pytest temporary directory.

    Returns:
        List of test file paths, as strings.
    """
    files = []
    for i in range(3):
        test_file = tmp_path / f"test_document_{i}.html"
        test_file.write_text(f"""
<!DOCTYPE html>
<html>
<head>
    <title>Test Document {i}</title>
</head>
<body>
    <h1>Test Document {i}</h1>
    <p>This is test document {i} for PowerRAG SDK testing.</p>

    <h2>Content</h2>
    <p>Sample content for document {i}.</p>
</body>
</html>
""")
        files.append(str(test_file))
    return files


@pytest.fixture(scope="function")
def doc_ids(client: PowerRAGClient, kb_id: str, test_files: list):
    """
    Upload several test documents and delete them afterwards.

    Args:
        client: PowerRAG client instance.
        kb_id: Knowledge base ID.
        test_files: Paths of the files to upload.

    Yields:
        List of document IDs.
    """
    docs = client.document.upload(kb_id, test_files)
    uploaded_ids = [doc["id"] for doc in docs]
    yield uploaded_ids
    # Best-effort cleanup.
    with contextlib.suppress(Exception):
        client.document.delete(kb_id, uploaded_ids)


@pytest.fixture(scope="function")
def kb_with_docs(client: PowerRAGClient, test_files: list):
    """
    Create a knowledge base holding parsed documents (for RAPTOR and other
    tests that need existing documents).

    Args:
        client: PowerRAG client instance.
        test_files: Paths of the files to upload.

    Yields:
        The knowledge base ID.
    """
    kb = client.knowledge_base.create(name=_unique_name("test_kb_with_docs"))
    kb_id = kb["id"]

    try:
        docs = client.document.upload(kb_id, test_files)
        uploaded_ids = [doc["id"] for doc in docs]

        # wait=True blocks until parsing has finished.
        client.document.parse_to_chunk(kb_id, uploaded_ids, wait=True)

        yield kb_id
    finally:
        # Best-effort cleanup; runs even when upload or parsing failed.
        with contextlib.suppress(Exception):
            client.knowledge_base.delete([kb_id])
import pytest
from powerrag.sdk import PowerRAGClient


class TestChunkList:
    """Tests for listing chunks."""

    def test_list_chunks(self, client: PowerRAGClient, kb_id: str, doc_id: str):
        """List the chunks of a document."""
        chunks, total, doc_info = client.chunk.list(kb_id, doc_id)
        assert isinstance(chunks, list)
        assert total >= 0

    def test_list_with_keywords(self, client: PowerRAGClient, kb_id: str, doc_id: str):
        """Search chunks with a keyword."""
        chunks, total, _ = client.chunk.list(kb_id, doc_id, keywords="test")
        assert isinstance(chunks, list)


class TestChunkGet:
    """Tests for fetching a chunk."""

    def test_get_existing_chunk(self, client: PowerRAGClient, kb_id: str, doc_id: str, chunk_id: str):
        """Fetch a chunk that exists."""
        chunk = client.chunk.get(kb_id, doc_id, chunk_id)
        assert chunk["id"] == chunk_id

    def test_get_nonexistent_chunk(self, client: PowerRAGClient, kb_id: str, doc_id: str):
        """Fetching a chunk that does not exist raises a 'not found' error."""
        with pytest.raises(Exception) as exc_info:
            client.chunk.get(kb_id, doc_id, "nonexistent_id")
        assert "not found" in str(exc_info.value).lower()


class TestChunkCreate:
    """Tests for creating chunks."""

    def test_create_chunk(self, client: PowerRAGClient, kb_id: str, doc_id: str):
        """Create a chunk with content and keywords."""
        chunk = client.chunk.create(
            kb_id,
            doc_id,
            content="Test chunk content",
            important_keywords=["test", "chunk"]
        )
        assert chunk["id"] is not None
        assert chunk["content"] == "Test chunk content"

        # Cleanup
        client.chunk.delete(kb_id, doc_id, [chunk["id"]])

    def test_create_chunk_with_questions(self, client: PowerRAGClient, kb_id: str, doc_id: str):
        """Create a chunk that carries questions."""
        chunk = client.chunk.create(
            kb_id,
            doc_id,
            content="Test content",
            questions=["What is this?", "How does it work?"]
        )
        assert len(chunk.get("questions", [])) == 2

        # Cleanup
        client.chunk.delete(kb_id, doc_id, [chunk["id"]])


class TestChunkUpdate:
    """Tests for updating chunks."""

    def test_update_content(self, client: PowerRAGClient, kb_id: str, doc_id: str, chunk_id: str):
        """Update the content of a chunk."""
        updated_chunk = client.chunk.update(
            kb_id,
            doc_id,
            chunk_id,
            content="Updated content"
        )
        assert updated_chunk["content"] == "Updated content"

    def test_update_keywords(self, client: PowerRAGClient, kb_id: str, doc_id: str, chunk_id: str):
        """Update the keywords of a chunk."""
        updated_chunk = client.chunk.update(
            kb_id,
            doc_id,
            chunk_id,
            important_keywords=["new", "keywords"]
        )
        assert updated_chunk.get("important_keywords") == ["new", "keywords"]


class TestChunkDelete:
    """Tests for deleting chunks."""

    def test_delete_single_chunk(self, client: PowerRAGClient, kb_id: str, doc_id: str):
        """Delete one chunk; fetching it afterwards must raise."""
        chunk = client.chunk.create(kb_id, doc_id, content="To be deleted")
        chunk_id = chunk["id"]

        client.chunk.delete(kb_id, doc_id, [chunk_id])

        with pytest.raises(Exception):
            client.chunk.get(kb_id, doc_id, chunk_id)

    def test_delete_multiple_chunks(self, client: PowerRAGClient, kb_id: str, doc_id: str):
        """Delete several chunks in one call; none may be fetchable afterwards."""
        chunk_ids = []
        for i in range(3):
            chunk = client.chunk.create(kb_id, doc_id, content=f"Chunk {i}")
            chunk_ids.append(chunk["id"])

        client.chunk.delete(kb_id, doc_id, chunk_ids)

        for chunk_id in chunk_ids:
            with pytest.raises(Exception):
                client.chunk.get(kb_id, doc_id, chunk_id)


class TestChunkSplitText:
    """Tests for splitting text into chunks."""

    def test_split_text(self, client: PowerRAGClient):
        """Split Markdown text with the "title" parser."""
        markdown_text = """
# 第一章

这是第一章的内容...

## 1.1 小节

这是小节内容...
"""
        result = client.chunk.split_text(
            text=markdown_text,
            parser_id="title",
            config={"title_level": 2, "chunk_token_num": 256}
        )
        # Accept either result shape: a chunk count or a chunk list.
        assert result.get("total_chunks", 0) > 0 or len(result.get("chunks", [])) > 0

    def test_split_text_with_config(self, client: PowerRAGClient):
        """Split plain text with the "naive" parser and a custom config."""
        text = "This is a test document with multiple paragraphs."
        result = client.chunk.split_text(
            text=text,
            parser_id="naive",
            config={"chunk_token_num": 128}
        )
        assert "chunks" in result or "total_chunks" in result
import pytest
from powerrag.sdk import PowerRAGClient


class TestDocumentUpload:
    """Tests for uploading documents."""

    def test_upload_single_file(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Upload a single file."""
        docs = client.document.upload(kb_id, test_file_path)
        assert len(docs) == 1
        assert docs[0]["id"] is not None
        assert docs[0]["name"] is not None

        # Cleanup
        client.document.delete(kb_id, [docs[0]["id"]])

    def test_upload_multiple_files(self, client: PowerRAGClient, kb_id: str, test_files: list):
        """Upload several files in one call."""
        docs = client.document.upload(kb_id, test_files)
        assert len(docs) == len(test_files)

        # Cleanup
        doc_ids = [doc["id"] for doc in docs]
        client.document.delete(kb_id, doc_ids)

    def test_upload_nonexistent_file(self, client: PowerRAGClient, kb_id: str):
        """Uploading a missing file raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            client.document.upload(kb_id, "nonexistent.pdf")


class TestDocumentList:
    """Tests for listing documents."""

    def test_list_all_documents(self, client: PowerRAGClient, kb_id: str):
        """List all documents of a knowledge base."""
        docs, total = client.document.list(kb_id)
        assert isinstance(docs, list)
        assert total >= 0

    def test_list_with_filter(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """List documents filtered by name."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_name = uploaded_docs[0]["name"]

        try:
            docs, total = client.document.list(kb_id, name=doc_name)
            assert len(docs) >= 1
            assert any(doc["name"] == doc_name for doc in docs)
        finally:
            client.document.delete(kb_id, [uploaded_docs[0]["id"]])


class TestDocumentGet:
    """Tests for fetching a document."""

    def test_get_existing_document(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Fetch a document that exists."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            doc = client.document.get(kb_id, doc_id)
            assert doc["id"] == doc_id
        finally:
            client.document.delete(kb_id, [doc_id])

    def test_get_nonexistent_document(self, client: PowerRAGClient, kb_id: str):
        """Fetching a document that does not exist raises."""
        # An ID in a valid UUID format that does not exist in the system
        nonexistent_id = "a" * 32  # 32-character hexadecimal string
        with pytest.raises(Exception) as exc_info:
            client.document.get(kb_id, nonexistent_id)
        # The message must contain "not found" or "don't own"
        error_msg = str(exc_info.value).lower()
        assert "not found" in error_msg or "don't own" in error_msg


class TestDocumentUpdate:
    """Tests for updating documents."""

    def test_update_name(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Update the name of a document."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            # Note: the file extension cannot be changed, so keep .html
            updated_doc = client.document.update(kb_id, doc_id, name="updated_name.html")
            assert updated_doc["name"] == "updated_name.html"
        finally:
            client.document.delete(kb_id, [doc_id])

    def test_rename(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Rename a document."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            # Note: the file extension cannot be changed, so keep .html
            renamed_doc = client.document.rename(kb_id, doc_id, "renamed.html")
            assert renamed_doc["name"] == "renamed.html"
        finally:
            client.document.delete(kb_id, [doc_id])

    def test_set_meta(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Set metadata fields on a document."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            meta_fields = {"author": "Test Author", "category": "Test"}
            updated_doc = client.document.set_meta(kb_id, doc_id, meta_fields)
            assert updated_doc.get("meta_fields") == meta_fields
        finally:
            client.document.delete(kb_id, [doc_id])


class TestDocumentDelete:
    """Tests for deleting documents."""

    def test_delete_single_document(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Delete one document; fetching it afterwards must raise."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        client.document.delete(kb_id, [doc_id])

        with pytest.raises(Exception):
            client.document.get(kb_id, doc_id)

    def test_delete_multiple_documents(self, client: PowerRAGClient, kb_id: str, test_files: list):
        """Delete several documents in one call; none may be fetchable afterwards."""
        uploaded_docs = client.document.upload(kb_id, test_files)
        doc_ids = [doc["id"] for doc in uploaded_docs]

        client.document.delete(kb_id, doc_ids)

        for doc_id in doc_ids:
            with pytest.raises(Exception):
                client.document.get(kb_id, doc_id)


class TestDocumentDownload:
    """Tests for downloading documents."""

    def test_download_to_bytes(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Download a document as bytes."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            content = client.document.download(kb_id, doc_id)
            assert isinstance(content, bytes)
            assert len(content) > 0
        finally:
            client.document.delete(kb_id, [doc_id])

    def test_download_to_file(self, client: PowerRAGClient, kb_id: str, test_file_path: str, tmp_path):
        """Download a document to a file on disk."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            save_path = tmp_path / "downloaded_file.html"
            result_path = client.document.download(kb_id, doc_id, save_path=str(save_path))

            assert result_path == str(save_path)
            assert save_path.exists()
            assert save_path.stat().st_size > 0
        finally:
            client.document.delete(kb_id, [doc_id])


class TestDocumentParse:
    """Tests for parsing documents into chunks."""

    def test_parse_to_chunk_sync(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Parse into chunks and wait for completion."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            results = client.document.parse_to_chunk(kb_id, [doc_id], wait=True)
            assert len(results) == 1
            assert results[0]["status"] == "DONE"
        finally:
            client.document.delete(kb_id, [doc_id])

    def test_parse_to_chunk_async(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Parse into chunks without waiting."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            task_id = client.document.parse_to_chunk(kb_id, [doc_id], wait=False)
            assert task_id is not None
        finally:
            client.document.delete(kb_id, [doc_id])

    def test_cancel_parse(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Cancel a running parse."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            client.document.parse_to_chunk(kb_id, [doc_id], wait=False)
            client.document.cancel_parse(kb_id, [doc_id])

            doc = client.document.get(kb_id, doc_id)
            assert doc["run"] in ["CANCEL", "UNSTART"]
        finally:
            client.document.delete(kb_id, [doc_id])


class TestDocumentParseToMD:
    """Tests for parsing a document into Markdown (without chunking)."""

    def test_parse_to_md_basic(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Basic parse_to_md behaviour."""
        # Upload the document
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            # Parse into Markdown
            result = client.document.parse_to_md(doc_id)

            # Check the returned result
            assert "doc_id" in result
            assert "doc_name" in result
            assert "markdown" in result
            assert "markdown_length" in result
            assert result["doc_id"] == doc_id
            assert isinstance(result["markdown"], str)
            assert result["markdown_length"] > 0
        finally:
            client.document.delete(kb_id, [doc_id])

    def test_parse_to_md_with_config(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """parse_to_md with configuration parameters."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            # Parse with a config
            config = {
                "layout_recognize": "mineru",
                "enable_ocr": False,
                "enable_formula": False,
                "enable_table": True
            }
            result = client.document.parse_to_md(doc_id, config=config)

            # Check the returned result
            assert result["doc_id"] == doc_id
            assert "markdown" in result
            assert len(result["markdown"]) > 0
        finally:
            client.document.delete(kb_id, [doc_id])

    def test_parse_to_md_nonexistent_doc(self, client: PowerRAGClient):
        """Parsing a document that does not exist raises."""
        nonexistent_id = "nonexistent_doc_id_123"

        with pytest.raises(Exception) as exc_info:
            client.document.parse_to_md(nonexistent_id)

        # Check the error message
        error_msg = str(exc_info.value).lower()
        assert "not found" in error_msg or "failed" in error_msg

    def test_parse_to_md_with_images(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """The parse_to_md result exposes the image-related fields."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            result = client.document.parse_to_md(doc_id)

            # Check the image-related fields
            assert "images" in result
            assert "total_images" in result
            assert isinstance(result["images"], dict)
            assert isinstance(result["total_images"], int)
            assert result["total_images"] >= 0
        finally:
            client.document.delete(kb_id, [doc_id])


class TestDocumentParseToMDAsync:
    """Tests for asynchronously parsing a document into Markdown."""

    def test_parse_to_md_async_basic(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Basic asynchronous parse flow: submit, poll, wait."""
        # Upload the document
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            # Submit the asynchronous task
            task_id = client.document.parse_to_md_async(doc_id)
            assert task_id
            assert len(task_id) > 0

            # Query the task status
            status = client.document.get_parse_to_md_status(task_id)
            assert "task_id" in status
            assert "status" in status
            assert status["status"] in ["pending", "processing", "success", "failed"]

            # Wait for the task to finish
            result = client.document.wait_for_parse_to_md(task_id, timeout=300)
            assert result["status"] == "success"
            assert "result" in result
            assert "markdown" in result["result"]
            assert result["result"]["markdown_length"] > 0

        finally:
            client.document.delete(kb_id, [doc_id])

    def test_parse_to_md_async_with_config(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Asynchronous parse with a config."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            # Submit the asynchronous task with a config
            task_id = client.document.parse_to_md_async(
                doc_id,
                config={
                    "layout_recognize": "mineru",
                    "enable_ocr": False,
                    "enable_table": True
                }
            )

            # Wait for completion
            result = client.document.wait_for_parse_to_md(task_id, timeout=300)
            assert result["status"] == "success"
            assert result["result"]["markdown_length"] > 0

        finally:
            client.document.delete(kb_id, [doc_id])

    def test_parse_to_md_async_nonexistent_doc(self, client: PowerRAGClient):
        """Asynchronously parsing a document that does not exist raises."""
        with pytest.raises(Exception) as exc_info:
            client.document.parse_to_md_async("nonexistent_doc_id")

        assert "not found" in str(exc_info.value).lower() or "failed" in str(exc_info.value).lower()

    def test_get_parse_to_md_status_not_found(self, client: PowerRAGClient):
        """Querying the status of an unknown task reports "not_found"."""
        status = client.document.get_parse_to_md_status("nonexistent_task_id")
        assert status["status"] == "not_found"

    def test_wait_for_parse_to_md_timeout(self, client: PowerRAGClient, kb_id: str, test_file_path: str):
        """Waiting with a very short timeout raises TimeoutError."""
        uploaded_docs = client.document.upload(kb_id, test_file_path)
        doc_id = uploaded_docs[0]["id"]

        try:
            task_id = client.document.parse_to_md_async(doc_id)

            # Use a very short timeout (0.1 s) to trigger the timeout
            with pytest.raises(TimeoutError):
                client.document.wait_for_parse_to_md(task_id, timeout=0.1, interval=0.05)

        finally:
            client.document.delete(kb_id, [doc_id])


class TestDocumentParseToMDUpload:
    """Tests for uploading a file directly and parsing it into Markdown."""

    def test_parse_to_md_upload_json_response(self, client: PowerRAGClient, test_file_path: str):
        """Upload a file and check the JSON response."""
        result = client.document.parse_to_md_upload(test_file_path)

        # Check the returned result
        assert "filename" in result
        assert "markdown" in result
        assert "markdown_length" in result
        assert "images" in result
        assert "total_images" in result
        assert isinstance(result["markdown"], str)
        assert result["markdown_length"] > 0

    def test_parse_to_md_upload_with_config(self, client: PowerRAGClient, test_file_path: str):
        """Upload and parse with configuration parameters."""
        config = {
            "layout_recognize": "mineru",
            "enable_ocr": False
        }
        result = client.document.parse_to_md_upload(test_file_path, config=config)

        assert "markdown" in result
        assert len(result["markdown"]) > 0

    def test_parse_to_md_upload_nonexistent_file(self, client: PowerRAGClient):
        """Uploading a missing file raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            client.document.parse_to_md_upload("nonexistent_file.pdf")

    def test_parse_to_md_upload_different_formats(self, client: PowerRAGClient, test_file_path: str):
        """Upload files of different formats."""
        # Note: this test needs real files in different formats.
        # Only one file is tested here; more formats can be added in practice.
        # NOTE(review): the original comment said "txt", but the shared
        # test_file_path fixture appears to write an .html file - confirm.
        result = client.document.parse_to_md_upload(test_file_path)

        assert "markdown" in result
        assert result["markdown_length"] > 0
import pytest
from powerrag.sdk import PowerRAGClient


class TestExtractFromDocument:
    """Tests for extraction from a document."""

    def test_extract_entities(self, client: PowerRAGClient, doc_id: str):
        """Extract entities."""
        result = client.extraction.extract_from_document(
            doc_id,
            extractor_type="entity",
            config={"entity_types": ["PERSON", "ORG"]}
        )
        assert result["extractor_type"] == "entity"
        assert "data" in result

    def test_extract_keywords(self, client: PowerRAGClient, doc_id: str):
        """Extract keywords."""
        result = client.extraction.extract_from_document(
            doc_id,
            extractor_type="keyword",
            config={"max_keywords": 20}
        )
        assert result["extractor_type"] == "keyword"
        assert "data" in result

    def test_extract_summary(self, client: PowerRAGClient, doc_id: str):
        """Extract a summary."""
        result = client.extraction.extract_from_document(
            doc_id,
            extractor_type="summary",
            config={"max_length": 200}
        )
        assert result["extractor_type"] == "summary"
        assert "data" in result


class TestExtractFromText:
    """Tests for extraction from raw text."""

    def test_extract_entities_from_text(self, client: PowerRAGClient):
        """Extract entities from text."""
        text = "John works at Microsoft in Seattle."
        result = client.extraction.extract_from_text(
            text,
            extractor_type="entity",
            config={"entity_types": ["PERSON", "ORG", "LOCATION"]}
        )
        assert result["extractor_type"] == "entity"
        assert "data" in result

    def test_extract_keywords_from_text(self, client: PowerRAGClient):
        """Extract keywords from text."""
        text = "This is a test document about artificial intelligence and machine learning."
        result = client.extraction.extract_from_text(
            text,
            extractor_type="keyword",
            config={"max_keywords": 10}
        )
        assert result["extractor_type"] == "keyword"
        assert "data" in result


class TestExtractBatch:
    """Tests for batch extraction."""

    def test_extract_batch(self, client: PowerRAGClient, doc_ids: list):
        """Batch extraction returns one result per document."""
        results = client.extraction.extract_batch(
            doc_ids,
            extractor_type="entity"
        )
        assert len(results) == len(doc_ids)
        assert all("success" in r for r in results)


class TestStructExtract:
    """Tests for structured extraction."""

    def test_struct_extract(self, client: PowerRAGClient):
        """Submit a structured extraction and check a task ID is returned."""
        text = "John attended a conference in New York on January 1, 2024."
        examples = [
            {
                "text": "John attended a conference in New York on January 1, 2024.",
                "extractions": [
                    {"extraction_class": "name", "extraction_text": "John"},
                    {"extraction_class": "location", "extraction_text": "New York"},
                    {"extraction_class": "date", "extraction_text": "January 1, 2024"}
                ]
            }
        ]

        task_info = client.extraction.struct_extract(
            text_or_documents=text,
            prompt_description="Extract names, locations, and dates from the text.",
            examples=examples
        )
        assert "task_id" in task_info

    def test_get_struct_extract_status(self, client: PowerRAGClient):
        """Query the status of a structured extraction task."""
        text = "Test text for extraction."
        examples = [
            {
                "text": "Test text for extraction.",
                "extractions": []
            }
        ]

        task_info = client.extraction.struct_extract(
            text_or_documents=text,
            prompt_description="Extract information.",
            examples=examples
        )
        task_id = task_info["task_id"]

        status = client.extraction.get_struct_extract_status(task_id)
        assert "status" in status or "task_id" in status
import pytest
from powerrag.sdk import PowerRAGClient


class TestKnowledgeBaseCreate:
    """Tests for creating knowledge bases."""

    def test_create_with_name_only(self, client: PowerRAGClient):
        """Create a knowledge base with only a name and check the defaults."""
        kb = client.knowledge_base.create(name="test_kb")
        assert kb["id"] is not None
        assert kb["name"] == "test_kb"
        assert kb["chunk_method"] == "naive"
        assert kb["permission"] == "me"

        # Cleanup
        client.knowledge_base.delete([kb["id"]])

    def test_create_with_all_fields(self, client: PowerRAGClient):
        """Create a knowledge base with all fields set."""
        # Note: the pagerank field can only be set on update, not on create
        kb = client.knowledge_base.create(
            name="test_kb_full",
            description="Test description",
            embedding_model="BAAI/bge-small-en-v1.5@Builtin",
            permission="team",
            chunk_method="book",
            parser_config={"chunk_token_num": 256}
        )
        assert kb["name"] == "test_kb_full"
        assert kb["description"] == "Test description"
        assert kb["chunk_method"] == "book"
        assert kb["permission"] == "team"

        # Cleanup
        client.knowledge_base.delete([kb["id"]])

    def test_create_duplicate_name(self, client: PowerRAGClient):
        """Create two knowledge bases with the same name."""
        name = "duplicate_test"
        kb1 = client.knowledge_base.create(name=name)

        try:
            # Some systems allow duplicate names, so only check that creation succeeds
            kb2 = client.knowledge_base.create(name=name)
            # Creation succeeded: clean up both knowledge bases
            client.knowledge_base.delete([kb1["id"], kb2["id"]])
        except Exception as e:
            # An exception means duplicate names are not allowed
            assert "already exists" in str(e).lower() or "duplicate" in str(e).lower()
            # Clean up the first knowledge base
            client.knowledge_base.delete([kb1["id"]])


class TestKnowledgeBaseGet:
    """Tests for fetching a knowledge base."""

    def test_get_existing_kb(self, client: PowerRAGClient):
        """Fetch a knowledge base that exists."""
        kb = client.knowledge_base.create(name="get_test")
        try:
            fetched_kb = client.knowledge_base.get(kb["id"])
            assert fetched_kb["id"] == kb["id"]
            assert fetched_kb["name"] == kb["name"]
        finally:
            client.knowledge_base.delete([kb["id"]])

    def test_get_nonexistent_kb(self, client: PowerRAGClient):
        """Fetching a knowledge base that does not exist raises."""
        # An ID in a valid UUID format that does not exist in the system
        # UUID v1 format: 32-character hexadecimal string
        nonexistent_id = "a" * 32  # 32-character hexadecimal string
        with pytest.raises(Exception) as exc_info:
            client.knowledge_base.get(nonexistent_id)
        # The message must contain "not found" or "invalid uuid"
        error_msg = str(exc_info.value).lower()
        assert "not found" in error_msg or "invalid uuid" in error_msg


class TestKnowledgeBaseList:
    """Tests for listing knowledge bases."""

    def test_list_all(self, client: PowerRAGClient):
        """List all knowledge bases."""
        kb_ids = []
        try:
            for i in range(3):
                kb = client.knowledge_base.create(name=f"list_test_{i}")
                kb_ids.append(kb["id"])

            kbs, total = client.knowledge_base.list()
            assert len(kbs) > 0
            assert total >= 3
        finally:
            if kb_ids:
                client.knowledge_base.delete(kb_ids)

    def test_list_with_filter(self, client: PowerRAGClient):
        """List knowledge bases filtered by name."""
        name = "filter_test"
        kb = client.knowledge_base.create(name=name)
        try:
            kbs, total = client.knowledge_base.list(name=name)
            assert len(kbs) >= 1
            assert any(kb_item["name"] == name for kb_item in kbs)
        finally:
            client.knowledge_base.delete([kb["id"]])

    def test_list_with_pagination(self, client: PowerRAGClient):
        """List knowledge bases with pagination."""
        kb_ids = []
        try:
            for i in range(5):
                kb = client.knowledge_base.create(name=f"page_test_{i}")
                kb_ids.append(kb["id"])

            kbs_page1, total = client.knowledge_base.list(page=1, page_size=2)
            assert len(kbs_page1) <= 2
            assert total >= 5
        finally:
            if kb_ids:
                client.knowledge_base.delete(kb_ids)


class TestKnowledgeBaseUpdate:
    """Tests for updating knowledge bases."""

    def test_update_name(self, client: PowerRAGClient):
        """Update the name of a knowledge base."""
        kb = client.knowledge_base.create(name="update_test")
        try:
            updated_kb = client.knowledge_base.update(kb["id"], name="updated_name")
            assert updated_kb["name"] == "updated_name"
        finally:
            client.knowledge_base.delete([kb["id"]])

    def test_update_multiple_fields(self, client: PowerRAGClient):
        """Update several fields in one call."""
        kb = client.knowledge_base.create(name="multi_update_test")
        try:
            updated_kb = client.knowledge_base.update(
                kb["id"],
                name="multi_updated",
                description="Updated description",
                permission="team"
            )
            assert updated_kb["name"] == "multi_updated"
            assert updated_kb["description"] == "Updated description"
            assert updated_kb["permission"] == "team"
        finally:
            client.knowledge_base.delete([kb["id"]])


class TestKnowledgeBaseDelete:
    """Tests for deleting knowledge bases."""

    def test_delete_single_kb(self, client: PowerRAGClient):
        """Delete one knowledge base; fetching it afterwards must raise."""
        kb = client.knowledge_base.create(name="delete_test")
        client.knowledge_base.delete([kb["id"]])

        with pytest.raises(Exception):
            client.knowledge_base.get(kb["id"])

    def test_delete_multiple_kbs(self, client: PowerRAGClient):
        """Delete several knowledge bases in one call; none may be fetchable afterwards."""
        kb_ids = []
        for i in range(3):
            kb = client.knowledge_base.create(name=f"batch_delete_{i}")
            kb_ids.append(kb["id"])

        client.knowledge_base.delete(kb_ids)

        for kb_id in kb_ids:
            with pytest.raises(Exception):
                client.knowledge_base.get(kb_id)
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pytest
+import time
+from powerrag.sdk import PowerRAGClient
+
+
+class TestKnowledgeGraphBuild:
+    """Tests for starting a knowledge graph build."""
+
+    def test_build_knowledge_graph(self, client: PowerRAGClient, kb_with_docs: str):
+        """Starting a build returns a payload containing the task id."""
+        task_info = client.knowledge_graph.build(kb_with_docs)
+        assert "graphrag_task_id" in task_info
+
+    def test_build_knowledge_graph_already_running(self, client: PowerRAGClient, kb_with_docs: str):
+        """A second build request is either accepted or rejected as already running."""
+        # Start a first build.
+        client.knowledge_graph.build(kb_with_docs)
+
+        # A second build may fail or may return a task.
+        try:
+            task_info = client.knowledge_graph.build(kb_with_docs)
+        except Exception as e:
+            # Rejected: the error must report that a task is running.
+            assert "running" in str(e).lower()
+        else:
+            assert "graphrag_task_id" in task_info
+
+
+class TestKnowledgeGraphGet:
+    """Tests for reading the knowledge graph."""
+
+    def test_get_knowledge_graph(self, client: PowerRAGClient, kb_with_docs: str):
+        """The response exposes both the graph and the mind map."""
+        kg_data = client.knowledge_graph.get(kb_with_docs)
+        assert "graph" in kg_data
+        assert "mind_map" in kg_data
+
+    def test_get_knowledge_graph_empty(self, client: PowerRAGClient, kb_with_docs: str):
+        """After deletion, graph and mind map are empty dicts."""
+        # Make sure no knowledge graph data exists.
+        try:
+            client.knowledge_graph.delete(kb_with_docs)
+        except Exception:
+            pass  # best-effort reset; the assertions below decide the outcome
+
+        kg_data = client.knowledge_graph.get(kb_with_docs)
+        assert kg_data["graph"] == {}
+        assert kg_data["mind_map"] == {}
+
+
+class TestKnowledgeGraphStatus:
+    """Tests for querying the knowledge graph build status."""
+
+    def test_get_status(self, client: PowerRAGClient, kb_with_docs: str):
+        """After a build is started, the status exists and reports progress."""
+        # Start a build first; only the side effect matters here.
+        client.knowledge_graph.build(kb_with_docs)
+
+        # Query the status.
+        status = client.knowledge_graph.get_status(kb_with_docs)
+        assert status is not None
+        assert "progress" in status
+
+    def test_get_status_not_exists(self, client: PowerRAGClient, kb_with_docs: str):
+        """Without a build task, the status is None."""
+        # Make sure no task is left over: delete only when one exists.
+        try:
+            if client.knowledge_graph.get_status(kb_with_docs) is not None:
+                client.knowledge_graph.delete(kb_with_docs)
+        except Exception:
+            pass  # best-effort reset; the assertion below decides the outcome
+
+        status = client.knowledge_graph.get_status(kb_with_docs)
+        assert status is None
\ No newline at end of file
diff --git a/powerrag/sdk/tests/test_raptor.py b/powerrag/sdk/tests/test_raptor.py
new file mode 100644
index 000000000..a2e7e889f
--- /dev/null
+++ b/powerrag/sdk/tests/test_raptor.py
@@ -0,0 +1,69 @@
+#
+# Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pytest
+import time
+from powerrag.sdk import PowerRAGClient
+
+
+class TestRAPTORBuild:
+    """Tests for starting a RAPTOR build."""
+
+    def test_build_raptor(self, client: PowerRAGClient, kb_with_docs: str):
+        """Starting a build returns a payload containing the task id."""
+        task_info = client.raptor.build(kb_with_docs)
+        assert "raptor_task_id" in task_info
+
+    def test_build_raptor_already_running(self, client: PowerRAGClient, kb_with_docs: str):
+        """A second build request is either accepted or rejected as already running."""
+        # Start a first build.
+        client.raptor.build(kb_with_docs)
+
+        # A second build may fail or may return a task.
+        try:
+            task_info = client.raptor.build(kb_with_docs)
+        except Exception as e:
+            # Rejected: the error must report that a task is running.
+            assert "running" in str(e).lower()
+        else:
+            assert "raptor_task_id" in task_info
+
+
+class TestRAPTORStatus:
+    """Tests for querying the RAPTOR build status."""
+
+    def test_get_status(self, client: PowerRAGClient, kb_with_docs: str):
+        """After a build is started, the status exists and reports progress."""
+        # Start a build first; only the side effect matters here.
+        client.raptor.build(kb_with_docs)
+
+        # Query the status.
+        status = client.raptor.get_status(kb_with_docs)
+        assert status is not None
+        assert "progress" in status
+
+    def test_get_status_not_exists(self, client: PowerRAGClient, kb_with_docs: str):
+        """Without a build task, the status is None."""
+        # Make sure no task is left over: delete only when one exists.
+        try:
+            if client.raptor.get_status(kb_with_docs) is not None:
+                client.raptor.delete(kb_with_docs)
+        except Exception:
+            pass  # best-effort reset; the assertion below decides the outcome
+
+        status = client.raptor.get_status(kb_with_docs)
+        assert status is None
+
diff --git a/powerrag/sdk/tests/test_retrieval.py b/powerrag/sdk/tests/test_retrieval.py
new file mode 100644
index 000000000..062840d6d
--- /dev/null
+++ b/powerrag/sdk/tests/test_retrieval.py
@@ -0,0 +1,105 @@
+#
+# Copyright 2025 The OceanBase Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest +from powerrag.sdk import PowerRAGClient + + +class TestRetrievalSearch: + """测试检索""" + + def test_basic_search(self, client: PowerRAGClient, kb_id: str): + """测试基本检索""" + result = client.retrieval.search( + kb_ids=[kb_id], + question="测试问题" + ) + assert "chunks" in result + assert "total" in result + assert isinstance(result["chunks"], list) + + def test_search_with_pagination(self, client: PowerRAGClient, kb_id: str): + """测试分页检索""" + result = client.retrieval.search( + kb_ids=[kb_id], + question="测试问题", + page=1, + page_size=10 + ) + assert len(result["chunks"]) <= 10 + + def test_search_with_similarity_threshold(self, client: PowerRAGClient, kb_id: str): + """测试相似度阈值""" + result = client.retrieval.search( + kb_ids=[kb_id], + question="测试问题", + similarity_threshold=0.5 + ) + # 验证所有结果的相似度都大于等于阈值(如果有结果) + for chunk in result["chunks"]: + assert chunk.get("similarity", 0) >= 0.5 + + def test_search_with_document_filter(self, client: PowerRAGClient, kb_id: str, doc_id: str): + """测试文档过滤""" + result = client.retrieval.search( + kb_ids=[kb_id], + question="测试问题", + document_ids=[doc_id] + ) + # 验证所有结果都来自指定的文档(如果有结果) + for chunk in result["chunks"]: + assert chunk["document_id"] == doc_id + + def test_search_with_keyword(self, client: PowerRAGClient, kb_id: str): + """测试关键词增强""" + result = client.retrieval.search( + kb_ids=[kb_id], + question="测试问题", + keyword=True + ) + assert "chunks" in result + + def test_search_with_kg(self, client: PowerRAGClient, kb_id: str): + """测试知识图谱检索""" + result = client.retrieval.search( + kb_ids=[kb_id], + 
question="测试问题", + use_kg=True + ) + assert "chunks" in result + + def test_search_with_highlight(self, client: PowerRAGClient, kb_id: str): + """测试高亮""" + result = client.retrieval.search( + kb_ids=[kb_id], + question="测试问题", + highlight=True + ) + assert "chunks" in result + + +class TestRetrievalTest: + """测试检索测试方法""" + + def test_retrieval_test(self, client: PowerRAGClient, kb_id: str): + """测试检索测试方法""" + result = client.retrieval.test( + kb_ids=[kb_id], + question="测试问题" + ) + assert "chunks" in result + assert "total" in result + diff --git a/powerrag/server/app.py b/powerrag/server/app.py index 6963e4245..f6abc7011 100644 --- a/powerrag/server/app.py +++ b/powerrag/server/app.py @@ -14,44 +14,39 @@ # limitations under the License. # -"""PowerRAG Flask Application Configuration""" +"""PowerRAG Quart Application Configuration""" import logging import json -from flask import Flask -from flask.json.provider import DefaultJSONProvider -from flask_cors import CORS +from quart import Quart +from quart_cors import cors from api.utils.json_encode import CustomJSONEncoder logger = logging.getLogger(__name__) -class CustomJSONProvider(DefaultJSONProvider): - """Custom JSON provider that supports Chinese characters without Unicode escaping""" - - def dumps(self, obj, **kwargs): - """Override dumps to ensure Chinese characters are not escaped""" - kwargs.setdefault('ensure_ascii', False) - kwargs.setdefault('cls', CustomJSONEncoder) - return json.dumps(obj, **kwargs) - - def create_app(): - """Create and configure the PowerRAG Flask application""" + """Create and configure the PowerRAG Quart application""" - app = Flask(__name__) + app = Quart(__name__) # CORS configuration - allow requests from RAGFlow frontend - CORS(app, supports_credentials=True, max_age=2592000) - - # JSON encoder configuration - # Use custom JSON provider to ensure Chinese characters are displayed properly - app.json = CustomJSONProvider(app) + # Note: Cannot use allow_credentials=True with 
wildcard allow_origin="*" + # Since PowerRAG has its own API key authentication, we don't need credentials + app = cors(app, allow_origin="*", allow_credentials=False, allow_methods=["*"], allow_headers=["*"]) # Request configuration app.url_map.strict_slashes = False app.config["MAX_CONTENT_LENGTH"] = 1024 * 1024 * 1024 # 1GB max upload + # Custom JSON encoder for Chinese characters + @app.before_serving + async def setup_json_encoder(): + """Setup custom JSON encoder""" + import functools + import json + json.dumps = functools.partial(json.dumps, cls=CustomJSONEncoder, ensure_ascii=False) + # Register blueprints from powerrag.server.routes.powerrag_routes import powerrag_bp from powerrag.server.routes.task_routes import task_bp @@ -61,10 +56,10 @@ def create_app(): # Health check endpoint @app.route("/health", methods=["GET"]) - def health_check(): + async def health_check(): return {"status": "ok", "service": "powerrag"}, 200 - logger.info("PowerRAG Flask application created successfully") + logger.info("PowerRAG Quart application created successfully") return app diff --git a/powerrag/server/powerrag_server.py b/powerrag/server/powerrag_server.py index 01d96d8aa..b235c0340 100644 --- a/powerrag/server/powerrag_server.py +++ b/powerrag/server/powerrag_server.py @@ -32,8 +32,6 @@ project_root = Path(__file__).parent.parent.parent sys.path.insert(0, str(project_root)) -from werkzeug.serving import run_simple - # Initialize logging from common.log_utils import init_root_logger init_root_logger("powerrag_server") @@ -115,7 +113,7 @@ def main(): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) - # Create Flask app + # Create Quart app app = create_app() # Start server @@ -128,13 +126,12 @@ def main(): logger.info(f" - POST http://{args.host}:{args.port}/api/v1/powerrag/extract") logger.info(f" - GET http://{args.host}:{args.port}/health") - run_simple( - hostname=args.host, + # Run Quart app + app.run( + host=args.host, 
port=args.port, - application=app, - threaded=True, - use_reloader=args.reload, # Only reload if explicitly requested - use_debugger=args.debug, # Debugger enabled with --debug + debug=args.debug, + use_reloader=args.reload, ) except Exception as e: logger.error(f"Failed to start PowerRAG server: {e}", exc_info=True) diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py index 20d057a0c..4cc881810 100644 --- a/powerrag/server/routes/powerrag_routes.py +++ b/powerrag/server/routes/powerrag_routes.py @@ -18,7 +18,7 @@ import os import logging -from flask import Blueprint, request, jsonify, Response +from quart import Blueprint, request, jsonify, Response from powerrag.server.services.parse_service import PowerRAGParseService from powerrag.server.services.convert_service import PowerRAGConvertService from powerrag.server.services.split_service import PowerRAGSplitService @@ -50,7 +50,7 @@ # ============================================================================ @powerrag_bp.route("/run", methods=["POST"]) -def run_parse(): +async def run_parse(): """ Run PowerRAG parsing tasks using task_executor (async) - 推荐使用 @@ -83,7 +83,7 @@ def run_parse(): """ logger.info(f"=== PowerRAG /run endpoint called from {request.remote_addr} ===") try: - data = request.get_json() + data = await request.get_json() if not data: return jsonify({ @@ -219,7 +219,7 @@ def run_parse(): @powerrag_bp.route("/parse", methods=["POST"]) @apikey_required -def parse_document(tenant_id): +async def parse_document(tenant_id): """ Quick parse for preview (synchronous) - 仅用于快速预览 @@ -247,7 +247,7 @@ def parse_document(tenant_id): } """ try: - data = request.get_json() + data = await request.get_json() if not data: return jsonify({ @@ -300,7 +300,7 @@ def parse_document(tenant_id): @powerrag_bp.route("/parse/batch", methods=["POST"]) @apikey_required -def parse_documents_batch(tenant_id): +async def parse_documents_batch(tenant_id): """ Batch parse multiple 
documents (using ThreadPoolExecutor like FileService.parse_docs) @@ -312,7 +312,7 @@ def parse_documents_batch(tenant_id): } """ try: - data = request.get_json() + data = await request.get_json() if not data: return jsonify({ @@ -450,7 +450,7 @@ def parse_upload_file(): @powerrag_bp.route("/convert", methods=["POST"]) @apikey_required -def convert_document(tenant_id): +async def convert_document(tenant_id): """ Convert document format using PowerRAG converters @@ -478,7 +478,7 @@ def convert_document(tenant_id): - to_page (int): End page number (default: 100000) """ try: - data = request.get_json() + data = await request.get_json() if not data: return jsonify({ @@ -517,7 +517,7 @@ def convert_document(tenant_id): @powerrag_bp.route("/convert/upload", methods=["POST"]) @apikey_required -def convert_upload_file(tenant_id): +async def convert_upload_file(tenant_id): """ Convert uploaded file directly (with file download support) @@ -635,13 +635,483 @@ def convert_upload_file(tenant_id): }), 500 +# ============================================================================ +# 文档解析为 Markdown 接口(不切分) +# ============================================================================ + +@powerrag_bp.route("/parse_to_md", methods=["POST"]) +@apikey_required +async def parse_to_md(tenant_id): + """ + Parse document to Markdown WITHOUT chunking + + 将文档解析为 Markdown 格式,但不进行切分。 + 适用于需要完整文档内容或外部系统自行处理切分的场景。 + + 支持的文件格式: + - PDF (.pdf) + - Office 文档 (.doc, .docx, .ppt, .pptx) + - 图片 (.jpg, .png) + - HTML (.html, .htm) + + Authentication: Requires RAGFlow API key in Authorization header (Bearer token) + + Request JSON: + { + "doc_id": "document_id", // RAGFlow 文档 ID + "config": { + "layout_recognize": "mineru", // 布局识别引擎: mineru 或 dots_ocr + "enable_ocr": false, // 是否启用 OCR + "enable_formula": false, // 是否识别公式 + "enable_table": true, // 是否识别表格 + "from_page": 0, // 起始页(仅 PDF) + "to_page": 100000 // 结束页(仅 PDF) + } + } + + Response: + { + "code": 0, + "data": { + "doc_id": 
"document_id", + "doc_name": "document.pdf", + "markdown": "# Title\n\nContent...", // 完整的 Markdown 内容 + "markdown_length": 5000, + "images": { // 文档中的图片(base64) + "image_001.png": "base64_data...", + "image_002.png": "base64_data..." + }, + "total_images": 2 + }, + "message": "success" + } + """ + try: + data = await request.get_json() + + if not data: + return jsonify({ + "code": 400, + "message": "No JSON data provided" + }), 400 + + doc_id = data.get("doc_id") + config = data.get("config", {}) + + if not doc_id: + return jsonify({ + "code": 400, + "message": "doc_id is required" + }), 400 + + # Get document from database + exist, doc = DocumentService.get_by_id(doc_id) + if not exist: + return jsonify({ + "code": 404, + "message": f"Document {doc_id} not found" + }), 404 + + # Get document binary data from storage + bucket, name = File2DocumentService.get_storage_address(doc_id=doc_id) + binary = settings.STORAGE_IMPL.get(bucket, name) + + if not binary: + return jsonify({ + "code": 404, + "message": f"Document binary data not found for {doc_id}" + }), 404 + + # Create service + gotenberg_url = config.get("gotenberg_url", GOTENBERG_URL) + service = PowerRAGParseService(gotenberg_url=gotenberg_url) + + # Parse document to markdown (no chunking) + from pathlib import Path + file_ext = Path(doc.name).suffix.lstrip('.').lower() + + # Determine format type + # Supported: PDF, Office (doc/docx/ppt/pptx), HTML, Images (jpg/png) + format_type_map = { + 'pdf': 'pdf', + 'docx': 'office', 'doc': 'office', + 'xlsx': 'office', 'xls': 'office', + 'pptx': 'office', 'ppt': 'office', + 'html': 'html', 'htm': 'html', + 'jpg': 'image', 'jpeg': 'image', + 'png': 'image' + } + + format_type = format_type_map.get(file_ext) + if not format_type: + return jsonify({ + "code": 400, + "message": f"Unsupported file format: {file_ext}. 
Supported formats: pdf, doc, docx, ppt, pptx, jpg, png, html" + }), 400 + + # Use _parse_to_markdown method (returns tuple of markdown_content and images) + md_content, images = service._parse_to_markdown( + filename=doc.name, + binary=binary, + format_type=format_type, + config=config + ) + + return jsonify({ + "code": 0, + "data": { + "doc_id": doc_id, + "doc_name": doc.name, + "markdown": md_content, + "markdown_length": len(md_content), + "images": images, + "total_images": len(images) + }, + "message": "success" + }), 200 + + except Exception as e: + logger.error(f"Parse to markdown error: {e}", exc_info=True) + return jsonify({ + "code": 500, + "message": str(e) + }), 500 + + +@powerrag_bp.route("/parse_to_md/async", methods=["POST"]) +@apikey_required +async def parse_to_md_async(tenant_id): + """ + Parse document to Markdown asynchronously (submit task) + + 异步解析文档为 Markdown,返回任务 ID。 + 适用于大文档或需要长时间处理的场景。 + + Authentication: Requires RAGFlow API key in Authorization header (Bearer token) + + Request JSON: + { + "doc_id": "document_id", // RAGFlow 文档 ID + "config": { + "layout_recognize": "mineru", + "enable_ocr": false, + "enable_formula": false, + "enable_table": true + } + } + + Response: + { + "code": 0, + "data": { + "task_id": "uuid-string" + }, + "message": "Task submitted successfully" + } + """ + try: + data = await request.get_json() + + if not data: + return jsonify({ + "code": 400, + "message": "No JSON data provided" + }), 400 + + doc_id = data.get("doc_id") + config = data.get("config", {}) + + if not doc_id: + return jsonify({ + "code": 400, + "message": "doc_id is required" + }), 400 + + # Verify document exists and get binary data in the main thread + # (cannot access storage from thread pool workers) + exist, doc = DocumentService.get_by_id(doc_id) + if not exist: + return jsonify({ + "code": 404, + "message": f"Document {doc_id} not found" + }), 404 + + # Get document binary data NOW (in main thread with app context) + bucket, name = 
File2DocumentService.get_storage_address(doc_id=doc_id) + if not bucket or not name: + return jsonify({ + "code": 404, + "message": f"Document storage address not found for {doc_id}" + }), 404 + + binary = settings.STORAGE_IMPL.get(bucket, name) + if not binary: + return jsonify({ + "code": 404, + "message": f"Document binary data not found for {doc_id}" + }), 404 + + # Determine format type + from pathlib import Path + file_ext = Path(doc.name).suffix.lstrip('.').lower() + format_type_map = { + 'pdf': 'pdf', 'docx': 'office', 'doc': 'office', + 'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office', + 'html': 'html', 'htm': 'html', + 'jpg': 'image', 'jpeg': 'image', 'png': 'image' + } + format_type = format_type_map.get(file_ext, 'pdf') + + # Get task manager and service + from powerrag.server.services.parse_to_md_task_manager import get_task_manager + task_manager = get_task_manager() + + gotenberg_url = config.get("gotenberg_url", GOTENBERG_URL) + service = PowerRAGParseService(gotenberg_url=gotenberg_url) + + # Submit task with binary data (not doc_id) + task_id = task_manager.submit_task( + service=service, + method_name="parse_to_md", + filename=doc.name, + binary=binary, + format_type=format_type, + config=config + ) + + return jsonify({ + "code": 0, + "data": { + "task_id": task_id + }, + "message": "Task submitted successfully" + }), 200 + + except Exception as e: + logger.error(f"Parse to markdown async error: {e}", exc_info=True) + return jsonify({ + "code": 500, + "message": str(e) + }), 500 + + +@powerrag_bp.route("/parse_to_md/status/", methods=["GET"]) +@apikey_required +def get_parse_to_md_status(task_id): + """ + Get parse_to_md task status and result + + 查询异步解析任务的状态和结果。 + + Authentication: Requires RAGFlow API key in Authorization header (Bearer token) + + Response: + { + "code": 0, + "data": { + "task_id": "uuid-string", + "status": "pending|processing|success|failed|not_found", + "created_at": "2025-01-01T00:00:00", + "updated_at": 
"2025-01-01T00:00:00", + "result": { + "doc_id": "...", + "doc_name": "...", + "markdown": "...", + "markdown_length": 5000, + "images": {...}, + "total_images": 2 + }, + "error": "Error message if failed" + }, + "message": "success" + } + """ + try: + from powerrag.server.services.parse_to_md_task_manager import get_task_manager + task_manager = get_task_manager() + + status = task_manager.get_task_status(task_id) + + if status.get("status") == "not_found": + return jsonify({ + "code": 404, + "message": "Task not found", + "data": status + }), 404 + + return jsonify({ + "code": 0, + "data": status, + "message": "success" + }), 200 + + except Exception as e: + logger.error(f"Get parse_to_md status error: {e}", exc_info=True) + return jsonify({ + "code": 500, + "message": str(e) + }), 500 + + +@powerrag_bp.route("/parse_to_md/upload", methods=["POST"]) +@apikey_required +async def parse_to_md_upload(tenant_id): + """ + Parse uploaded file to Markdown WITHOUT chunking + + 直接上传文件并解析为 Markdown,不进行切分。 + + 支持的文件格式: + - PDF (.pdf) + - Office 文档 (.doc, .docx, .ppt, .pptx) + - 图片 (.jpg, .png) + - HTML (.html, .htm) + + Authentication: Requires RAGFlow API key in Authorization header (Bearer token) + + Request (multipart/form-data): + - file: File to parse (required) - supports PDF, Office (doc/docx/ppt/pptx), Images (jpg/png), HTML + - config: JSON string of parser config (optional) + + Config parameters: + - layout_recognize (str): mineru or dots_ocr (default: mineru) + - enable_ocr (bool): Enable OCR (default: false) + - enable_formula (bool): Enable formula recognition (default: false) + - enable_table (bool): Enable table recognition (default: true) + - from_page (int): Start page number (default: 0) + - to_page (int): End page number (default: 100000) + + Response JSON: + { + "code": 0, + "data": { + "filename": "document.pdf", + "markdown": "# Title\n\nContent...", + "markdown_length": 5000, + "images": {...}, + "total_images": 2 + }, + "message": "success" + } + """ 
+ try: + # Check if file is present + files = await request.files + if 'file' not in files: + return jsonify({ + "code": 400, + "message": "No file provided" + }), 400 + + file = files['file'] + if file.filename == '': + return jsonify({ + "code": 400, + "message": "No file selected" + }), 400 + + # Parse config from JSON string if provided + import json + form = await request.form + config_str = form.get('config', '{}') + try: + config = json.loads(config_str) + except json.JSONDecodeError: + return jsonify({ + "code": 400, + "message": "Invalid JSON in config parameter" + }), 400 + + # Read file binary + filename = file.filename + logger.info(f"Received file upload: filename={filename}, file object={file}") + + if not filename: + return jsonify({ + "code": 400, + "message": "Filename is required" + }), 400 + + binary = file.read() + if not binary: + return jsonify({ + "code": 400, + "message": "File is empty" + }), 400 + + # Add filename to config + config['filename'] = filename + + # Determine format type + from pathlib import Path + file_ext = Path(filename).suffix.lstrip('.').lower() + + logger.info(f"Parsed filename: {filename}, extension: '{file_ext}'") + + if not file_ext: + return jsonify({ + "code": 400, + "message": f"File must have an extension. Filename: '{filename}', parsed extension: '{file_ext}'" + }), 400 + + # Supported: PDF, Office (doc/docx/ppt/pptx), HTML, Markdown, Images (jpg/png) + format_type_map = { + 'pdf': 'pdf', + 'docx': 'office', 'doc': 'office', + 'xlsx': 'office', 'xls': 'office', + 'pptx': 'office', 'ppt': 'office', + 'html': 'html', 'htm': 'html', + 'jpg': 'image', 'jpeg': 'image', + 'png': 'image' + } + + format_type = format_type_map.get(file_ext) + if not format_type: + return jsonify({ + "code": 400, + "message": f"Unsupported file format: {file_ext}. 
Supported formats: pdf, doc, docx, ppt, pptx, jpg, png, html" + }), 400 + + # Create service and parse + gotenberg_url = config.get("gotenberg_url", GOTENBERG_URL) + service = PowerRAGParseService(gotenberg_url=gotenberg_url) + + # Parse to markdown + md_content, images = service._parse_to_markdown( + filename=filename, + binary=binary, + format_type=format_type, + config=config + ) + + # Return as JSON + return jsonify({ + "code": 0, + "data": { + "filename": filename, + "markdown": md_content, + "markdown_length": len(md_content), + "images": images, + "total_images": len(images) + }, + "message": "success" + }), 200 + + except Exception as e: + logger.error(f"Parse to markdown (upload) error: {e}", exc_info=True) + return jsonify({ + "code": 500, + "message": str(e) + }), 500 + + # ============================================================================ # 文档切片接口 # ============================================================================ @powerrag_bp.route("/split", methods=["POST"]) @apikey_required -def split_text(tenant_id): +async def split_text(tenant_id): """ Split text into chunks using powerrag/app chunking methods @@ -670,7 +1140,7 @@ def split_text(tenant_id): } """ try: - data = request.get_json() + data = await request.get_json() if not data: return jsonify({ @@ -717,7 +1187,7 @@ def split_text(tenant_id): @powerrag_bp.route("/extract", methods=["POST"]) @apikey_required -def extract_from_document(tenant_id): +async def extract_from_document(tenant_id): """ Extract information from document using PowerRAG extractors @@ -732,7 +1202,7 @@ def extract_from_document(tenant_id): } """ try: - data = request.get_json() + data = await request.get_json() if not data: return jsonify({ @@ -769,7 +1239,7 @@ def extract_from_document(tenant_id): @powerrag_bp.route("/extract/text", methods=["POST"]) @apikey_required -def extract_from_text(tenant_id): +async def extract_from_text(tenant_id): """ Extract information from raw text (no doc_id required) @@ -781,7 
+1251,7 @@ def extract_from_text(tenant_id): } """ try: - data = request.get_json() + data = await request.get_json() if not data: return jsonify({ @@ -817,7 +1287,8 @@ def extract_from_text(tenant_id): @powerrag_bp.route("/extract/batch", methods=["POST"]) -def extract_batch(tenant_id): +@apikey_required +async def extract_batch(tenant_id): """ Extract information from multiple documents @@ -829,7 +1300,7 @@ def extract_batch(tenant_id): } """ try: - data = request.get_json() + data = await request.get_json() if not data: return jsonify({ @@ -885,7 +1356,7 @@ def extract_batch(tenant_id): @powerrag_bp.route("/struct_extract/submit", methods=["POST"]) @apikey_required -def submit_extraction_task(tenant_id): +async def submit_extraction_task(tenant_id): """ Submit a langextract extraction task @@ -952,7 +1423,7 @@ def submit_extraction_task(tenant_id): } """ try: - data = request.get_json() + data = await request.get_json() if not data: return jsonify({ diff --git a/powerrag/server/routes/task_routes.py b/powerrag/server/routes/task_routes.py index dbc99c6fe..feda60fc7 100644 --- a/powerrag/server/routes/task_routes.py +++ b/powerrag/server/routes/task_routes.py @@ -22,7 +22,7 @@ """ import logging -from flask import Blueprint, request, jsonify +from quart import Blueprint, request, jsonify from powerrag.server.services.task_queue_service import PowerRAGTaskQueueService @@ -32,7 +32,7 @@ @task_bp.route("/parse/async", methods=["POST"]) -def parse_document_async(): +async def parse_document_async(): """ Create an async parsing task using task_executor @@ -62,7 +62,7 @@ def parse_document_async(): } """ try: - data = request.get_json() + data = await request.get_json() if not data: return jsonify({ @@ -112,7 +112,7 @@ def parse_document_async(): @task_bp.route("/task/", methods=["GET"]) -def get_task_status(task_id): +async def get_task_status(task_id): """ Get task status and progress @@ -151,7 +151,7 @@ def get_task_status(task_id): @task_bp.route("/task//cancel", 
methods=["POST"]) -def cancel_task(task_id): +async def cancel_task(task_id): """ Cancel a running task @@ -184,7 +184,7 @@ def cancel_task(task_id): @task_bp.route("/document//chunks", methods=["GET"]) -def get_document_chunks(doc_id): +async def get_document_chunks(doc_id): """ Get parsed chunks for a completed document diff --git a/powerrag/server/services/convert_service.py b/powerrag/server/services/convert_service.py index fc08cf275..25b72a760 100644 --- a/powerrag/server/services/convert_service.py +++ b/powerrag/server/services/convert_service.py @@ -281,8 +281,10 @@ def _html_to_pdf(self, binary: bytes, config: Dict[str, Any]) -> bytes: filename = config.get('filename', 'document.html') try: + # According to https://gotenberg.dev/docs/routes#html-file-into-pdf-route + # The file MUST be named "index.html" url = f"{self.gotenberg_url}/forms/chromium/convert/html" - files = {'files': (filename, io.BytesIO(binary))} + files = {'files': ('index.html', io.BytesIO(binary))} logger.info(f"Converting HTML document to PDF via Gotenberg: {filename}") response = requests.post(url, files=files, timeout=60) diff --git a/powerrag/server/services/extract_service.py b/powerrag/server/services/extract_service.py index 9f8cfa200..9e0db84c2 100644 --- a/powerrag/server/services/extract_service.py +++ b/powerrag/server/services/extract_service.py @@ -23,14 +23,25 @@ from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService -from common.settings import STORAGE_IMPL +from common import settings -# ⚠️ 延迟导入 PdfParser,避免启动时加载 OCR 模型 -# from deepdoc.parser import PdfParser as RAGFlowPdfParser logger = logging.getLogger(__name__) +def _ensure_storage_initialized(): + """Ensure STORAGE_IMPL is initialized before use""" + if settings.STORAGE_IMPL is None: + logger.warning("STORAGE_IMPL not initialized, calling init_settings()") + settings.init_settings() + + if settings.STORAGE_IMPL is None: + raise 
RuntimeError( + "STORAGE_IMPL is not initialized. Please ensure init_settings() " + "is called during application startup." + ) + + class PowerRAGExtractService: """Service for information extraction from documents""" @@ -55,6 +66,9 @@ def extract_from_document(self, doc_id: str, extractor_type: str, Dict containing extracted information and metadata """ try: + # Ensure storage is initialized + _ensure_storage_initialized() + # Get document exist, doc = DocumentService.get_by_id(doc_id) if not exist: @@ -62,7 +76,7 @@ def extract_from_document(self, doc_id: str, extractor_type: str, # Get binary data and extract text bucket, name = File2DocumentService.get_storage_address(doc_id=doc_id) - binary = STORAGE_IMPL.get(bucket, name) + binary = settings.STORAGE_IMPL.get(bucket, name) if not binary: raise ValueError(f"Document binary not found for {doc_id}") diff --git a/powerrag/server/services/parse_service.py b/powerrag/server/services/parse_service.py index 371edfe68..8e4f96abb 100644 --- a/powerrag/server/services/parse_service.py +++ b/powerrag/server/services/parse_service.py @@ -580,3 +580,100 @@ def parse_docs_batch(self, doc_ids: List[str], parser_type: str = None, return results + def _parse_to_markdown_for_task(self, doc_id: str = None, filename: str = None, + binary: bytes = None, format_type: str = None, + config: Dict[str, Any] = None) -> Dict[str, Any]: + """ + Parse document to Markdown for async task execution + + This is a wrapper method used by ParseToMdTaskManager for async execution. + It handles both doc_id-based and direct binary-based parsing. 
+ + Args: + doc_id: Document ID (for database lookup) + filename: Filename (for direct binary parsing) + binary: Binary data (for direct binary parsing) + format_type: Format type (for direct binary parsing) + config: Parser configuration + + Returns: + Dict with parsed results: + { + "doc_id": "...", + "doc_name": "...", + "markdown": "...", + "markdown_length": 5000, + "images": {...}, + "total_images": 2 + } + """ + if config is None: + config = {} + + # Case 1: Parse from doc_id (from database) + if doc_id: + # Get document from database + exist, doc = DocumentService.get_by_id(doc_id) + if not exist: + raise ValueError(f"Document {doc_id} not found") + + # Get binary data from storage + bucket, name = File2DocumentService.get_storage_address(doc_id=doc_id) + + if not bucket or not name: + raise ValueError(f"Invalid storage address for document {doc_id}: bucket={bucket}, name={name}") + + storage = STORAGE_IMPL + + if not storage: + raise ValueError("Storage implementation not available") + + try: + binary = storage.get(bucket, name) + if not binary: + raise ValueError(f"Document binary data not found in storage: bucket={bucket}, name={name}") + except Exception as e: + logger.error(f"Failed to get binary for doc {doc_id}: {e}", exc_info=True) + raise ValueError(f"Failed to retrieve document binary: {e}") + + # Determine format + file_ext = Path(doc.name).suffix.lstrip('.').lower() + format_type_map = { + 'pdf': 'pdf', 'docx': 'office', 'doc': 'office', + 'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office', + 'html': 'html', 'htm': 'html', + 'jpg': 'image', 'jpeg': 'image', 'png': 'image' + } + format_type = format_type_map.get(file_ext, 'pdf') + filename = doc.name + + # Case 2: Parse from direct binary (filename, binary, format_type provided) + elif filename and binary is not None and format_type: + doc_id = None + else: + raise ValueError("Must provide either doc_id or (filename, binary, format_type)") + + # Parse to markdown + md_content, 
images = self._parse_to_markdown( + filename=filename, + binary=binary, + format_type=format_type, + config=config + ) + + # Prepare result + result = { + "markdown": md_content, + "markdown_length": len(md_content), + "images": images, + "total_images": len(images) if images else 0 + } + + if doc_id: + result["doc_id"] = doc_id + result["doc_name"] = filename + else: + result["doc_name"] = filename + + return result + diff --git a/powerrag/server/services/parse_to_md_task_manager.py b/powerrag/server/services/parse_to_md_task_manager.py new file mode 100644 index 000000000..ce838fd11 --- /dev/null +++ b/powerrag/server/services/parse_to_md_task_manager.py @@ -0,0 +1,237 @@ +""" +Parse to Markdown Task Manager + +Manages async tasks for parse_to_md operations. +Provides task submission, status tracking, and result retrieval. +""" + +import uuid +import threading +import logging +from datetime import datetime +from typing import Dict, Any, Optional +from enum import Enum +from concurrent.futures import ThreadPoolExecutor + + +logger = logging.getLogger(__name__) + + +class TaskStatus(Enum): + """Task status enum""" + PENDING = "pending" + PROCESSING = "processing" + SUCCESS = "success" + FAILED = "failed" + NOT_FOUND = "not_found" + + +class ParseToMdTaskManager: + """ + Singleton task manager for parse_to_md async operations. 
+ + Features: + - Thread-safe task storage + - Async task execution with thread pool + - Task status tracking + - Result caching (max 1000 completed tasks) + """ + + _instance = None + _lock = threading.Lock() + + def __new__(cls): + if cls._instance is None: + with cls._lock: + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance._initialized = False + return cls._instance + + def __init__(self): + if self._initialized: + return + + self._initialized = True + self.tasks = {} # task_id -> task_info + self.tasks_lock = threading.Lock() + + # Thread pool for async execution (max 4 concurrent tasks) + self.executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="parse_to_md_worker") + + # Max cached completed tasks (to prevent memory leak) + self.max_cached_tasks = 1000 + + logger.info("ParseToMdTaskManager initialized") + + def submit_task( + self, + service, + method_name: str, + **kwargs + ) -> str: + """ + Submit a parse_to_md task for async execution + + Args: + service: The ParseService instance + method_name: Method name to call ("parse_to_md" or "parse_to_md_upload") + **kwargs: Arguments to pass to the method + + Returns: + task_id: Unique task identifier + """ + task_id = str(uuid.uuid4()) + + with self.tasks_lock: + # Clean up old tasks if needed + if len(self.tasks) > self.max_cached_tasks: + self._cleanup_old_tasks() + + # Create task info + self.tasks[task_id] = { + "task_id": task_id, + "status": TaskStatus.PENDING.value, + "created_at": datetime.now().isoformat(), + "updated_at": datetime.now().isoformat(), + "method": method_name, + "kwargs": kwargs, + "result": None, + "error": None + } + + # Submit to thread pool + future = self.executor.submit(self._execute_task, task_id, service, method_name, kwargs) + + logger.info(f"Task {task_id} submitted for {method_name}") + + return task_id + + def _execute_task( + self, + task_id: str, + service, + method_name: str, + kwargs: Dict[str, Any] + ): + """ + Execute the parse 
task in background thread + + Args: + task_id: Task ID + service: ParseService instance + method_name: Method to call + kwargs: Method arguments + """ + try: + # Update status to processing + self._update_task_status(task_id, TaskStatus.PROCESSING) + + # Call the actual method + if method_name == "parse_to_md": + result = service._parse_to_markdown_for_task(**kwargs) + elif method_name == "parse_to_md_upload": + result = service._parse_to_markdown_for_task(**kwargs) + else: + raise ValueError(f"Unknown method: {method_name}") + + # Update with success result + with self.tasks_lock: + if task_id in self.tasks: + self.tasks[task_id].update({ + "status": TaskStatus.SUCCESS.value, + "updated_at": datetime.now().isoformat(), + "result": result + }) + + logger.info(f"Task {task_id} completed successfully") + + except Exception as e: + # Update with error + logger.error(f"Task {task_id} failed: {e}", exc_info=True) + + with self.tasks_lock: + if task_id in self.tasks: + self.tasks[task_id].update({ + "status": TaskStatus.FAILED.value, + "updated_at": datetime.now().isoformat(), + "error": str(e) + }) + + def get_task_status(self, task_id: str) -> Dict[str, Any]: + """ + Get task status and result + + Args: + task_id: Task ID + + Returns: + Task information dict + """ + with self.tasks_lock: + task = self.tasks.get(task_id) + + if not task: + return { + "task_id": task_id, + "status": TaskStatus.NOT_FOUND.value + } + + # Return a copy to avoid external modifications + return { + "task_id": task["task_id"], + "status": task["status"], + "created_at": task["created_at"], + "updated_at": task["updated_at"], + "result": task.get("result"), + "error": task.get("error") + } + + def _update_task_status(self, task_id: str, status: TaskStatus): + """Update task status""" + with self.tasks_lock: + if task_id in self.tasks: + self.tasks[task_id].update({ + "status": status.value, + "updated_at": datetime.now().isoformat() + }) + + def _cleanup_old_tasks(self): + """ + Clean up old 
completed/failed tasks to prevent memory leak. + Keeps only the most recent tasks. + """ + # Get completed/failed tasks + completed_tasks = [ + (tid, t["updated_at"]) + for tid, t in self.tasks.items() + if t["status"] in [TaskStatus.SUCCESS.value, TaskStatus.FAILED.value] + ] + + # Sort by updated_at (oldest first) + completed_tasks.sort(key=lambda x: x[1]) + + # Remove oldest 20% of tasks + num_to_remove = max(1, len(completed_tasks) // 5) + for i in range(num_to_remove): + task_id = completed_tasks[i][0] + del self.tasks[task_id] + logger.debug(f"Cleaned up old task {task_id}") + + def shutdown(self): + """Shutdown the task manager and thread pool""" + logger.info("Shutting down ParseToMdTaskManager") + self.executor.shutdown(wait=True) + + +# Singleton instance +_task_manager = None + + +def get_task_manager() -> ParseToMdTaskManager: + """Get the singleton task manager instance""" + global _task_manager + if _task_manager is None: + _task_manager = ParseToMdTaskManager() + return _task_manager + diff --git a/powerrag/server/services/split_service.py b/powerrag/server/services/split_service.py index 5e4e45eee..e4b535938 100644 --- a/powerrag/server/services/split_service.py +++ b/powerrag/server/services/split_service.py @@ -55,7 +55,7 @@ def _init_chunker_factory(self): """动态导入chunker模块,避免循环导入""" global CHUNKER_FACTORY if not CHUNKER_FACTORY: - global regex_based_chunking, title_based_chunking, smart_based_chunking + # 直接引用同一模块中定义的函数 CHUNKER_FACTORY.update({ ParserType.TITLE.value: title_based_chunking, # PowerRAG Title Chunker ParserType.REGEX.value: regex_based_chunking, # PowerRAG regex Chunker From f8d2bd59d4e8121c94dbb9528c57d6dac902e79a Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Mon, 5 Jan 2026 21:24:18 +0800 Subject: [PATCH 02/19] feat: add GitHub Actions workflow for Python package publishing and initial SDK configuration --- .github/workflows/python-publish.yml | 71 ++++++++++++++++++++++++++++ powerrag/sdk/MANIFEST.in | 6 +++ 
powerrag/sdk/README.md | 2 +- powerrag/sdk/pyproject.toml | 45 ++++++++++++++++++ 4 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/python-publish.yml create mode 100644 powerrag/sdk/MANIFEST.in create mode 100644 powerrag/sdk/pyproject.toml diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 000000000..1ebf223ca --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,71 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + release-build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install Poetry + run: | + pip install poetry + + - name: Set version from release tag + working-directory: ./powerrag/sdk + run: | + # Extract version from tag (e.g., sdk-v0.1.0 -> 0.1.0 or v0.1.0 -> 0.1.0) + VERSION="${{ github.ref_name }}" + VERSION="${VERSION#v}" + echo "Setting version to: $VERSION" + poetry version "$VERSION" + cat pyproject.toml | grep "^version" + + - name: Install dependencies + working-directory: ./powerrag/sdk + run: | + poetry install + + - name: Build package + working-directory: ./powerrag/sdk + run: | + poetry build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: powerrag/sdk/dist/ + + pypi-publish: + runs-on: ubuntu-latest + needs: + - release-build + permissions: + id-token: write + + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: release-dists + path: dist/ + + - name: Publish release distributions to PyPI + uses: 
pypa/gh-action-pypi-publish@release/v1 + diff --git a/powerrag/sdk/MANIFEST.in b/powerrag/sdk/MANIFEST.in new file mode 100644 index 000000000..fc8223e29 --- /dev/null +++ b/powerrag/sdk/MANIFEST.in @@ -0,0 +1,6 @@ +include README.md +include ../../LICENSE +recursive-include powerrag/sdk *.py +recursive-exclude powerrag/sdk/__pycache__ * +recursive-exclude powerrag/sdk/tests * + diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md index 81b69be76..0fac6a2ce 100644 --- a/powerrag/sdk/README.md +++ b/powerrag/sdk/README.md @@ -43,7 +43,7 @@ pip install -e . ### 依赖要求 -- Python 3.8+ +- Python 3.10+ - requests >= 2.28.0 - typing-extensions (Python < 3.11) diff --git a/powerrag/sdk/pyproject.toml b/powerrag/sdk/pyproject.toml new file mode 100644 index 000000000..89943ba45 --- /dev/null +++ b/powerrag/sdk/pyproject.toml @@ -0,0 +1,45 @@ +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "powerrag-sdk" +version = "0.0.0" # Version will be set automatically from GitHub release tag +description = "A Python SDK for PowerRAG API, providing easy-to-use interfaces for knowledge base management, document processing, chunking, extraction, RAPTOR, knowledge graph, and retrieval." 
+authors = ["OceanBase Team "] +license = "Apache-2.0" +readme = "README.md" +homepage = "https://github.com/oceanbase/powerrag" +repository = "https://github.com/oceanbase/powerrag" +documentation = "https://github.com/oceanbase/powerrag/docs" +keywords = ["powerrag", "rag", "llm", "sdk", "knowledge-base", "document-processing"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries :: Python Modules", +] + +# Include SDK modules using relative path from repository root +packages = [ + { include = "powerrag/sdk", from = "../.." }, +] + +[tool.poetry.dependencies] +python = ">=3.9,<4.0" +requests = ">=2.28.0" +typing-extensions = { version = ">=4.0.0", python = "<3.11" } + +[tool.poetry.group.dev.dependencies] +pytest = ">=8.3.5" +requests-toolbelt = ">=1.0.0" + +[tool.poetry.urls] +"Bug Tracker" = "https://github.com/oceanbase/powerrag/issues" +"Documentation" = "https://github.com/oceanbase/powerrag/docs" +"Source Code" = "https://github.com/oceanbase/powerrag" From 35ba7e221fd46f06fbc2b1fe73fb802e5b5e5695 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Tue, 6 Jan 2026 11:05:30 +0800 Subject: [PATCH 03/19] chore: update GitHub Actions workflow for SDK publishing and refine package configuration --- .github/workflows/python-publish.yml | 15 ++++++++------- .gitignore | 3 +++ powerrag/sdk/MANIFEST.in | 6 ------ powerrag/sdk/pyproject.toml | 15 +++++++++++++-- 4 files changed, 24 insertions(+), 15 deletions(-) delete mode 100644 powerrag/sdk/MANIFEST.in diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 1ebf223ca..54eed1288 100644 --- a/.github/workflows/python-publish.yml +++ 
b/.github/workflows/python-publish.yml @@ -26,10 +26,13 @@ jobs: run: | pip install poetry - - name: Set version from release tag - working-directory: ./powerrag/sdk + - name: Copy pyproject.toml to root and set version run: | - # Extract version from tag (e.g., sdk-v0.1.0 -> 0.1.0 or v0.1.0 -> 0.1.0) + # Copy SDK pyproject.toml to repository root for building + cp powerrag/sdk/pyproject.toml . + cp powerrag/sdk/README.md . + + # Extract version from tag (e.g., v0.1.0 -> 0.1.0) VERSION="${{ github.ref_name }}" VERSION="${VERSION#v}" echo "Setting version to: $VERSION" @@ -37,12 +40,10 @@ jobs: cat pyproject.toml | grep "^version" - name: Install dependencies - working-directory: ./powerrag/sdk run: | - poetry install + poetry install --only main - name: Build package - working-directory: ./powerrag/sdk run: | poetry build @@ -50,7 +51,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: release-dists - path: powerrag/sdk/dist/ + path: dist/ pypi-publish: runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 1e4b1642b..3fbd4dea9 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,9 @@ api/flask_session # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html Cargo.lock +# Poetry lock file - exclude for library/SDK projects (similar to Cargo.lock for libraries) +powerrag/sdk/poetry.lock + # These are backup files generated by rustfmt **/*.rs.bk diff --git a/powerrag/sdk/MANIFEST.in b/powerrag/sdk/MANIFEST.in deleted file mode 100644 index fc8223e29..000000000 --- a/powerrag/sdk/MANIFEST.in +++ /dev/null @@ -1,6 +0,0 @@ -include README.md -include ../../LICENSE -recursive-include powerrag/sdk *.py -recursive-exclude powerrag/sdk/__pycache__ * -recursive-exclude powerrag/sdk/tests * - diff --git a/powerrag/sdk/pyproject.toml b/powerrag/sdk/pyproject.toml index 89943ba45..219bb3585 100644 --- a/powerrag/sdk/pyproject.toml +++ b/powerrag/sdk/pyproject.toml @@ -25,9 +25,20 @@ classifiers = [ "Topic :: Software 
Development :: Libraries :: Python Modules", ] -# Include SDK modules using relative path from repository root +# Package configuration +# IMPORTANT: Build must be run from repository root (not from powerrag/sdk/) +# This ensures the package path 'powerrag.sdk' is correctly resolved +# Build workflow: GitHub Action copies this file to root and runs poetry build +# Import path: from powerrag.sdk import PowerRAGClient packages = [ - { include = "powerrag/sdk", from = "../.." }, + { include = "powerrag/sdk" }, +] + +# Exclude patterns (Poetry automatically excludes __pycache__, *.pyc, tests, etc.) +exclude = [ + "powerrag/sdk/tests", + "powerrag/sdk/**/__pycache__", + "powerrag/sdk/**/*.pyc", ] [tool.poetry.dependencies] From 632532887591f4d5beb0f1df1d6a44fe8759e320 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Tue, 6 Jan 2026 11:07:02 +0800 Subject: [PATCH 04/19] chore: update Python version requirement in pyproject.toml to support 3.10 --- powerrag/sdk/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/powerrag/sdk/pyproject.toml b/powerrag/sdk/pyproject.toml index 219bb3585..dd0737693 100644 --- a/powerrag/sdk/pyproject.toml +++ b/powerrag/sdk/pyproject.toml @@ -42,7 +42,7 @@ exclude = [ ] [tool.poetry.dependencies] -python = ">=3.9,<4.0" +python = ">=3.10,<4.0" requests = ">=2.28.0" typing-extensions = { version = ">=4.0.0", python = "<3.11" } From 79d1294d5ec5ea79608ba96b55cc2b20c118eb15 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Tue, 6 Jan 2026 11:15:45 +0800 Subject: [PATCH 05/19] chore: add environment configuration for PyPI in GitHub Actions workflow --- .github/workflows/python-publish.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 54eed1288..c53b42646 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -59,6 +59,9 @@ jobs: - release-build permissions: id-token: write + environment: + 
name: pypi + url: https://pypi.org/project/powerrag-sdk/ steps: - name: Retrieve release distributions From 16a0245a546e788443ae707e8cc7acba83c79aa5 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Tue, 6 Jan 2026 16:52:53 +0800 Subject: [PATCH 06/19] docs: update SDK README.md --- powerrag/sdk/README.md | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md index 0fac6a2ce..d2c470b87 100644 --- a/powerrag/sdk/README.md +++ b/powerrag/sdk/README.md @@ -8,39 +8,19 @@ PowerRAG SDK 是一个功能强大的 Python SDK,为 PowerRAG API 提供了简 ## 特性 - 🚀 **简单易用**: 面向对象的 API 设计,直观的方法调用 -- 📚 **完整功能**: 支持 PowerRAG 所有核心功能模块 +- 📚 **完整功能**: 支持 PowerRAG 所有核心功能模块,包括文档上传/解析/切片/提取/Raptor构建/知识库graph构建 - 🔄 **异步支持**: 支持异步任务的状态查询和轮询等待 - 📦 **批量操作**: 支持批量上传、删除、抽取等操作 - 📝 **Markdown 解析**: 支持文档解析为 Markdown 格式(同步/异步) -- 🎯 **类型提示**: 完整的类型注解,IDE 友好 -- ✅ **全面测试**: 包含完整的测试用例 ## 安装 -### 方式 1: 使用 pip(推荐) +### 使用 pip ```bash pip install powerrag-sdk ``` -### 方式 2: 从源码安装 - -```bash -git clone https://github.com/oceanbase/powerrag.git -cd powerrag -pip install -e . -``` - -### 方式 3: 仅安装 SDK 模块 - -如果你只需要 SDK 功能: - -```bash -git clone https://github.com/oceanbase/powerrag.git -cd powerrag/powerrag/sdk -pip install -e . -``` - ### 依赖要求 - Python 3.10+ From f15d0ef4f3ae8af9c15da73811604ecd7b7f862d Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Mon, 12 Jan 2026 21:07:45 +0800 Subject: [PATCH 07/19] feat(document): add binary file parsing to Markdown method --- powerrag/__init__.py | 21 --- powerrag/sdk/modules/document_manager.py | 76 +++++++++++ powerrag/sdk/tests/test_document.py | 162 +++++++++++++++++++++++ 3 files changed, 238 insertions(+), 21 deletions(-) diff --git a/powerrag/__init__.py b/powerrag/__init__.py index 1ea6623ac..e48fb89ee 100644 --- a/powerrag/__init__.py +++ b/powerrag/__init__.py @@ -21,27 +21,6 @@ that can be integrated into the RAGFlow pipeline system. 
""" -# Import all PowerRAG components for pipeline integration -from .flow.parsers.powerrag_parsers import ( - PDFParser, - PDFParserParam, -) - -from .flow.splitters.powerrag_splitters import ( - TitleBasedSplitter, - TitleBasedSplitterParam, -) - -from .flow.extractors.powerrag_extractors import ( - EntityExtractor, - EntityExtractorParam, -) - -from .flow.converters.powerrag_converters import ( - DocumentToPDF, - DocumentToPDFParam, -) - # Export all components for pipeline registration __all__ = [ # Parsers diff --git a/powerrag/sdk/modules/document_manager.py b/powerrag/sdk/modules/document_manager.py index 34971524a..c9978092f 100644 --- a/powerrag/sdk/modules/document_manager.py +++ b/powerrag/sdk/modules/document_manager.py @@ -681,6 +681,82 @@ def parse_to_md_upload( return res_json.get("data", {}) + def parse_to_md_binary( + self, + file_binary: bytes, + filename: str, + config: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """ + 直接使用文件二进制内容解析为Markdown(不切分) + + 使用文件二进制数据解析为 Markdown 格式,不进行切分。 + 适用于文件已在内存中或从其他来源获取的场景。 + + 支持的文件格式: + - PDF (.pdf) + - Office 文档 (.doc, .docx, .ppt, .pptx) + - 图片 (.jpg, .png) + - HTML (.html, .htm) + + Args: + file_binary: 文件的二进制内容 + filename: 文件名(必须包含正确的扩展名) + config: 解析配置(可选),同 parse_to_md + - layout_recognize: 布局识别引擎 (mineru 或 dots_ocr,默认 mineru) + - enable_formula: 是否识别公式 (默认 False) + - enable_table: 是否识别表格 (默认 True) + - from_page: 起始页(仅 PDF,默认 0) + - to_page: 结束页(仅 PDF,默认 100000) + + Returns: + 解析结果字典,包含以下字段: + - filename: 文件名 + - markdown: Markdown 内容 + - markdown_length: Markdown 长度 + - images: 图片字典 (base64) + - total_images: 图片总数 + + Raises: + ValueError: 文件名或二进制数据无效 + Exception: API调用失败 + + Example: + >>> with open("document.pdf", "rb") as f: + ... file_binary = f.read() + >>> result = doc_manager.parse_to_md_binary( + ... file_binary=file_binary, + ... filename="document.pdf", + ... config={"layout_recognize": "mineru", "enable_ocr": True} + ... 
) + >>> print(result['markdown']) + >>> print(f"Parsed {result['total_images']} images") + """ + if not file_binary: + raise ValueError("file_binary cannot be empty") + if not filename: + raise ValueError("filename cannot be empty") + + # Prepare files from binary data + files = [("file", (filename, file_binary))] + + # Prepare form data + import json + form_data = {} + if config: + form_data["config"] = json.dumps(config) + + url = "/powerrag/parse_to_md/upload" + res = self.client.post(url, json=None, files=files, data=form_data) + + # Parse JSON response + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", "Parse to markdown (binary) failed")) + + return res_json.get("data", {}) + def parse_url( self, kb_id: str, diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py index 3aa1e098b..fad327cd4 100644 --- a/powerrag/sdk/tests/test_document.py +++ b/powerrag/sdk/tests/test_document.py @@ -426,5 +426,167 @@ def test_parse_to_md_upload_different_formats(self, client: PowerRAGClient, test # 这里我们只测试 txt 文件,实际使用时可以添加更多格式 result = client.document.parse_to_md_upload(test_file_path) + assert "markdown" in result + assert result["markdown_length"] > 0 + + +class TestDocumentParseToMDBinary: + """测试使用二进制文件解析为 Markdown""" + + def test_parse_to_md_binary_basic(self, client: PowerRAGClient, test_file_path: str): + """测试基本的二进制文件解析功能""" + # 读取文件为二进制 + with open(test_file_path, "rb") as f: + file_binary = f.read() + + # 使用二进制数据解析 + result = client.document.parse_to_md_binary( + file_binary=file_binary, + filename="test_document.html" + ) + + # 验证返回结果 + assert "filename" in result + assert "markdown" in result + assert "markdown_length" in result + assert "images" in result + assert "total_images" in result + assert isinstance(result["markdown"], str) + assert result["markdown_length"] > 0 + + def test_parse_to_md_binary_with_config(self, client: PowerRAGClient, test_file_path: str): + 
"""测试带配置参数的二进制文件解析""" + with open(test_file_path, "rb") as f: + file_binary = f.read() + + config = { + "layout_recognize": "mineru", + "enable_ocr": False, + "enable_table": True + } + + result = client.document.parse_to_md_binary( + file_binary=file_binary, + filename="test_document.html", + config=config + ) + + assert "markdown" in result + assert len(result["markdown"]) > 0 + assert result["markdown_length"] > 0 + + def test_parse_to_md_binary_empty_content(self, client: PowerRAGClient): + """测试空的二进制内容""" + with pytest.raises(ValueError) as exc_info: + client.document.parse_to_md_binary( + file_binary=b"", + filename="test.html" + ) + + assert "cannot be empty" in str(exc_info.value).lower() + + def test_parse_to_md_binary_empty_filename(self, client: PowerRAGClient): + """测试空的文件名""" + with pytest.raises(ValueError) as exc_info: + client.document.parse_to_md_binary( + file_binary=b"test content", + filename="" + ) + + assert "cannot be empty" in str(exc_info.value).lower() + + def test_parse_to_md_binary_different_file_types(self, client: PowerRAGClient, tmp_path): + """测试不同文件类型的二进制解析""" + # 测试 HTML 文件 + html_file = tmp_path / "test.html" + html_content = "

Test

Content

" + html_file.write_text(html_content) + + with open(html_file, "rb") as f: + file_binary = f.read() + + result = client.document.parse_to_md_binary( + file_binary=file_binary, + filename="test.html" + ) + + assert "markdown" in result + assert result["markdown_length"] > 0 + + def test_parse_to_md_binary_with_images(self, client: PowerRAGClient, test_file_path: str): + """测试解析带图片的文档(二进制)""" + with open(test_file_path, "rb") as f: + file_binary = f.read() + + result = client.document.parse_to_md_binary( + file_binary=file_binary, + filename="test_document.html" + ) + + # 验证图片相关字段 + assert "images" in result + assert "total_images" in result + assert isinstance(result["images"], dict) + assert isinstance(result["total_images"], int) + assert result["total_images"] >= 0 + + def test_parse_to_md_binary_filename_with_extension(self, client: PowerRAGClient, test_file_path: str): + """测试文件名必须包含扩展名""" + with open(test_file_path, "rb") as f: + file_binary = f.read() + + # 测试带正确扩展名的文件名 + result = client.document.parse_to_md_binary( + file_binary=file_binary, + filename="document.html" + ) + + assert result["filename"] == "document.html" + assert "markdown" in result + + def test_parse_to_md_binary_large_file(self, client: PowerRAGClient, tmp_path): + """测试较大文件的二进制解析""" + # 创建一个相对较大的测试文件 + large_file = tmp_path / "large_test.html" + large_content = "" + "

Test paragraph.

" * 1000 + "" + large_file.write_text(large_content) + + with open(large_file, "rb") as f: + file_binary = f.read() + + result = client.document.parse_to_md_binary( + file_binary=file_binary, + filename="large_test.html" + ) + + assert "markdown" in result + assert result["markdown_length"] > 0 + # 验证内容长度合理 + assert len(result["markdown"]) > 1000 + + def test_parse_to_md_binary_utf8_content(self, client: PowerRAGClient, tmp_path): + """测试包含UTF-8字符的文件""" + utf8_file = tmp_path / "utf8_test.html" + utf8_content = """ + + + UTF-8测试 + +

中文标题

+

这是中文内容。

+

English content with special chars: é, ñ, ü

+ + + """ + utf8_file.write_text(utf8_content, encoding="utf-8") + + with open(utf8_file, "rb") as f: + file_binary = f.read() + + result = client.document.parse_to_md_binary( + file_binary=file_binary, + filename="utf8_test.html" + ) + assert "markdown" in result assert result["markdown_length"] > 0 \ No newline at end of file From 7732022c323183dd4438b2d052e47aa6c3a258a0 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Tue, 13 Jan 2026 14:25:28 +0800 Subject: [PATCH 08/19] refactor(document_manager): centralize parse to markdown upload logic --- powerrag/sdk/modules/document_manager.py | 82 +++++++++++++----------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/powerrag/sdk/modules/document_manager.py b/powerrag/sdk/modules/document_manager.py index c9978092f..f6bea1f6a 100644 --- a/powerrag/sdk/modules/document_manager.py +++ b/powerrag/sdk/modules/document_manager.py @@ -17,6 +17,7 @@ from typing import Optional, List, Dict, Any, Union from pathlib import Path from .document import DocumentInfo +import json class DocumentManager: @@ -615,6 +616,45 @@ def parse_to_md( return res_json.get("data", {}) + def _parse_to_md_with_binary( + self, + file_binary: bytes, + filename: str, + config: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """ + Internal helper method to parse file binary to Markdown + + Args: + file_binary: Binary content of the file + filename: Name of the file (must include correct extension) + config: Parse configuration (optional) + + Returns: + Parse result dictionary + + Raises: + Exception: API call failed + """ + # Prepare files from binary data + files = [("file", (filename, file_binary))] + + # Prepare form data + form_data = {} + if config: + form_data["config"] = json.dumps(config) + + url = "/powerrag/parse_to_md/upload" + res = self.client.post(url, json=None, files=files, data=form_data) + + # Parse JSON response + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", 
"Parse to markdown failed")) + + return res_json.get("data", {}) + def parse_to_md_upload( self, file_path: str, @@ -660,26 +700,11 @@ def parse_to_md_upload( if not path.exists(): raise FileNotFoundError(f"File not found: {file_path}") - # Prepare files + # Read file and delegate to helper method with open(path, "rb") as f: - files = [("file", (path.name, f.read()))] + file_binary = f.read() - # Prepare form data - import json - form_data = {} - if config: - form_data["config"] = json.dumps(config) - - url = "/powerrag/parse_to_md/upload" - res = self.client.post(url, json=None, files=files, data=form_data) - - # Parse JSON response - res_json = res.json() - - if res_json.get("code") != 0: - raise Exception(res_json.get("message", "Parse to markdown (upload) failed")) - - return res_json.get("data", {}) + return self._parse_to_md_with_binary(file_binary, path.name, config) def parse_to_md_binary( self, @@ -737,25 +762,8 @@ def parse_to_md_binary( if not filename: raise ValueError("filename cannot be empty") - # Prepare files from binary data - files = [("file", (filename, file_binary))] - - # Prepare form data - import json - form_data = {} - if config: - form_data["config"] = json.dumps(config) - - url = "/powerrag/parse_to_md/upload" - res = self.client.post(url, json=None, files=files, data=form_data) - - # Parse JSON response - res_json = res.json() - - if res_json.get("code") != 0: - raise Exception(res_json.get("message", "Parse to markdown (binary) failed")) - - return res_json.get("data", {}) + # Delegate to helper method + return self._parse_to_md_with_binary(file_binary, filename, config) def parse_url( self, From 33a05200d2c95c48490df20a08d9fd7cf45738d4 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Tue, 13 Jan 2026 15:40:04 +0800 Subject: [PATCH 09/19] refactor(init): remove module docstring and __all__ exports --- powerrag/__init__.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/powerrag/__init__.py 
b/powerrag/__init__.py index e48fb89ee..a1a24464a 100644 --- a/powerrag/__init__.py +++ b/powerrag/__init__.py @@ -13,29 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -""" -PowerRAG - Advanced RAG Components for RAGFlow Pipeline - -This module provides advanced parsing, splitting, and extraction components -that can be integrated into the RAGFlow pipeline system. -""" - -# Export all components for pipeline registration -__all__ = [ - # Parsers - "PDFParser", - "PDFParserParam", - - # Splitters - "TitleBasedSplitter", - "TitleBasedSplitterParam", - - # Extractors - "EntityExtractor", - "EntityExtractorParam", - - # Converters - "DocumentToPDF", - "DocumentToPDFParam", -] \ No newline at end of file From 399d17ae14a2c517e93574c9aba8bd0c4039830a Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Tue, 20 Jan 2026 19:59:02 +0800 Subject: [PATCH 10/19] chore(docker): add GOTENBERG server environment variables --- docker/.env.example | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker/.env.example b/docker/.env.example index ac71518a9..08a832e49 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -61,8 +61,12 @@ MINERU_BACKEND="pipeline" # MinerU VLM server url, required when backend is 'vlm-http-client' MINERU_VLM_URL= +# GOTENBERG server host +GOTENBERG_HOST="gotenberg" +# GOTENBERG server port +GOTENBERG_PORT=3000 # GOTENBERG server url -GOTENBERG_URL= +GOTENBERG_URL=http://${GOTENBERG_HOST}:${GOTENBERG_PORT} # dots.ocr server url DOTS_OCR_URL= From f28c27213421005b839034e9db8b3f5c1fa6231c Mon Sep 17 00:00:00 2001 From: He Wang Date: Wed, 21 Jan 2026 15:56:30 +0800 Subject: [PATCH 11/19] Update docker/.env.example Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docker/.env.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/.env.example b/docker/.env.example index 08a832e49..a6968b96c 100644 --- 
a/docker/.env.example +++ b/docker/.env.example @@ -62,7 +62,7 @@ MINERU_BACKEND="pipeline" MINERU_VLM_URL= # GOTENBERG server host -GOTENBERG_HOST="gotenberg" +GOTENBERG_HOST=gotenberg # GOTENBERG server port GOTENBERG_PORT=3000 # GOTENBERG server url From e9780edb40a3db6af0bf9f740510fd63c4fa4015 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Fri, 23 Jan 2026 14:21:16 +0800 Subject: [PATCH 12/19] feat(document): add input_type parameter for file type detection --- powerrag/sdk/modules/document_manager.py | 69 +++++++-- powerrag/sdk/tests/.env.example | 8 + powerrag/sdk/tests/conftest.py | 14 +- powerrag/sdk/tests/pytest.ini | 11 ++ powerrag/sdk/tests/test_document.py | 177 ++++++++++++++++++++++ powerrag/server/routes/powerrag_routes.py | 80 +++++++--- powerrag/server/services/parse_service.py | 72 ++++++--- powerrag/utils/file_utils.py | 65 ++++++++ 8 files changed, 437 insertions(+), 59 deletions(-) create mode 100644 powerrag/sdk/tests/.env.example diff --git a/powerrag/sdk/modules/document_manager.py b/powerrag/sdk/modules/document_manager.py index f6bea1f6a..08b58fea4 100644 --- a/powerrag/sdk/modules/document_manager.py +++ b/powerrag/sdk/modules/document_manager.py @@ -621,14 +621,18 @@ def _parse_to_md_with_binary( file_binary: bytes, filename: str, config: Optional[Dict[str, Any]] = None, + input_type: str = 'auto', ) -> Dict[str, Any]: """ Internal helper method to parse file binary to Markdown Args: file_binary: Binary content of the file - filename: Name of the file (must include correct extension) + filename: Name of the file config: Parse configuration (optional) + input_type: File type detection mode (default: 'auto'). 
Can be: + - 'auto': Try filename extension first, then auto-detect from binary if no extension (default) + - 'pdf', 'office', 'html', 'image': Explicit file type (bypass detection) Returns: Parse result dictionary @@ -642,7 +646,16 @@ def _parse_to_md_with_binary( # Prepare form data form_data = {} if config: - form_data["config"] = json.dumps(config) + # Add input_type to config if it's not 'auto' (since 'auto' is the default) + if input_type != 'auto': + config_copy = config.copy() + config_copy['input_type'] = input_type + form_data["config"] = json.dumps(config_copy) + else: + form_data["config"] = json.dumps(config) + elif input_type != 'auto': + # Create config with just input_type if not default + form_data["config"] = json.dumps({"input_type": input_type}) url = "/powerrag/parse_to_md/upload" res = self.client.post(url, json=None, files=files, data=form_data) @@ -659,6 +672,7 @@ def parse_to_md_upload( self, file_path: str, config: Optional[Dict[str, Any]] = None, + input_type: str = 'auto', ) -> Dict[str, Any]: """ 上传文件并解析为Markdown(不切分) @@ -675,6 +689,9 @@ def parse_to_md_upload( Args: file_path: 文件路径 config: 解析配置(可选),同 parse_to_md + input_type: 文件类型识别模式(默认: 'auto'),支持: + - 'auto': 优先使用文件扩展名,无扩展名或不支持时自动识别(默认) + - 'pdf', 'office', 'html', 'image': 显式指定文件类型(跳过识别) Returns: 解析结果字典,包含以下字段: @@ -689,12 +706,24 @@ def parse_to_md_upload( Exception: API调用失败 Example: + >>> # 默认使用扩展名识别(推荐) >>> result = doc_manager.parse_to_md_upload( - ... file_path="document.pdf", - ... config={"layout_recognize": "mineru"} + ... file_path="document.pdf" + ... ) + >>> print(result['markdown']) + >>> + >>> # 对于无扩展名文件,input_type='auto' 会自动从二进制内容识别 + >>> result = doc_manager.parse_to_md_upload( + ... file_path="document_no_ext" + ... # input_type='auto' 是默认值,可以省略 ... ) >>> print(result['markdown']) - >>> print(f"Parsed {result['total_images']} images") + >>> + >>> # 显式指定文件类型(跳过自动识别) + >>> result = doc_manager.parse_to_md_upload( + ... file_path="document", + ... input_type="pdf" + ... 
) """ path = Path(file_path) if not path.exists(): @@ -704,13 +733,14 @@ def parse_to_md_upload( with open(path, "rb") as f: file_binary = f.read() - return self._parse_to_md_with_binary(file_binary, path.name, config) + return self._parse_to_md_with_binary(file_binary, path.name, config, input_type) def parse_to_md_binary( self, file_binary: bytes, filename: str, config: Optional[Dict[str, Any]] = None, + input_type: str = 'auto', ) -> Dict[str, Any]: """ 直接使用文件二进制内容解析为Markdown(不切分) @@ -726,13 +756,16 @@ def parse_to_md_binary( Args: file_binary: 文件的二进制内容 - filename: 文件名(必须包含正确的扩展名) + filename: 文件名 config: 解析配置(可选),同 parse_to_md - layout_recognize: 布局识别引擎 (mineru 或 dots_ocr,默认 mineru) - enable_formula: 是否识别公式 (默认 False) - enable_table: 是否识别表格 (默认 True) - from_page: 起始页(仅 PDF,默认 0) - to_page: 结束页(仅 PDF,默认 100000) + input_type: 文件类型识别模式(默认: 'auto'),支持: + - 'auto': 优先使用文件扩展名,无扩展名或不支持时自动识别(默认) + - 'pdf', 'office', 'html', 'image': 显式指定文件类型(跳过识别) Returns: 解析结果字典,包含以下字段: @@ -749,13 +782,27 @@ def parse_to_md_binary( Example: >>> with open("document.pdf", "rb") as f: ... file_binary = f.read() + >>> # 默认使用扩展名识别(推荐) >>> result = doc_manager.parse_to_md_binary( ... file_binary=file_binary, - ... filename="document.pdf", - ... config={"layout_recognize": "mineru", "enable_ocr": True} + ... filename="document.pdf" ... ) >>> print(result['markdown']) - >>> print(f"Parsed {result['total_images']} images") + >>> + >>> # 对于无扩展名的二进制数据,input_type='auto' 会自动识别 + >>> result = doc_manager.parse_to_md_binary( + ... file_binary=file_binary, + ... filename="document" # 无扩展名 + ... # input_type='auto' 是默认值 + ... ) + >>> print(result['markdown']) + >>> + >>> # 显式指定文件类型(跳过自动识别) + >>> result = doc_manager.parse_to_md_binary( + ... file_binary=file_binary, + ... filename="document", + ... input_type="pdf" + ... 
) """ if not file_binary: raise ValueError("file_binary cannot be empty") @@ -763,7 +810,7 @@ def parse_to_md_binary( raise ValueError("filename cannot be empty") # Delegate to helper method - return self._parse_to_md_with_binary(file_binary, filename, config) + return self._parse_to_md_with_binary(file_binary, filename, config, input_type) def parse_url( self, diff --git a/powerrag/sdk/tests/.env.example b/powerrag/sdk/tests/.env.example new file mode 100644 index 000000000..63695f577 --- /dev/null +++ b/powerrag/sdk/tests/.env.example @@ -0,0 +1,8 @@ +# PowerRAG SDK Test Configuration +# Copy this file to .env and update with your actual values + +# PowerRAG API服务地址 +HOST_ADDRESS=http://127.0.0.1:9390 + +# PowerRAG API密钥(从PowerRAG系统管理-API密钥页面获取) +POWERRAG_API_KEY=your-api-key-here diff --git a/powerrag/sdk/tests/conftest.py b/powerrag/sdk/tests/conftest.py index 78075d876..e9a694986 100644 --- a/powerrag/sdk/tests/conftest.py +++ b/powerrag/sdk/tests/conftest.py @@ -18,12 +18,20 @@ import time import pytest from pathlib import Path - from powerrag.sdk import PowerRAGClient +from dotenv import load_dotenv +load_dotenv() # 从环境变量获取配置 -HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9222") -API_KEY = os.getenv("POWERRAG_API_KEY", "ragflow-MAln1FNDn9PhIcqv1axaaUT3mM-efUZ83O5LVcroe9E") +HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9390") +API_KEY = os.getenv("POWERRAG_API_KEY") + +if not API_KEY: + raise ValueError( + "POWERRAG_API_KEY environment variable is not set. " + "Please set it in your .env file or system environment. " + "Copy .env.example to .env and update with your API key." 
+ ) @pytest.fixture(scope="session") diff --git a/powerrag/sdk/tests/pytest.ini b/powerrag/sdk/tests/pytest.ini index 029953c9e..42140ddc4 100644 --- a/powerrag/sdk/tests/pytest.ini +++ b/powerrag/sdk/tests/pytest.ini @@ -18,3 +18,14 @@ # pytest 配置文件 # 注意:环境变量需要在运行 pytest 之前设置,或者通过 conftest.py 设置 +# 测试超时设置(秒) +timeout = 300 + +# 显示详细信息 +addopts = -v --tb=short + +# 测试文件匹配模式 +python_files = test_*.py +python_classes = Test* +python_functions = test_* + diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py index fad327cd4..90e811c54 100644 --- a/powerrag/sdk/tests/test_document.py +++ b/powerrag/sdk/tests/test_document.py @@ -588,5 +588,182 @@ def test_parse_to_md_binary_utf8_content(self, client: PowerRAGClient, tmp_path) filename="utf8_test.html" ) + assert "markdown" in result + assert result["markdown_length"] > 0 + + +class TestDocumentInputTypeAutoDetection: + """测试 input_type 自动检测功能""" + + def test_auto_detection_with_valid_extension(self, client: PowerRAGClient, tmp_path): + """测试有有效扩展名时,input_type='auto' 优先使用扩展名""" + # 创建一个 HTML 文件 + html_file = tmp_path / "test.html" + html_content = "

Test

Content

" + html_file.write_text(html_content) + + with open(html_file, "rb") as f: + file_binary = f.read() + + # input_type='auto' 是默认值,会优先使用 .html 扩展名 + result = client.document.parse_to_md_binary( + file_binary=file_binary, + filename="test.html" + # input_type='auto' 是默认值,可以省略 + ) + + assert "markdown" in result + assert result["filename"] == "test.html" + assert result["markdown_length"] > 0 + + def test_auto_detection_without_extension_pdf(self, client: PowerRAGClient, tmp_path): + """测试无扩展名 PDF 文件,input_type='auto' 会自动从二进制检测""" + # 创建一个简单的 PDF 文件头(实际测试可能需要真实的 PDF) + # 这里我们创建一个有 PDF 魔术数的文件 + pdf_header = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n" + pdf_content = pdf_header + b"1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n" + + # 使用没有扩展名的文件名 + result = client.document.parse_to_md_binary( + file_binary=pdf_content, + filename="document_no_extension" + # input_type='auto' 会从二进制内容检测出 PDF + ) + + # 注意:这个测试可能会因为 PDF 内容不完整而失败 + # 实际环境中需要使用真实的 PDF 文件 + assert "filename" in result + assert result["filename"] == "document_no_extension" + + def test_auto_detection_without_extension_html(self, client: PowerRAGClient): + """测试无扩展名 HTML 文件,input_type='auto' 会自动从二进制检测""" + html_content = b"

Test Title

Test content

" + + # 使用没有扩展名的文件名 + result = client.document.parse_to_md_binary( + file_binary=html_content, + filename="document_without_ext" + # input_type='auto' 会从二进制内容检测出 HTML + ) + + assert "markdown" in result + assert result["filename"] == "document_without_ext" + assert result["markdown_length"] > 0 + + def test_explicit_input_type_pdf(self, client: PowerRAGClient, tmp_path): + """测试显式指定 input_type='pdf'""" + # 创建一个简单的 PDF 内容 + pdf_header = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n" + pdf_content = pdf_header + b"1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n" + + # 显式指定为 PDF 类型,即使文件名没有扩展名 + result = client.document.parse_to_md_binary( + file_binary=pdf_content, + filename="document", + input_type="pdf" # 显式指定类型 + ) + + assert "filename" in result + + def test_explicit_input_type_html(self, client: PowerRAGClient): + """测试显式指定 input_type='html'""" + html_content = b"

Title

Paragraph

" + + # 显式指定为 HTML 类型 + result = client.document.parse_to_md_binary( + file_binary=html_content, + filename="document", + input_type="html" # 显式指定类型 + ) + + assert "markdown" in result + assert result["markdown_length"] > 0 + + def test_parse_to_md_upload_with_auto_detection(self, client: PowerRAGClient, tmp_path): + """测试 parse_to_md_upload 方法的自动检测功能""" + # 创建一个测试文件 + html_file = tmp_path / "test_auto.html" + html_content = "

Auto Detection Test

" + html_file.write_text(html_content) + + # 使用默认的 input_type='auto' + result = client.document.parse_to_md_upload(str(html_file)) + + assert "markdown" in result + assert "filename" in result + assert result["markdown_length"] > 0 + + def test_parse_to_md_upload_with_explicit_type(self, client: PowerRAGClient, tmp_path): + """测试 parse_to_md_upload 显式指定类型""" + html_file = tmp_path / "test_explicit.html" + html_content = "

Explicit Type Test

" + html_file.write_text(html_content) + + # 显式指定类型 + result = client.document.parse_to_md_upload( + str(html_file), + input_type="html" + ) + + assert "markdown" in result + assert result["markdown_length"] > 0 + + def test_auto_detection_priority_extension_over_binary(self, client: PowerRAGClient, tmp_path): + """测试 input_type='auto' 优先使用扩展名而非二进制检测""" + # 创建一个 HTML 文件 + html_file = tmp_path / "priority_test.html" + html_content = "

Priority Test

Extension should be used first

" + html_file.write_text(html_content) + + with open(html_file, "rb") as f: + file_binary = f.read() + + # 文件名有 .html 扩展名,应该优先使用扩展名识别 + result = client.document.parse_to_md_binary( + file_binary=file_binary, + filename="priority_test.html" + # input_type='auto' 默认值 + ) + + assert "markdown" in result + assert result["filename"] == "priority_test.html" + # 验证确实解析成功(说明使用了正确的类型) + assert "Priority Test" in result["markdown"] or result["markdown_length"] > 0 + + def test_auto_detection_fallback_to_binary(self, client: PowerRAGClient): + """测试扩展名不支持时,fallback 到二进制检测""" + html_content = b"

Fallback Test

" + + # 使用一个不支持的扩展名 + result = client.document.parse_to_md_binary( + file_binary=html_content, + filename="document.unknown_ext" + # input_type='auto' 会先尝试 .unknown_ext(失败),然后从二进制检测 + ) + + # 应该能够通过二进制检测识别为 HTML + assert "markdown" in result + assert result["markdown_length"] > 0 + + def test_config_with_input_type(self, client: PowerRAGClient, tmp_path): + """测试 config 中包含 input_type 参数""" + html_file = tmp_path / "config_test.html" + html_content = "

Config Test

" + html_file.write_text(html_content) + + with open(html_file, "rb") as f: + file_binary = f.read() + + # 同时使用 config 和 input_type + result = client.document.parse_to_md_binary( + file_binary=file_binary, + filename="config_test.html", + config={ + "layout_recognize": "mineru", + "enable_table": True + }, + input_type="html" + ) + assert "markdown" in result assert result["markdown_length"] > 0 \ No newline at end of file diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py index 4cc881810..b1a63674b 100644 --- a/powerrag/server/routes/powerrag_routes.py +++ b/powerrag/server/routes/powerrag_routes.py @@ -980,6 +980,9 @@ async def parse_to_md_upload(tenant_id): - enable_table (bool): Enable table recognition (default: true) - from_page (int): Start page number (default: 0) - to_page (int): End page number (default: 100000) + - input_type (str): File type detection mode (default: 'auto'). Options: + * 'auto': Try filename extension first, then auto-detect from binary if no extension (default) + * 'pdf', 'office', 'html', 'image': Explicit file type (bypass detection) Response JSON: { @@ -1042,34 +1045,61 @@ async def parse_to_md_upload(tenant_id): # Add filename to config config['filename'] = filename - # Determine format type - from pathlib import Path - file_ext = Path(filename).suffix.lstrip('.').lower() - - logger.info(f"Parsed filename: {filename}, extension: '{file_ext}'") - - if not file_ext: - return jsonify({ - "code": 400, - "message": f"File must have an extension. 
Filename: '{filename}', parsed extension: '{file_ext}'" - }), 400 + # Get input_type parameter (default: 'auto') + input_type = config.get('input_type', 'auto') - # Supported: PDF, Office (doc/docx/ppt/pptx), HTML, Markdown, Images (jpg/png) - format_type_map = { - 'pdf': 'pdf', - 'docx': 'office', 'doc': 'office', - 'xlsx': 'office', 'xls': 'office', - 'pptx': 'office', 'ppt': 'office', - 'html': 'html', 'htm': 'html', - 'jpg': 'image', 'jpeg': 'image', - 'png': 'image' - } - - format_type = format_type_map.get(file_ext) - if not format_type: + # Determine format type based on input_type + if input_type == 'auto': + # Auto mode: Try extension first, then binary detection + from pathlib import Path + file_ext = Path(filename).suffix.lstrip('.').lower() + + if file_ext: + # Has extension, try to use it + format_type_map = { + 'pdf': 'pdf', + 'docx': 'office', 'doc': 'office', + 'xlsx': 'office', 'xls': 'office', + 'pptx': 'office', 'ppt': 'office', + 'html': 'html', 'htm': 'html', + 'jpg': 'image', 'jpeg': 'image', + 'png': 'image' + } + format_type = format_type_map.get(file_ext) + + if format_type: + # Valid extension found + logger.info(f"Using filename extension: {format_type} (.{file_ext}) for file: {filename}") + else: + # Unsupported extension, try auto-detect from binary + from powerrag.utils.file_utils import detect_file_type + format_type = detect_file_type(binary) + logger.info(f"Extension '{file_ext}' not supported, auto-detected from binary: {format_type} for file: {filename}") + + if format_type == 'unknown': + return jsonify({ + "code": 400, + "message": f"Unsupported file extension: {file_ext}. Supported formats: pdf, doc, docx, ppt, pptx, jpg, png, html. Binary auto-detection also failed." 
+ }), 400 + else: + # No extension, auto-detect from binary content + from powerrag.utils.file_utils import detect_file_type + format_type = detect_file_type(binary) + logger.info(f"No extension found, auto-detected file type from binary: {format_type} for file: {filename}") + + if format_type == 'unknown': + return jsonify({ + "code": 400, + "message": f"Unable to determine file type for {filename}. File has no extension and binary auto-detection failed. Please provide a file with a valid extension or specify input_type explicitly." + }), 400 + elif input_type in ['pdf', 'office', 'html', 'image']: + # Use explicitly specified input_type + format_type = input_type + logger.info(f"Using explicit input_type: {format_type} for file: {filename}") + else: return jsonify({ "code": 400, - "message": f"Unsupported file format: {file_ext}. Supported formats: pdf, doc, docx, ppt, pptx, jpg, png, html" + "message": f"Invalid input_type: {input_type}. Must be 'auto' (default), 'pdf', 'office', 'html', or 'image'." 
}), 400 # Create service and parse diff --git a/powerrag/server/services/parse_service.py b/powerrag/server/services/parse_service.py index 8e4f96abb..1c4d24f09 100644 --- a/powerrag/server/services/parse_service.py +++ b/powerrag/server/services/parse_service.py @@ -140,7 +140,7 @@ def parse_document(self, doc_id: str) -> Dict[str, Any]: raise def parse_file_binary(self, binary: bytes, filename: str, - config: Dict[str, Any] = None) -> Dict[str, Any]: + config: Dict[str, Any] = None, input_type: str = 'auto') -> Dict[str, Any]: """ Parse file binary directly (without doc_id) and return markdown + images @@ -150,9 +150,11 @@ def parse_file_binary(self, binary: bytes, filename: str, Args: binary: File binary data - filename: Original filename (used to detect format) - parser_id: Parser ID (e.g., "title", "naive", "paper") + filename: Original filename (used to detect format if possible) config: Parser configuration + input_type: File type detection mode (default: 'auto'). Can be: + - 'auto': Try filename extension first, then auto-detect from binary if no extension (default) + - 'pdf', 'office', 'html', 'image', 'markdown': Explicit file type (bypass detection) Returns: Dict containing markdown content and images @@ -164,22 +166,43 @@ def parse_file_binary(self, binary: bytes, filename: str, } """ try: - # Check if format is supported - file_ext = Path(filename).suffix.lstrip('.').lower() - if file_ext not in self.SUPPORTED_FORMATS: + # Determine format type based on input_type parameter + if input_type == 'auto': + # Auto mode: Try extension first, then binary detection + file_ext = Path(filename).suffix.lstrip('.').lower() + + if file_ext and file_ext in self.SUPPORTED_FORMATS: + # Has valid extension, use it + format_type = self.SUPPORTED_FORMATS[file_ext] + logger.info(f"Using filename extension for file type: {format_type} (.{file_ext}) for file: {filename}") + else: + # No extension or unsupported extension, auto-detect from binary + from 
powerrag.utils.file_utils import detect_file_type + format_type = detect_file_type(binary) + logger.info(f"Auto-detected file type from binary: {format_type} for file: {filename}") + + if format_type == 'unknown': + raise ValueError( + f"Unable to determine file type for {filename}. " + f"File has no extension or unsupported extension '{file_ext}', and binary auto-detection failed. " + f"Please provide a valid input_type explicitly." + ) + elif input_type in ['pdf', 'office', 'html', 'image', 'markdown']: + # Use explicitly specified input_type + format_type = input_type + logger.info(f"Using explicit input_type: {format_type} for file: {filename}") + else: raise ValueError( - f"Unsupported format: .{file_ext}. " - f"PowerRAG only supports: {', '.join(sorted(set(self.SUPPORTED_FORMATS.values())))}" + f"Invalid input_type: {input_type}. " + f"Must be 'auto', 'pdf', 'office', 'html', 'image', or 'markdown'" ) - format_type = self.SUPPORTED_FORMATS[file_ext] - # Parse document to get markdown and images md_content, images = self._parse_to_markdown(filename, binary, format_type, config) return { "filename": filename, - "file_format": file_ext, + "file_format": Path(filename).suffix.lstrip('.').lower() if filename else 'unknown', "format_type": format_type, "markdown": md_content, "images": images, @@ -636,15 +659,24 @@ def _parse_to_markdown_for_task(self, doc_id: str = None, filename: str = None, logger.error(f"Failed to get binary for doc {doc_id}: {e}", exc_info=True) raise ValueError(f"Failed to retrieve document binary: {e}") - # Determine format - file_ext = Path(doc.name).suffix.lstrip('.').lower() - format_type_map = { - 'pdf': 'pdf', 'docx': 'office', 'doc': 'office', - 'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office', - 'html': 'html', 'htm': 'html', - 'jpg': 'image', 'jpeg': 'image', 'png': 'image' - } - format_type = format_type_map.get(file_ext, 'pdf') + # Determine format from config or filename + input_type = config.get('input_type') 
+ if input_type == 'auto': + from powerrag.utils.file_utils import detect_file_type + format_type = detect_file_type(binary) + logger.info(f"Auto-detected file type: {format_type} for document {doc_id}") + elif input_type: + format_type = input_type + else: + # Auto-detect from file extension + file_ext = Path(doc.name).suffix.lstrip('.').lower() + format_type_map = { + 'pdf': 'pdf', 'docx': 'office', 'doc': 'office', + 'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office', + 'html': 'html', 'htm': 'html', + 'jpg': 'image', 'jpeg': 'image', 'png': 'image' + } + format_type = format_type_map.get(file_ext, 'pdf') filename = doc.name # Case 2: Parse from direct binary (filename, binary, format_type provided) diff --git a/powerrag/utils/file_utils.py b/powerrag/utils/file_utils.py index f4a58bba9..bf6d7b084 100644 --- a/powerrag/utils/file_utils.py +++ b/powerrag/utils/file_utils.py @@ -313,6 +313,71 @@ def _guess_ext(b: bytes) -> str: return ".doc" return ".bin" + +def detect_file_type(binary: bytes) -> str: + """ + Detect file type from binary data using magic numbers. 
+ + Returns the file type as a string compatible with PowerRAG format types: + - 'pdf': PDF files + - 'office': Office documents (doc, docx, xls, xlsx, ppt, pptx) + - 'html': HTML files + - 'image': Image files (jpg, jpeg, png) + - 'unknown': Unable to determine file type + + Args: + binary: File binary data + + Returns: + File type string ('pdf', 'office', 'html', 'image', 'unknown') + """ + if not binary or len(binary) < 8: + return 'unknown' + + head = binary[:8] + + # Check PDF + if _is_pdf(head): + return 'pdf' + + # Check ZIP-based Office formats (docx, xlsx, pptx) + if _is_zip(head): + try: + with zipfile.ZipFile(io.BytesIO(binary), "r") as z: + names = [n.lower() for n in z.namelist()] + if any(n.startswith("word/") for n in names): + return 'office' # docx + if any(n.startswith("ppt/") for n in names): + return 'office' # pptx + if any(n.startswith("xl/") for n in names): + return 'office' # xlsx + except Exception: + pass + + # Check OLE-based Office formats (doc, xls, ppt) + if _is_ole(head): + return 'office' + + # Check common image formats + # JPEG: FF D8 FF + if head.startswith(b"\xFF\xD8\xFF"): + return 'image' + + # PNG: 89 50 4E 47 0D 0A 1A 0A + if head.startswith(b"\x89PNG\r\n\x1a\n"): + return 'image' + + # Check HTML (basic detection) + # Try to decode as text and check for HTML markers + try: + text_sample = binary[:1024].decode('utf-8', errors='ignore').lower() + if ' bytes: try: From 045ad0af371552f85409ab5973c19c58dd86feae Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Fri, 23 Jan 2026 15:19:12 +0800 Subject: [PATCH 13/19] feat(document): add support for file_url to parse documents from URL --- powerrag/sdk/README.md | 229 +++++++++++++++++++++- powerrag/sdk/tests/test_document.py | 197 ++++++++++++++++++- powerrag/server/routes/powerrag_routes.py | 93 ++++++--- 3 files changed, 492 insertions(+), 27 deletions(-) diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md index d2c470b87..410d07f2c 100644 --- a/powerrag/sdk/README.md +++ 
b/powerrag/sdk/README.md @@ -119,11 +119,49 @@ print(f"Status: {status['status']}") # 等待任务完成 result = client.document.wait_for_parse_to_md(task_id, timeout=300) -# 直接上传文件并解析为 Markdown +# 直接上传文件并解析为 Markdown(无需知识库) result = client.document.parse_to_md_upload( "document.pdf", config={"layout_recognize": "mineru"} ) + +# 从 URL 下载并解析(直接调用 API) +import requests +import json + +response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={ + "file_url": "https://example.com/document.pdf", + "config": json.dumps({"layout_recognize": "mineru"}) + } +) +result = response.json() + +# 使用二进制数据解析(支持无扩展名文件) +with open("document.pdf", "rb") as f: + binary_data = f.read() + +result = client.document.parse_to_md_binary( + file_binary=binary_data, + filename="document.pdf", # 有扩展名时自动识别 + config={"layout_recognize": "mineru"} +) + +# 对于无扩展名的文件,使用 input_type='auto' 自动识别 +result = client.document.parse_to_md_binary( + file_binary=binary_data, + filename="document_no_extension", # 无扩展名 + # input_type='auto' 是默认值,会自动从二进制内容检测文件类型 +) + +# 或显式指定文件类型 +result = client.document.parse_to_md_binary( + file_binary=binary_data, + filename="document", + input_type="pdf" # 强制作为 PDF 处理 +) ``` ### 检索 @@ -156,7 +194,7 @@ PowerRAG SDK 提供了强大的文档解析为 Markdown 的功能,支持多种 - 图片 (.jpg, .png) - HTML (.html, .htm) -**三种使用方式:** +**四种使用方式:** 1. **同步解析**(适合小文档): ```python @@ -173,7 +211,38 @@ result = client.document.wait_for_parse_to_md(task_id, timeout=300) 3. **直接上传解析**(无需知识库): ```python +# 上传本地文件 result = client.document.parse_to_md_upload("file.pdf", config={...}) + +# 或使用 file_url 参数从URL下载(通过 config 传入) +import requests +response = requests.post( + "http://localhost:9390/api/v1/powerrag/parse_to_md/upload", + headers={"Authorization": "Bearer YOUR_API_KEY"}, + data={ + "file_url": "https://example.com/document.pdf", + "config": json.dumps({"layout_recognize": "mineru"}) + } +) +``` + +4. 
**二进制数据解析**(支持无扩展名文件): +```python +with open("document.pdf", "rb") as f: + binary_data = f.read() + +# 自动识别文件类型(默认) +result = client.document.parse_to_md_binary( + file_binary=binary_data, + filename="document_no_extension" # 无扩展名也可以 +) + +# 或显式指定类型 +result = client.document.parse_to_md_binary( + file_binary=binary_data, + filename="document", + input_type="pdf" # 'pdf', 'office', 'html', 'image' +) ``` **配置选项:** @@ -182,6 +251,9 @@ result = client.document.parse_to_md_upload("file.pdf", config={...}) - `enable_formula`: 是否识别公式 - `enable_table`: 是否识别表格 - `from_page`/`to_page`: PDF 页面范围 +- `input_type`: 文件类型识别模式(默认: `'auto'`) + - `'auto'`: 优先使用文件扩展名,无扩展名时自动检测(推荐) + - `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定文件类型 ### 结构化信息抽取 @@ -339,6 +411,62 @@ result = client.document.wait_for_parse_to_md(task_id, timeout=300) # 上传并解析为 Markdown(无需知识库) result = client.document.parse_to_md_upload("file.pdf", config={...}) +# 使用 file_url 参数从 URL 下载并解析(直接调用 API) +import requests +import json + +response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={ + "file_url": "https://example.com/document.pdf", + "config": json.dumps({ + "layout_recognize": "mineru", + "input_type": "auto" # 可选,自动检测文件类型 + }) + } +) +result = response.json() + +# 使用 file_url 并指定文件名 +response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={ + "file_url": "https://example.com/download?id=123", + "config": json.dumps({ + "filename": "report.pdf", # 自定义文件名 + "input_type": "pdf", + "layout_recognize": "mineru" + }) + } +) + +# 使用二进制数据解析为 Markdown +with open("document.pdf", "rb") as f: + binary_data = f.read() + +result = client.document.parse_to_md_binary( + file_binary=binary_data, + filename="document.pdf", + config={"layout_recognize": "mineru"}, + input_type="auto" # 默认值,自动识别文件类型 +) + +# 无扩展名文件解析(自动检测文件类型) +result = 
client.document.parse_to_md_binary( + file_binary=binary_data, + filename="document_no_extension", # 无扩展名 + # input_type='auto' 会从二进制内容自动检测 PDF/Office/HTML 等 +) + +# 显式指定文件类型(跳过自动检测) +result = client.document.parse_to_md_binary( + file_binary=binary_data, + filename="document", + input_type="pdf" # 强制作为 PDF 处理 +) + # 解析URL文档(同步等待) doc = client.document.parse_url( kb_id, @@ -1043,6 +1171,103 @@ for result in results: # 重新解析或删除 ``` +### Q: 如何解析无扩展名的文件? + +A: 使用 `parse_to_md_binary` 方法并使用 `input_type='auto'`(默认值): +```python +with open("document_no_extension", "rb") as f: + binary_data = f.read() + +# input_type='auto' 会自动从二进制内容检测文件类型 +result = client.document.parse_to_md_binary( + file_binary=binary_data, + filename="document_no_extension" + # input_type='auto' 是默认值,可以省略 +) +``` + +支持的 `input_type` 值: +- `'auto'` (默认): 优先使用文件扩展名,无扩展名或不支持时从二进制内容自动检测 +- `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定文件类型 + +### Q: 如何从URL直接解析文件? + +A: 在 `/parse_to_md/upload` API 请求中使用 `file_url` 参数,服务器会自动下载并解析: + +**方式 1:基本用法** +```python +import requests +import json + +# 使用 file_url 参数 +response = requests.post( + "http://localhost:9390/api/v1/powerrag/parse_to_md/upload", + headers={"Authorization": "Bearer YOUR_API_KEY"}, + data={ + "file_url": "https://example.com/document.pdf", + "config": json.dumps({ + "layout_recognize": "mineru", + "input_type": "auto" # 可选,默认为 'auto' + }) + } +) +result = response.json() +print(result['data']['markdown']) +``` + +**方式 2:指定文件名(适用于无扩展名URL)** +```python +response = requests.post( + "http://localhost:9390/api/v1/powerrag/parse_to_md/upload", + headers={"Authorization": "Bearer YOUR_API_KEY"}, + data={ + "file_url": "https://api.example.com/download?id=123", + "config": json.dumps({ + "filename": "report.pdf", # 覆盖文件名 + "input_type": "pdf", # 显式指定类型 + "layout_recognize": "mineru", + "enable_table": True + }) + } +) +``` + +**方式 3:与 SDK 客户端结合使用** +```python +from powerrag.sdk import PowerRAGClient +import requests +import json + +client = 
PowerRAGClient(api_key="your-api-key", base_url="http://localhost:9390") + +# 使用客户端的 api_url 和认证信息 +response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={ + "file_url": "https://example.com/document.pdf", + "config": json.dumps({"layout_recognize": "mineru"}) + } +) +``` + +**配置参数说明:** +- `file_url` (str): 文件的 URL 地址(与 `file` 参数二选一) +- `config.filename` (str): 自定义文件名(可选,不提供则从 URL 提取) +- `config.input_type` (str): 文件类型检测模式 + - `'auto'` (默认): 优先使用扩展名,无扩展名时自动检测 + - `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定类型 +- `config.layout_recognize` (str): 布局识别引擎(`'mineru'` 或 `'dots_ocr'`) +- 其他解析配置参数同 `parse_to_md` 方法 + +**注意事项:** +- ✅ URL 必须可公开访问,不支持需要认证的 URL +- ✅ 下载超时时间为 60 秒 +- ✅ 支持所有文件格式(PDF, Office, HTML, 图片) +- ✅ 自动从 URL 路径提取文件名,或使用 `config.filename` 覆盖 +- ❌ 不能同时提供 `file` 和 `file_url` 参数 +- ❌ 必须提供 `file` 或 `file_url` 其中之一 + ### Q: SDK 是否支持流式返回? A: 当前版本主要支持标准 REST API 调用。对于下载等操作,SDK 内部使用了流式传输。 diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py index 90e811c54..d882c4cda 100644 --- a/powerrag/sdk/tests/test_document.py +++ b/powerrag/sdk/tests/test_document.py @@ -766,4 +766,199 @@ def test_config_with_input_type(self, client: PowerRAGClient, tmp_path): ) assert "markdown" in result - assert result["markdown_length"] > 0 \ No newline at end of file + assert result["markdown_length"] > 0 + + +class TestDocumentFileUrl: + """测试 file_url 参数功能""" + + def test_parse_from_url_basic(self, client: PowerRAGClient): + """测试从URL下载并解析文件(基本功能)""" + import requests + import json + + # 使用一个公开可访问的示例 HTML URL + file_url = "https://httpbin.org/html" + + # 直接调用 API(因为 SDK 方法已被删除) + response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={ + "file_url": file_url, + "config": json.dumps({"input_type": "html"}) + } + ) + + assert response.status_code == 200 + result = response.json() + 
assert result["code"] == 0 + assert "markdown" in result["data"] + assert result["data"]["markdown_length"] > 0 + + def test_parse_from_url_with_filename(self, client: PowerRAGClient): + """测试从URL下载并指定文件名""" + import requests + import json + + file_url = "https://httpbin.org/html" + custom_filename = "custom_document.html" + + response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={ + "file_url": file_url, + "config": json.dumps({ + "filename": custom_filename, + "input_type": "html" + }) + } + ) + + assert response.status_code == 200 + result = response.json() + assert result["code"] == 0 + assert result["data"]["filename"] == custom_filename + + def test_parse_from_url_with_auto_detection(self, client: PowerRAGClient): + """测试从URL下载,使用 input_type='auto' 自动检测""" + import requests + import json + + file_url = "https://httpbin.org/html" + + response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={ + "file_url": file_url, + "config": json.dumps({ + "input_type": "auto" # 自动检测 + }) + } + ) + + assert response.status_code == 200 + result = response.json() + assert result["code"] == 0 + assert "markdown" in result["data"] + + def test_parse_from_invalid_url(self, client: PowerRAGClient): + """测试无效URL应返回错误""" + import requests + import json + + invalid_url = "https://invalid-url-that-does-not-exist-12345.com/file.pdf" + + response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={ + "file_url": invalid_url, + "config": json.dumps({}) + } + ) + + # 应该返回 400 错误 + assert response.status_code == 400 + result = response.json() + assert result["code"] == 400 + assert "Failed to download" in result["message"] + + def test_parse_cannot_provide_both_file_and_url(self, client: PowerRAGClient, tmp_path): + """测试不能同时提供 file 和 
file_url""" + import requests + + # 创建临时文件 + html_file = tmp_path / "test.html" + html_file.write_text("Test") + + file_url = "https://httpbin.org/html" + + # 同时提供 file 和 file_url + with open(html_file, "rb") as f: + response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + files={"file": ("test.html", f, "text/html")}, + data={ + "file_url": file_url, + "config": "{}" + } + ) + + # 应该返回 400 错误 + assert response.status_code == 400 + result = response.json() + assert result["code"] == 400 + assert "Cannot provide both" in result["message"] + + def test_parse_must_provide_file_or_url(self, client: PowerRAGClient): + """测试必须提供 file 或 file_url 其中之一""" + import requests + + # 不提供 file 也不提供 file_url + response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={"config": "{}"} + ) + + # 应该返回 400 错误 + assert response.status_code == 400 + result = response.json() + assert result["code"] == 400 + assert "Either 'file' or 'file_url' must be provided" in result["message"] + + def test_parse_from_url_with_config(self, client: PowerRAGClient): + """测试从URL下载并使用完整配置""" + import requests + import json + + file_url = "https://httpbin.org/html" + + response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={ + "file_url": file_url, + "config": json.dumps({ + "filename": "complete_config.html", + "input_type": "html", + "layout_recognize": "mineru", + "enable_table": True + }) + } + ) + + assert response.status_code == 200 + result = response.json() + assert result["code"] == 0 + assert result["data"]["filename"] == "complete_config.html" + assert "markdown" in result["data"] + + def test_parse_from_url_empty_file(self, client: PowerRAGClient): + """测试从URL下载空文件应返回错误""" + import requests + import json + + # 使用一个返回空内容的URL(如果存在) + # 注意:这个测试可能需要 
mock,这里使用真实场景 + # httpbin.org/bytes/0 返回 0 字节 + empty_url = "https://httpbin.org/bytes/0" + + response = requests.post( + f"{client.api_url}/powerrag/parse_to_md/upload", + headers={"Authorization": f"Bearer {client.api_key}"}, + data={ + "file_url": empty_url, + "config": json.dumps({"filename": "empty.bin"}) + } + ) + + # 应该返回 400 错误 + assert response.status_code == 400 + result = response.json() + assert result["code"] == 400 + assert "empty" in result["message"].lower() \ No newline at end of file diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py index b1a63674b..6a3dd6b1d 100644 --- a/powerrag/server/routes/powerrag_routes.py +++ b/powerrag/server/routes/powerrag_routes.py @@ -960,6 +960,7 @@ async def parse_to_md_upload(tenant_id): Parse uploaded file to Markdown WITHOUT chunking 直接上传文件并解析为 Markdown,不进行切分。 + 支持两种方式:1) 直接上传文件 2) 提供文件URL 支持的文件格式: - PDF (.pdf) @@ -970,7 +971,8 @@ async def parse_to_md_upload(tenant_id): Authentication: Requires RAGFlow API key in Authorization header (Bearer token) Request (multipart/form-data): - - file: File to parse (required) - supports PDF, Office (doc/docx/ppt/pptx), Images (jpg/png), HTML + - file: File to parse (optional, required if file_url not provided) + - file_url: URL of file to download and parse (optional, required if file not provided) - config: JSON string of parser config (optional) Config parameters: @@ -983,6 +985,7 @@ async def parse_to_md_upload(tenant_id): - input_type (str): File type detection mode (default: 'auto'). 
Options: * 'auto': Try filename extension first, then auto-detect from binary if no extension (default) * 'pdf', 'office', 'html', 'image': Explicit file type (bypass detection) + - filename (str): Override filename (optional, useful with file_url) Response JSON: { @@ -998,21 +1001,6 @@ async def parse_to_md_upload(tenant_id): } """ try: - # Check if file is present - files = await request.files - if 'file' not in files: - return jsonify({ - "code": 400, - "message": "No file provided" - }), 400 - - file = files['file'] - if file.filename == '': - return jsonify({ - "code": 400, - "message": "No file selected" - }), 400 - # Parse config from JSON string if provided import json form = await request.form @@ -1025,23 +1013,80 @@ async def parse_to_md_upload(tenant_id): "message": "Invalid JSON in config parameter" }), 400 - # Read file binary - filename = file.filename - logger.info(f"Received file upload: filename={filename}, file object={file}") + # Get file_url parameter + file_url = form.get('file_url') + + # Check if file or file_url is provided + files = await request.files + has_file = 'file' in files and files['file'].filename != '' + has_url = file_url and file_url.strip() != '' - if not filename: + if not has_file and not has_url: return jsonify({ "code": 400, - "message": "Filename is required" + "message": "Either 'file' or 'file_url' must be provided" }), 400 - binary = file.read() - if not binary: + if has_file and has_url: return jsonify({ "code": 400, - "message": "File is empty" + "message": "Cannot provide both 'file' and 'file_url'. Please choose one." 
}), 400 + # Handle file upload or URL download + if has_file: + # Direct file upload + file = files['file'] + filename = file.filename + logger.info(f"Received file upload: filename={filename}") + + if not filename: + return jsonify({ + "code": 400, + "message": "Filename is required" + }), 400 + + binary = file.read() + if not binary: + return jsonify({ + "code": 400, + "message": "File is empty" + }), 400 + else: + # Download from URL + import requests + from urllib.parse import urlparse + from pathlib import Path + + logger.info(f"Downloading file from URL: {file_url}") + + try: + response = requests.get(file_url, timeout=60) + response.raise_for_status() + binary = response.content + except requests.exceptions.RequestException as e: + logger.error(f"Failed to download file from URL: {file_url}. Error: {e}") + return jsonify({ + "code": 400, + "message": f"Failed to download file from URL: {str(e)}" + }), 400 + + if not binary: + return jsonify({ + "code": 400, + "message": "Downloaded file is empty" + }), 400 + + # Extract filename from URL or use override from config + filename = config.get('filename') + if not filename: + parsed_url = urlparse(file_url) + filename = Path(parsed_url.path).name + if not filename: + filename = "downloaded_file" + + logger.info(f"Downloaded file: {filename}, size: {len(binary)} bytes") + # Add filename to config config['filename'] = filename From 5cd36ad13df160f88bcb274f2e78dc7eebdd665e Mon Sep 17 00:00:00 2001 From: zhanggan7723 Date: Mon, 26 Jan 2026 20:13:54 +0800 Subject: [PATCH 14/19] Update powerrag/utils/file_utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- powerrag/utils/file_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/powerrag/utils/file_utils.py b/powerrag/utils/file_utils.py index bf6d7b084..334517c5b 100644 --- a/powerrag/utils/file_utils.py +++ b/powerrag/utils/file_utils.py @@ -352,6 +352,8 @@ def detect_file_type(binary: bytes) -> str: if any(n.startswith("xl/") for 
n in names): return 'office' # xlsx except Exception: + # Any error while reading as a ZIP (corrupt/non-Office archive, etc.) + # means we cannot classify it as a ZIP-based Office file; fall through. pass # Check OLE-based Office formats (doc, xls, ppt) From 56358669c2517e16a441423cc82bd34110eb7b0c Mon Sep 17 00:00:00 2001 From: zhanggan7723 Date: Mon, 26 Jan 2026 20:20:19 +0800 Subject: [PATCH 15/19] Update powerrag/utils/file_utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- powerrag/utils/file_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/powerrag/utils/file_utils.py b/powerrag/utils/file_utils.py index 334517c5b..5e96988e8 100644 --- a/powerrag/utils/file_utils.py +++ b/powerrag/utils/file_utils.py @@ -352,6 +352,9 @@ def detect_file_type(binary: bytes) -> str: if any(n.startswith("xl/") for n in names): return 'office' # xlsx except Exception: + # If the ZIP is not a recognized Office document (or cannot be read), + # fall through to other format checks; it may be classified as 'unknown' + # at the end if no other type matches. # Any error while reading as a ZIP (corrupt/non-Office archive, etc.) # means we cannot classify it as a ZIP-based Office file; fall through. 
pass From 2b3496a769ad71666a5e9fa9042e04ada5d6bab9 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Tue, 27 Jan 2026 21:11:55 +0800 Subject: [PATCH 16/19] refactor(document): update README and code to use 'content' instead of 'markdown' for parsed results --- powerrag/sdk/README.md | 54 ++++- powerrag/sdk/modules/document_manager.py | 16 +- powerrag/sdk/tests/test_document.py | 227 ++++++++++++------ powerrag/server/routes/powerrag_routes.py | 276 +++++++++++++++++----- powerrag/server/services/parse_service.py | 61 +++-- powerrag/utils/file_utils.py | 2 + 6 files changed, 470 insertions(+), 166 deletions(-) diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md index 410d07f2c..f59868396 100644 --- a/powerrag/sdk/README.md +++ b/powerrag/sdk/README.md @@ -124,6 +124,9 @@ result = client.document.parse_to_md_upload( "document.pdf", config={"layout_recognize": "mineru"} ) +# 访问返回结果 +print(result['content']) # Markdown 内容 +print(result['total_images']) # 图片总数 # 从 URL 下载并解析(直接调用 API) import requests @@ -138,6 +141,9 @@ response = requests.post( } ) result = response.json() +# 访问返回结果 +print(result['data']['content']) # Markdown 内容 +print(result['data']['total_images']) # 图片总数 # 使用二进制数据解析(支持无扩展名文件) with open("document.pdf", "rb") as f: @@ -148,6 +154,8 @@ result = client.document.parse_to_md_binary( filename="document.pdf", # 有扩展名时自动识别 config={"layout_recognize": "mineru"} ) +# 访问返回结果 +print(result['content']) # Markdown 内容 # 对于无扩展名的文件,使用 input_type='auto' 自动识别 result = client.document.parse_to_md_binary( @@ -155,13 +163,17 @@ result = client.document.parse_to_md_binary( filename="document_no_extension", # 无扩展名 # input_type='auto' 是默认值,会自动从二进制内容检测文件类型 ) +# 访问返回结果 +print(result['content']) # Markdown 内容 -# 或显式指定文件类型 +# 或显式指定文件类型(使用具体扩展名) result = client.document.parse_to_md_binary( file_binary=binary_data, filename="document", - input_type="pdf" # 强制作为 PDF 处理 + input_type="pdf" # 具体扩展名: 'pdf', 'docx', 'html', 'jpg' 等 ) +# 访问返回结果 +print(result['content']) # Markdown 
内容 ``` ### 检索 @@ -236,13 +248,20 @@ result = client.document.parse_to_md_binary( file_binary=binary_data, filename="document_no_extension" # 无扩展名也可以 ) +# 访问返回结果 +print(result['content']) # Markdown 内容 # 或显式指定类型 result = client.document.parse_to_md_binary( file_binary=binary_data, filename="document", - input_type="pdf" # 'pdf', 'office', 'html', 'image' + input_type="pdf" # 具体扩展名: 'pdf', 'docx', 'html', 'jpg' 等 ) + +# 访问返回结果 +print(result['content']) # Markdown 内容 +print(result['total_images']) # 图片总数 +print(result['images']) # 图片字典 ``` **配置选项:** @@ -253,7 +272,7 @@ result = client.document.parse_to_md_binary( - `from_page`/`to_page`: PDF 页面范围 - `input_type`: 文件类型识别模式(默认: `'auto'`) - `'auto'`: 优先使用文件扩展名,无扩展名时自动检测(推荐) - - `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定文件类型 + - 具体文件扩展名: `'pdf'`, `'docx'`, `'doc'`, `'xlsx'`, `'xls'`, `'pptx'`, `'ppt'`, `'html'`, `'htm'`, `'jpg'`, `'jpeg'`, `'png'` - 显式指定文件扩展名 ### 结构化信息抽取 @@ -410,6 +429,8 @@ result = client.document.wait_for_parse_to_md(task_id, timeout=300) # 上传并解析为 Markdown(无需知识库) result = client.document.parse_to_md_upload("file.pdf", config={...}) +# 访问返回结果 +print(result['content']) # Markdown 内容 # 使用 file_url 参数从 URL 下载并解析(直接调用 API) import requests @@ -427,6 +448,8 @@ response = requests.post( } ) result = response.json() +# 访问返回结果 +print(result['data']['content']) # Markdown 内容 # 使用 file_url 并指定文件名 response = requests.post( @@ -441,6 +464,9 @@ response = requests.post( }) } ) +result = response.json() +# 访问返回结果 +print(result['data']['content']) # Markdown 内容 # 使用二进制数据解析为 Markdown with open("document.pdf", "rb") as f: @@ -452,6 +478,8 @@ result = client.document.parse_to_md_binary( config={"layout_recognize": "mineru"}, input_type="auto" # 默认值,自动识别文件类型 ) +# 访问返回结果 +print(result['content']) # Markdown 内容 # 无扩展名文件解析(自动检测文件类型) result = client.document.parse_to_md_binary( @@ -459,6 +487,8 @@ result = client.document.parse_to_md_binary( filename="document_no_extension", # 无扩展名 # input_type='auto' 会从二进制内容自动检测 
PDF/Office/HTML 等 ) +# 访问返回结果 +print(result['content']) # Markdown 内容 # 显式指定文件类型(跳过自动检测) result = client.document.parse_to_md_binary( @@ -466,6 +496,8 @@ result = client.document.parse_to_md_binary( filename="document", input_type="pdf" # 强制作为 PDF 处理 ) +# 访问返回结果 +print(result['content']) # Markdown 内容 # 解析URL文档(同步等待) doc = client.document.parse_url( @@ -1184,11 +1216,13 @@ result = client.document.parse_to_md_binary( filename="document_no_extension" # input_type='auto' 是默认值,可以省略 ) +# 访问返回结果 +print(result['content']) # Markdown 内容 ``` 支持的 `input_type` 值: - `'auto'` (默认): 优先使用文件扩展名,无扩展名或不支持时从二进制内容自动检测 -- `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定文件类型 +- 具体文件扩展名: `'pdf'`, `'docx'`, `'doc'`, `'xlsx'`, `'xls'`, `'pptx'`, `'ppt'`, `'html'`, `'htm'`, `'jpg'`, `'jpeg'`, `'png'` - 显式指定文件扩展名 ### Q: 如何从URL直接解析文件? @@ -1212,7 +1246,7 @@ response = requests.post( } ) result = response.json() -print(result['data']['markdown']) +print(result['data']['content']) ``` **方式 2:指定文件名(适用于无扩展名URL)** @@ -1224,7 +1258,7 @@ response = requests.post( "file_url": "https://api.example.com/download?id=123", "config": json.dumps({ "filename": "report.pdf", # 覆盖文件名 - "input_type": "pdf", # 显式指定类型 + "input_type": "pdf", # 显式指定文件扩展名 "layout_recognize": "mineru", "enable_table": True }) @@ -1249,6 +1283,10 @@ response = requests.post( "config": json.dumps({"layout_recognize": "mineru"}) } ) +result = response.json() +# 访问返回结果 +print(result['data']['content']) # Markdown 内容 +print(result['data']['total_images']) # 图片总数 ``` **配置参数说明:** @@ -1256,7 +1294,7 @@ response = requests.post( - `config.filename` (str): 自定义文件名(可选,不提供则从 URL 提取) - `config.input_type` (str): 文件类型检测模式 - `'auto'` (默认): 优先使用扩展名,无扩展名时自动检测 - - `'pdf'`, `'office'`, `'html'`, `'image'`: 显式指定类型 + - 具体文件扩展名: `'pdf'`, `'docx'`, `'doc'`, `'xlsx'`, `'xls'`, `'pptx'`, `'ppt'`, `'html'`, `'htm'`, `'jpg'`, `'jpeg'`, `'png'` - 显式指定文件扩展名 - `config.layout_recognize` (str): 布局识别引擎(`'mineru'` 或 `'dots_ocr'`) - 其他解析配置参数同 `parse_to_md` 方法 diff --git 
a/powerrag/sdk/modules/document_manager.py b/powerrag/sdk/modules/document_manager.py index 08b58fea4..8f3ac6bcd 100644 --- a/powerrag/sdk/modules/document_manager.py +++ b/powerrag/sdk/modules/document_manager.py @@ -695,9 +695,7 @@ def parse_to_md_upload( Returns: 解析结果字典,包含以下字段: - - filename: 文件名 - - markdown: Markdown 内容 - - markdown_length: Markdown 长度 + - content: Markdown 内容 - images: 图片字典 - total_images: 图片总数 @@ -710,14 +708,14 @@ def parse_to_md_upload( >>> result = doc_manager.parse_to_md_upload( ... file_path="document.pdf" ... ) - >>> print(result['markdown']) + >>> print(result['content']) >>> >>> # 对于无扩展名文件,input_type='auto' 会自动从二进制内容识别 >>> result = doc_manager.parse_to_md_upload( ... file_path="document_no_ext" ... # input_type='auto' 是默认值,可以省略 ... ) - >>> print(result['markdown']) + >>> print(result['content']) >>> >>> # 显式指定文件类型(跳过自动识别) >>> result = doc_manager.parse_to_md_upload( @@ -769,9 +767,7 @@ def parse_to_md_binary( Returns: 解析结果字典,包含以下字段: - - filename: 文件名 - - markdown: Markdown 内容 - - markdown_length: Markdown 长度 + - content: Markdown 内容 - images: 图片字典 (base64) - total_images: 图片总数 @@ -787,7 +783,7 @@ def parse_to_md_binary( ... file_binary=file_binary, ... filename="document.pdf" ... ) - >>> print(result['markdown']) + >>> print(result['content']) >>> >>> # 对于无扩展名的二进制数据,input_type='auto' 会自动识别 >>> result = doc_manager.parse_to_md_binary( @@ -795,7 +791,7 @@ def parse_to_md_binary( ... filename="document" # 无扩展名 ... # input_type='auto' 是默认值 ... 
) - >>> print(result['markdown']) + >>> print(result['content']) >>> >>> # 显式指定文件类型(跳过自动识别) >>> result = doc_manager.parse_to_md_binary( diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py index d882c4cda..173b1ddde 100644 --- a/powerrag/sdk/tests/test_document.py +++ b/powerrag/sdk/tests/test_document.py @@ -396,13 +396,11 @@ def test_parse_to_md_upload_json_response(self, client: PowerRAGClient, test_fil result = client.document.parse_to_md_upload(test_file_path) # 验证返回结果 - assert "filename" in result - assert "markdown" in result - assert "markdown_length" in result + assert "content" in result assert "images" in result assert "total_images" in result - assert isinstance(result["markdown"], str) - assert result["markdown_length"] > 0 + assert isinstance(result["content"], str) + assert len(result["content"]) > 0 def test_parse_to_md_upload_with_config(self, client: PowerRAGClient, test_file_path: str): """测试带配置参数上传并解析""" @@ -412,8 +410,8 @@ def test_parse_to_md_upload_with_config(self, client: PowerRAGClient, test_file_ } result = client.document.parse_to_md_upload(test_file_path, config=config) - assert "markdown" in result - assert len(result["markdown"]) > 0 + assert "content" in result + assert len(result["content"]) > 0 def test_parse_to_md_upload_nonexistent_file(self, client: PowerRAGClient): """测试上传不存在的文件""" @@ -426,8 +424,8 @@ def test_parse_to_md_upload_different_formats(self, client: PowerRAGClient, test # 这里我们只测试 txt 文件,实际使用时可以添加更多格式 result = client.document.parse_to_md_upload(test_file_path) - assert "markdown" in result - assert result["markdown_length"] > 0 + assert "content" in result + assert len(result["content"]) > 0 class TestDocumentParseToMDBinary: @@ -446,13 +444,11 @@ def test_parse_to_md_binary_basic(self, client: PowerRAGClient, test_file_path: ) # 验证返回结果 - assert "filename" in result - assert "markdown" in result - assert "markdown_length" in result + assert "content" in result assert "images" in 
result assert "total_images" in result - assert isinstance(result["markdown"], str) - assert result["markdown_length"] > 0 + assert isinstance(result["content"], str) + assert len(result["content"]) > 0 def test_parse_to_md_binary_with_config(self, client: PowerRAGClient, test_file_path: str): """测试带配置参数的二进制文件解析""" @@ -471,9 +467,8 @@ def test_parse_to_md_binary_with_config(self, client: PowerRAGClient, test_file_ config=config ) - assert "markdown" in result - assert len(result["markdown"]) > 0 - assert result["markdown_length"] > 0 + assert "content" in result + assert len(result["content"]) > 0 def test_parse_to_md_binary_empty_content(self, client: PowerRAGClient): """测试空的二进制内容""" @@ -510,8 +505,8 @@ def test_parse_to_md_binary_different_file_types(self, client: PowerRAGClient, t filename="test.html" ) - assert "markdown" in result - assert result["markdown_length"] > 0 + assert "content" in result + assert len(result["content"]) > 0 def test_parse_to_md_binary_with_images(self, client: PowerRAGClient, test_file_path: str): """测试解析带图片的文档(二进制)""" @@ -541,8 +536,7 @@ def test_parse_to_md_binary_filename_with_extension(self, client: PowerRAGClient filename="document.html" ) - assert result["filename"] == "document.html" - assert "markdown" in result + assert "content" in result def test_parse_to_md_binary_large_file(self, client: PowerRAGClient, tmp_path): """测试较大文件的二进制解析""" @@ -559,10 +553,10 @@ def test_parse_to_md_binary_large_file(self, client: PowerRAGClient, tmp_path): filename="large_test.html" ) - assert "markdown" in result - assert result["markdown_length"] > 0 + assert "content" in result + assert len(result["content"]) > 0 # 验证内容长度合理 - assert len(result["markdown"]) > 1000 + assert len(result["content"]) > 1000 def test_parse_to_md_binary_utf8_content(self, client: PowerRAGClient, tmp_path): """测试包含UTF-8字符的文件""" @@ -588,13 +582,111 @@ def test_parse_to_md_binary_utf8_content(self, client: PowerRAGClient, tmp_path) filename="utf8_test.html" ) - assert 
"markdown" in result - assert result["markdown_length"] > 0 + assert "content" in result + assert len(result["content"]) > 0 class TestDocumentInputTypeAutoDetection: """测试 input_type 自动检测功能""" + @staticmethod + def _create_valid_pdf_content(text_lines=None): + """ + 创建一个有效的 PDF 文件内容 + + Args: + text_lines: 可选的文本行列表,用于自定义 PDF 内容 + + Returns: + bytes: PDF 文件的二进制内容 + """ + if text_lines is None: + text_lines = ["Test PDF File"] + + try: + from reportlab.pdfgen import canvas + from io import BytesIO + + # 创建一个有效的 PDF 内容 + buffer = BytesIO() + c = canvas.Canvas(buffer) + y_pos = 750 + for line in text_lines: + c.drawString(100, y_pos, line) + y_pos -= 20 + c.showPage() + c.save() + pdf_content = buffer.getvalue() + buffer.close() + return pdf_content + except ImportError: + # 如果 reportlab 不可用,创建一个最小但有效的 PDF + # 这是一个最小有效的 PDF 1.4 文档,包含一个文本对象 + return ( + b"%PDF-1.4\n" + b"1 0 obj\n" + b"<<\n" + b"/Type /Catalog\n" + b"/Pages 2 0 R\n" + b">>\n" + b"endobj\n" + b"2 0 obj\n" + b"<<\n" + b"/Type /Pages\n" + b"/Kids [3 0 R]\n" + b"/Count 1\n" + b">>\n" + b"endobj\n" + b"3 0 obj\n" + b"<<\n" + b"/Type /Page\n" + b"/Parent 2 0 R\n" + b"/MediaBox [0 0 612 792]\n" + b"/Contents 4 0 R\n" + b"/Resources <<\n" + b"/Font <<\n" + b"/F1 5 0 R\n" + b">>\n" + b">>\n" + b">>\n" + b"endobj\n" + b"4 0 obj\n" + b"<<\n" + b"/Length 44\n" + b">>\n" + b"stream\n" + b"BT\n" + b"/F1 12 Tf\n" + b"100 700 Td\n" + b"(Test PDF) Tj\n" + b"ET\n" + b"endstream\n" + b"endobj\n" + b"5 0 obj\n" + b"<<\n" + b"/Type /Font\n" + b"/Subtype /Type1\n" + b"/BaseFont /Helvetica\n" + b">>\n" + b"endobj\n" + b"xref\n" + b"0 6\n" + b"0000000000 65535 f \n" + b"0000000009 00000 n \n" + b"0000000058 00000 n \n" + b"0000000115 00000 n \n" + b"0000000306 00000 n \n" + b"0000000400 00000 n \n" + b"trailer\n" + b"<<\n" + b"/Size 6\n" + b"/Root 1 0 R\n" + b">>\n" + b"startxref\n" + b"492\n" + b"%%EOF\n" + ) + def test_auto_detection_with_valid_extension(self, client: PowerRAGClient, tmp_path): 
"""测试有有效扩展名时,input_type='auto' 优先使用扩展名""" # 创建一个 HTML 文件 @@ -612,28 +704,29 @@ def test_auto_detection_with_valid_extension(self, client: PowerRAGClient, tmp_p # input_type='auto' 是默认值,可以省略 ) - assert "markdown" in result - assert result["filename"] == "test.html" - assert result["markdown_length"] > 0 + assert "content" in result + assert len(result["content"]) > 0 def test_auto_detection_without_extension_pdf(self, client: PowerRAGClient, tmp_path): """测试无扩展名 PDF 文件,input_type='auto' 会自动从二进制检测""" - # 创建一个简单的 PDF 文件头(实际测试可能需要真实的 PDF) - # 这里我们创建一个有 PDF 魔术数的文件 - pdf_header = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n" - pdf_content = pdf_header + b"1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n" - - # 使用没有扩展名的文件名 + # 创建一个有效的 PDF 文件 + pdf_content = self._create_valid_pdf_content([ + "Test PDF File", + "This is a test PDF document for auto-detection testing.", + "The file has no extension, so binary detection should be used." + ]) + + # 使用没有扩展名的文件名,input_type='auto' 会从二进制内容检测出 PDF result = client.document.parse_to_md_binary( file_binary=pdf_content, filename="document_no_extension" - # input_type='auto' 会从二进制内容检测出 PDF + # input_type='auto' 是默认值,会从二进制内容检测出 PDF ) - # 注意:这个测试可能会因为 PDF 内容不完整而失败 - # 实际环境中需要使用真实的 PDF 文件 - assert "filename" in result - assert result["filename"] == "document_no_extension" + # 验证解析结果 + assert "content" in result, "Result should contain 'content' field" + assert len(result["content"]) > 0, "Content should not be empty (PDF should be successfully parsed)" + assert isinstance(result["content"], str), "Content should be a string" def test_auto_detection_without_extension_html(self, client: PowerRAGClient): """测试无扩展名 HTML 文件,input_type='auto' 会自动从二进制检测""" @@ -646,15 +739,16 @@ def test_auto_detection_without_extension_html(self, client: PowerRAGClient): # input_type='auto' 会从二进制内容检测出 HTML ) - assert "markdown" in result - assert result["filename"] == "document_without_ext" - assert result["markdown_length"] > 0 + assert "content" in result + assert 
len(result["content"]) > 0 def test_explicit_input_type_pdf(self, client: PowerRAGClient, tmp_path): """测试显式指定 input_type='pdf'""" - # 创建一个简单的 PDF 内容 - pdf_header = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n" - pdf_content = pdf_header + b"1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n" + # 创建一个有效的 PDF 文件 + pdf_content = self._create_valid_pdf_content([ + "Test PDF File", + "This is a test PDF document for PowerRAG SDK testing." + ]) # 显式指定为 PDF 类型,即使文件名没有扩展名 result = client.document.parse_to_md_binary( @@ -663,7 +757,10 @@ def test_explicit_input_type_pdf(self, client: PowerRAGClient, tmp_path): input_type="pdf" # 显式指定类型 ) - assert "filename" in result + # 验证解析结果 + assert "content" in result, "Result should contain 'content' field" + assert len(result["content"]) > 0, "Content should not be empty" + assert isinstance(result["content"], str), "Content should be a string" def test_explicit_input_type_html(self, client: PowerRAGClient): """测试显式指定 input_type='html'""" @@ -676,8 +773,8 @@ def test_explicit_input_type_html(self, client: PowerRAGClient): input_type="html" # 显式指定类型 ) - assert "markdown" in result - assert result["markdown_length"] > 0 + assert "content" in result + assert len(result["content"]) > 0 def test_parse_to_md_upload_with_auto_detection(self, client: PowerRAGClient, tmp_path): """测试 parse_to_md_upload 方法的自动检测功能""" @@ -689,9 +786,8 @@ def test_parse_to_md_upload_with_auto_detection(self, client: PowerRAGClient, tm # 使用默认的 input_type='auto' result = client.document.parse_to_md_upload(str(html_file)) - assert "markdown" in result - assert "filename" in result - assert result["markdown_length"] > 0 + assert "content" in result + assert len(result["content"]) > 0 def test_parse_to_md_upload_with_explicit_type(self, client: PowerRAGClient, tmp_path): """测试 parse_to_md_upload 显式指定类型""" @@ -705,8 +801,8 @@ def test_parse_to_md_upload_with_explicit_type(self, client: PowerRAGClient, tmp input_type="html" ) - assert "markdown" in result - assert result["markdown_length"] 
> 0 + assert "content" in result + assert len(result["content"]) > 0 def test_auto_detection_priority_extension_over_binary(self, client: PowerRAGClient, tmp_path): """测试 input_type='auto' 优先使用扩展名而非二进制检测""" @@ -725,10 +821,9 @@ def test_auto_detection_priority_extension_over_binary(self, client: PowerRAGCli # input_type='auto' 默认值 ) - assert "markdown" in result - assert result["filename"] == "priority_test.html" + assert "content" in result # 验证确实解析成功(说明使用了正确的类型) - assert "Priority Test" in result["markdown"] or result["markdown_length"] > 0 + assert "Priority Test" in result["content"] or len(result["content"]) > 0 def test_auto_detection_fallback_to_binary(self, client: PowerRAGClient): """测试扩展名不支持时,fallback 到二进制检测""" @@ -742,8 +837,8 @@ def test_auto_detection_fallback_to_binary(self, client: PowerRAGClient): ) # 应该能够通过二进制检测识别为 HTML - assert "markdown" in result - assert result["markdown_length"] > 0 + assert "content" in result + assert len(result["content"]) > 0 def test_config_with_input_type(self, client: PowerRAGClient, tmp_path): """测试 config 中包含 input_type 参数""" @@ -765,8 +860,8 @@ def test_config_with_input_type(self, client: PowerRAGClient, tmp_path): input_type="html" ) - assert "markdown" in result - assert result["markdown_length"] > 0 + assert "content" in result + assert len(result["content"]) > 0 class TestDocumentFileUrl: @@ -793,8 +888,8 @@ def test_parse_from_url_basic(self, client: PowerRAGClient): assert response.status_code == 200 result = response.json() assert result["code"] == 0 - assert "markdown" in result["data"] - assert result["data"]["markdown_length"] > 0 + assert "content" in result["data"] + assert len(result["data"]["content"]) > 0 def test_parse_from_url_with_filename(self, client: PowerRAGClient): """测试从URL下载并指定文件名""" @@ -819,7 +914,6 @@ def test_parse_from_url_with_filename(self, client: PowerRAGClient): assert response.status_code == 200 result = response.json() assert result["code"] == 0 - assert result["data"]["filename"] 
== custom_filename def test_parse_from_url_with_auto_detection(self, client: PowerRAGClient): """测试从URL下载,使用 input_type='auto' 自动检测""" @@ -842,7 +936,7 @@ def test_parse_from_url_with_auto_detection(self, client: PowerRAGClient): assert response.status_code == 200 result = response.json() assert result["code"] == 0 - assert "markdown" in result["data"] + assert "content" in result["data"] def test_parse_from_invalid_url(self, client: PowerRAGClient): """测试无效URL应返回错误""" @@ -935,8 +1029,7 @@ def test_parse_from_url_with_config(self, client: PowerRAGClient): assert response.status_code == 200 result = response.json() assert result["code"] == 0 - assert result["data"]["filename"] == "complete_config.html" - assert "markdown" in result["data"] + assert "content" in result["data"] def test_parse_from_url_empty_file(self, client: PowerRAGClient): """测试从URL下载空文件应返回错误""" diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py index 6a3dd6b1d..b7d7f3221 100644 --- a/powerrag/server/routes/powerrag_routes.py +++ b/powerrag/server/routes/powerrag_routes.py @@ -17,15 +17,20 @@ """PowerRAG Unified API Routes""" import os +import json import logging +from pathlib import Path +from urllib.parse import urlparse from quart import Blueprint, request, jsonify, Response from powerrag.server.services.parse_service import PowerRAGParseService from powerrag.server.services.convert_service import PowerRAGConvertService from powerrag.server.services.split_service import PowerRAGSplitService from powerrag.server.services.extract_service import PowerRAGExtractService from powerrag.utils.api_utils import get_data_error_result +from powerrag.utils.file_utils import detect_file_type from api.utils.api_utils import apikey_required import langextract as lx +import requests # Import RAGFlow services for task queue integration from api.db.services.document_service import DocumentService @@ -44,6 +49,114 @@ gotenberg_config = get_base_config("gotenberg", {}) 
GOTENBERG_URL = gotenberg_config.get("url", os.environ.get("GOTENBERG_URL", "http://localhost:3000")) +# File download timeout settings (in seconds) +DEFAULT_DOWNLOAD_TIMEOUT = int(os.environ.get("FILE_DOWNLOAD_TIMEOUT", "300")) # 5 minutes default +DEFAULT_HEAD_REQUEST_TIMEOUT = int(os.environ.get("FILE_HEAD_REQUEST_TIMEOUT", "30")) # 30 seconds default + +# File extension to format type mapping (used across multiple endpoints) +FILE_EXTENSION_TO_FORMAT_TYPE = { + 'pdf': 'pdf', + 'docx': 'office', 'doc': 'office', + 'xlsx': 'office', 'xls': 'office', + 'pptx': 'office', 'ppt': 'office', + 'html': 'html', 'htm': 'html', + 'jpg': 'image', 'jpeg': 'image', + 'png': 'image' +} + + +def download_file_with_validation( + file_url: str, + max_file_size: int, + download_timeout: int = DEFAULT_DOWNLOAD_TIMEOUT, + head_timeout: int = DEFAULT_HEAD_REQUEST_TIMEOUT +) -> tuple[bytes, str | None]: + """ + Download file from URL with size validation and intelligent download strategy. + + This function implements a two-layer defense strategy: + 1. HEAD request pre-check: Fast rejection for oversized files (saves bandwidth) + 2. 
Intelligent download: Direct or streaming based on Content-Length availability + + Args: + file_url: URL of the file to download + max_file_size: Maximum allowed file size in bytes + download_timeout: Timeout for GET request in seconds + head_timeout: Timeout for HEAD request in seconds + + Returns: + tuple: (binary_content, error_message) + - binary_content: Downloaded file content as bytes (None if error) + - error_message: Error message if download failed (None if success) + + Raises: + requests.exceptions.RequestException: For network-related errors + """ + max_size_mb = max_file_size / (1024 * 1024) + + # First, make a HEAD request to check Content-Length before downloading + content_length_known = False + verified_content_length = None + + try: + head_response = requests.head(file_url, timeout=head_timeout, allow_redirects=True) + content_length = head_response.headers.get('Content-Length') + + if content_length: + content_length = int(content_length) + if content_length > max_file_size: + logger.warning(f"File size {content_length} bytes exceeds limit {max_file_size} bytes") + return None, f"File size ({content_length / (1024 * 1024):.2f}MB) exceeds maximum allowed size ({max_size_mb:.2f}MB)" + logger.info(f"Content-Length check passed: {content_length} bytes") + content_length_known = True + verified_content_length = content_length + except requests.exceptions.RequestException as e: + # HEAD request failed, continue with streaming download with size checks + logger.info(f"HEAD request failed, will use streaming download with size checks: {e}") + + # Choose download strategy based on whether Content-Length is known + if content_length_known: + # Direct download: Content-Length verified, size is within limit + logger.info(f"Using direct download (Content-Length verified: {verified_content_length} bytes)") + response = requests.get(file_url, timeout=download_timeout) + response.raise_for_status() + binary = response.content + + # Verify actual size matches 
Content-Length (defense against malicious servers) + actual_size = len(binary) + if actual_size != verified_content_length: + logger.warning(f"Size mismatch: Content-Length={verified_content_length}, actual={actual_size}") + if actual_size > max_file_size: + return None, f"File size ({actual_size / (1024 * 1024):.2f}MB) exceeds maximum allowed size ({max_size_mb:.2f}MB)" + + logger.info(f"Successfully downloaded {actual_size} bytes") + return binary, None + else: + # Streaming download with size limit enforcement + logger.info(f"Using streaming download (Content-Length unknown, will enforce size limit during download)") + response = requests.get(file_url, timeout=download_timeout, stream=True) + response.raise_for_status() + + # Download in chunks and enforce size limit + downloaded_size = 0 + chunks = [] + chunk_size = 8192 # 8KB chunks + + for chunk in response.iter_content(chunk_size=chunk_size): + if chunk: + downloaded_size += len(chunk) + + # Check if size limit exceeded during download + if downloaded_size > max_file_size: + logger.warning(f"Download aborted: size exceeded {max_file_size} bytes during streaming") + return None, f"File size exceeds maximum allowed size ({max_size_mb:.2f}MB). Download aborted at {downloaded_size / (1024 * 1024):.2f}MB." 
+ + chunks.append(chunk) + + binary = b''.join(chunks) + logger.info(f"Successfully downloaded {downloaded_size} bytes") + return binary, None + # ============================================================================ # 文档解析接口 @@ -727,22 +840,11 @@ async def parse_to_md(tenant_id): service = PowerRAGParseService(gotenberg_url=gotenberg_url) # Parse document to markdown (no chunking) - from pathlib import Path file_ext = Path(doc.name).suffix.lstrip('.').lower() # Determine format type # Supported: PDF, Office (doc/docx/ppt/pptx), HTML, Images (jpg/png) - format_type_map = { - 'pdf': 'pdf', - 'docx': 'office', 'doc': 'office', - 'xlsx': 'office', 'xls': 'office', - 'pptx': 'office', 'ppt': 'office', - 'html': 'html', 'htm': 'html', - 'jpg': 'image', 'jpeg': 'image', - 'png': 'image' - } - - format_type = format_type_map.get(file_ext) + format_type = FILE_EXTENSION_TO_FORMAT_TYPE.get(file_ext) if not format_type: return jsonify({ "code": 400, @@ -852,15 +954,8 @@ async def parse_to_md_async(tenant_id): }), 404 # Determine format type - from pathlib import Path file_ext = Path(doc.name).suffix.lstrip('.').lower() - format_type_map = { - 'pdf': 'pdf', 'docx': 'office', 'doc': 'office', - 'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office', - 'html': 'html', 'htm': 'html', - 'jpg': 'image', 'jpeg': 'image', 'png': 'image' - } - format_type = format_type_map.get(file_ext, 'pdf') + format_type = FILE_EXTENSION_TO_FORMAT_TYPE.get(file_ext, 'pdf') # Get task manager and service from powerrag.server.services.parse_to_md_task_manager import get_task_manager @@ -974,6 +1069,8 @@ async def parse_to_md_upload(tenant_id): - file: File to parse (optional, required if file_url not provided) - file_url: URL of file to download and parse (optional, required if file not provided) - config: JSON string of parser config (optional) + - download_timeout: Timeout in seconds for file download (optional, default: 300) + - head_request_timeout: Timeout in seconds for 
HEAD request (optional, default: 30) Config parameters: - layout_recognize (str): mineru or dots_ocr (default: mineru) @@ -984,16 +1081,16 @@ async def parse_to_md_upload(tenant_id): - to_page (int): End page number (default: 100000) - input_type (str): File type detection mode (default: 'auto'). Options: * 'auto': Try filename extension first, then auto-detect from binary if no extension (default) - * 'pdf', 'office', 'html', 'image': Explicit file type (bypass detection) + * Specific file extension: 'pdf', 'docx', 'doc', 'xlsx', 'xls', 'pptx', 'ppt', 'html', 'htm', 'jpg', 'jpeg', 'png' (bypass detection) - filename (str): Override filename (optional, useful with file_url) + - max_file_size (int): Maximum file size in bytes for URL downloads (optional, + default: uses DOC_MAXIMUM_SIZE from settings, typically 128MB) Response JSON: { "code": 0, "data": { - "filename": "document.pdf", - "markdown": "# Title\n\nContent...", - "markdown_length": 5000, + "content": "# Title\n\nContent...", "images": {...}, "total_images": 2 }, @@ -1002,20 +1099,23 @@ async def parse_to_md_upload(tenant_id): """ try: # Parse config from JSON string if provided - import json form = await request.form config_str = form.get('config', '{}') try: config = json.loads(config_str) - except json.JSONDecodeError: + except json.JSONDecodeError as e: + logger.warning(f"Invalid JSON in config parameter: {e}") return jsonify({ "code": 400, - "message": "Invalid JSON in config parameter" + "message": f"Invalid JSON in config parameter: {str(e)}" }), 400 # Get file_url parameter file_url = form.get('file_url') + # Get timeout parameters from form (optional) + download_timeout_str = form.get('download_timeout') + # Check if file or file_url is provided files = await request.files has_file = 'file' in files and files['file'].filename != '' @@ -1036,16 +1136,12 @@ async def parse_to_md_upload(tenant_id): # Handle file upload or URL download if has_file: # Direct file upload + # Note: file.read() is 
synchronous in Quart but runs in async context. + # For large files, consider using streaming or async file reading in future. file = files['file'] filename = file.filename logger.info(f"Received file upload: filename={filename}") - if not filename: - return jsonify({ - "code": 400, - "message": "Filename is required" - }), 400 - binary = file.read() if not binary: return jsonify({ @@ -1054,22 +1150,66 @@ async def parse_to_md_upload(tenant_id): }), 400 else: # Download from URL - import requests - from urllib.parse import urlparse - from pathlib import Path + # Get maximum file size from settings (default: 128MB or from environment) + max_file_size = config.get('max_file_size', settings.DOC_MAXIMUM_SIZE) + max_size_mb = max_file_size / (1024 * 1024) + + # Get timeout settings from form parameters with fallback to config or environment defaults + try: + download_timeout = int(download_timeout_str) if download_timeout_str else config.get('download_timeout', DEFAULT_DOWNLOAD_TIMEOUT) + except (ValueError, TypeError): + download_timeout = config.get('download_timeout', DEFAULT_DOWNLOAD_TIMEOUT) + + head_timeout = config.get('head_request_timeout', DEFAULT_HEAD_REQUEST_TIMEOUT) - logger.info(f"Downloading file from URL: {file_url}") + logger.info(f"Downloading file from URL: {file_url} (max size: {max_size_mb:.2f}MB, timeout: {download_timeout}s)") try: - response = requests.get(file_url, timeout=60) - response.raise_for_status() - binary = response.content + # Use the download_file_with_validation function + binary, error_msg = download_file_with_validation( + file_url=file_url, + max_file_size=max_file_size, + download_timeout=download_timeout, + head_timeout=head_timeout + ) + + if error_msg: + # Download failed due to size limit or other validation error + return jsonify({ + "code": 400, + "message": error_msg + }), 400 + + except requests.exceptions.Timeout as e: + logger.error(f"Timeout downloading file from URL: {file_url}. 
Error: {e}") + return jsonify({ + "code": 408, + "message": f"Request timeout while downloading file from URL. Please try again or increase timeout." + }), 408 + except requests.exceptions.ConnectionError as e: + logger.error(f"Connection error downloading file from URL: {file_url}. Error: {e}") + return jsonify({ + "code": 503, + "message": f"Failed to connect to file URL. Please check the URL and try again." + }), 503 + except requests.exceptions.HTTPError as e: + logger.error(f"HTTP error downloading file from URL: {file_url}. Error: {e}") + return jsonify({ + "code": 502, + "message": f"HTTP error while downloading file: {str(e)}" + }), 502 except requests.exceptions.RequestException as e: - logger.error(f"Failed to download file from URL: {file_url}. Error: {e}") + logger.error(f"Request error downloading file from URL: {file_url}. Error: {e}") return jsonify({ "code": 400, "message": f"Failed to download file from URL: {str(e)}" }), 400 + except Exception as e: + logger.error(f"Unexpected error downloading file from URL: {file_url}. 
Error: {e}", exc_info=True) + return jsonify({ + "code": 500, + "message": f"Unexpected error while downloading file: {str(e)}" + }), 500 if not binary: return jsonify({ @@ -1096,28 +1236,17 @@ async def parse_to_md_upload(tenant_id): # Determine format type based on input_type if input_type == 'auto': # Auto mode: Try extension first, then binary detection - from pathlib import Path file_ext = Path(filename).suffix.lstrip('.').lower() if file_ext: # Has extension, try to use it - format_type_map = { - 'pdf': 'pdf', - 'docx': 'office', 'doc': 'office', - 'xlsx': 'office', 'xls': 'office', - 'pptx': 'office', 'ppt': 'office', - 'html': 'html', 'htm': 'html', - 'jpg': 'image', 'jpeg': 'image', - 'png': 'image' - } - format_type = format_type_map.get(file_ext) + format_type = FILE_EXTENSION_TO_FORMAT_TYPE.get(file_ext) if format_type: # Valid extension found logger.info(f"Using filename extension: {format_type} (.{file_ext}) for file: {filename}") else: # Unsupported extension, try auto-detect from binary - from powerrag.utils.file_utils import detect_file_type format_type = detect_file_type(binary) logger.info(f"Extension '{file_ext}' not supported, auto-detected from binary: {format_type} for file: {filename}") @@ -1128,7 +1257,6 @@ async def parse_to_md_upload(tenant_id): }), 400 else: # No extension, auto-detect from binary content - from powerrag.utils.file_utils import detect_file_type format_type = detect_file_type(binary) logger.info(f"No extension found, auto-detected file type from binary: {format_type} for file: {filename}") @@ -1137,15 +1265,23 @@ async def parse_to_md_upload(tenant_id): "code": 400, "message": f"Unable to determine file type for {filename}. File has no extension and binary auto-detection failed. Please provide a file with a valid extension or specify input_type explicitly." 
}), 400 - elif input_type in ['pdf', 'office', 'html', 'image']: - # Use explicitly specified input_type - format_type = input_type - logger.info(f"Using explicit input_type: {format_type} for file: {filename}") else: - return jsonify({ - "code": 400, - "message": f"Invalid input_type: {input_type}. Must be 'auto' (default), 'pdf', 'office', 'html', or 'image'." - }), 400 + # input_type is a specific file extension (e.g., 'pdf', 'docx', 'html', 'jpg') + # Normalize to lowercase and remove leading dot if present + input_ext = input_type.lstrip('.').lower() + + # Map extension to format type + format_type = FILE_EXTENSION_TO_FORMAT_TYPE.get(input_ext) + + if format_type: + logger.info(f"Using explicit input_type extension: {format_type} (.{input_ext}) for file: {filename}") + else: + # Invalid extension specified + supported_extensions = ', '.join(sorted(set(FILE_EXTENSION_TO_FORMAT_TYPE.keys()))) + return jsonify({ + "code": 400, + "message": f"Invalid input_type: '{input_type}'. Must be 'auto' (default) or a specific file extension: {supported_extensions}" + }), 400 # Create service and parse gotenberg_url = config.get("gotenberg_url", GOTENBERG_URL) @@ -1163,15 +1299,25 @@ async def parse_to_md_upload(tenant_id): return jsonify({ "code": 0, "data": { - "filename": filename, - "markdown": md_content, - "markdown_length": len(md_content), + "content": md_content, "images": images, "total_images": len(images) }, "message": "success" }), 200 + except json.JSONDecodeError as e: + logger.error(f"JSON decode error in parse_to_md_upload: {e}", exc_info=True) + return jsonify({ + "code": 400, + "message": f"Invalid JSON in request: {str(e)}" + }), 400 + except ValueError as e: + logger.error(f"Value error in parse_to_md_upload: {e}", exc_info=True) + return jsonify({ + "code": 400, + "message": str(e) + }), 400 except Exception as e: logger.error(f"Parse to markdown (upload) error: {e}", exc_info=True) return jsonify({ diff --git a/powerrag/server/services/parse_service.py 
b/powerrag/server/services/parse_service.py index 1c4d24f09..227e779a7 100644 --- a/powerrag/server/services/parse_service.py +++ b/powerrag/server/services/parse_service.py @@ -166,6 +166,10 @@ def parse_file_binary(self, binary: bytes, filename: str, } """ try: + # Normalize input_type: treat None as 'auto' + if input_type is None: + input_type = 'auto' + # Determine format type based on input_type parameter if input_type == 'auto': # Auto mode: Try extension first, then binary detection @@ -187,15 +191,27 @@ def parse_file_binary(self, binary: bytes, filename: str, f"File has no extension or unsupported extension '{file_ext}', and binary auto-detection failed. " f"Please provide a valid input_type explicitly." ) - elif input_type in ['pdf', 'office', 'html', 'image', 'markdown']: - # Use explicitly specified input_type - format_type = input_type - logger.info(f"Using explicit input_type: {format_type} for file: {filename}") else: - raise ValueError( - f"Invalid input_type: {input_type}. " - f"Must be 'auto', 'pdf', 'office', 'html', 'image', or 'markdown'" - ) + # input_type is a specific file extension (e.g., 'pdf', 'docx', 'html', 'jpg') + # Normalize to lowercase and remove leading dot if present + input_ext = input_type.lstrip('.').lower() + + # Map extension to format type + format_type = self.SUPPORTED_FORMATS.get(input_ext) + + if format_type: + logger.info(f"Using explicit input_type extension: {format_type} (.{input_ext}) for file: {filename}") + elif input_ext == 'markdown' or input_ext == 'md': + # Special case for markdown files + format_type = 'markdown' + logger.info(f"Using explicit input_type: {format_type} for file: {filename}") + else: + # Invalid extension specified + supported_extensions = ', '.join(sorted(set(self.SUPPORTED_FORMATS.keys()) | {'md', 'markdown'})) + raise ValueError( + f"Invalid input_type: '{input_type}'. 
" + f"Must be 'auto' (default) or a specific file extension: {supported_extensions}" + ) # Parse document to get markdown and images md_content, images = self._parse_to_markdown(filename, binary, format_type, config) @@ -666,17 +682,30 @@ def _parse_to_markdown_for_task(self, doc_id: str = None, filename: str = None, format_type = detect_file_type(binary) logger.info(f"Auto-detected file type: {format_type} for document {doc_id}") elif input_type: - format_type = input_type + # input_type is a specific file extension (e.g., 'pdf', 'docx', 'html', 'jpg') + # Normalize to lowercase and remove leading dot if present + input_ext = input_type.lstrip('.').lower() + + # Map extension to format type using SUPPORTED_FORMATS + format_type = self.SUPPORTED_FORMATS.get(input_ext) + + if format_type: + logger.info(f"Using explicit input_type extension: {format_type} (.{input_ext}) for document {doc_id}") + elif input_ext == 'markdown' or input_ext == 'md': + # Special case for markdown files + format_type = 'markdown' + logger.info(f"Using explicit input_type: {format_type} for document {doc_id}") + else: + # Invalid extension specified + supported_extensions = ', '.join(sorted(set(self.SUPPORTED_FORMATS.keys()) | {'md', 'markdown'})) + raise ValueError( + f"Invalid input_type: '{input_type}'. 
" + f"Must be 'auto' (default) or a specific file extension: {supported_extensions}" + ) else: # Auto-detect from file extension file_ext = Path(doc.name).suffix.lstrip('.').lower() - format_type_map = { - 'pdf': 'pdf', 'docx': 'office', 'doc': 'office', - 'xlsx': 'office', 'xls': 'office', 'pptx': 'office', 'ppt': 'office', - 'html': 'html', 'htm': 'html', - 'jpg': 'image', 'jpeg': 'image', 'png': 'image' - } - format_type = format_type_map.get(file_ext, 'pdf') + format_type = self.SUPPORTED_FORMATS.get(file_ext, 'pdf') filename = doc.name # Case 2: Parse from direct binary (filename, binary, format_type provided) diff --git a/powerrag/utils/file_utils.py b/powerrag/utils/file_utils.py index 5e96988e8..7c79e527b 100644 --- a/powerrag/utils/file_utils.py +++ b/powerrag/utils/file_utils.py @@ -374,6 +374,8 @@ def detect_file_type(binary: bytes) -> str: # Check HTML (basic detection) # Try to decode as text and check for HTML markers + # Note: Detection is case-insensitive - works regardless of original HTML tag casing + # (e.g., '', '', '' are all detected) try: text_sample = binary[:1024].decode('utf-8', errors='ignore').lower() if ' Date: Wed, 28 Jan 2026 16:22:49 +0800 Subject: [PATCH 17/19] feat(sdk): implement file splitting functionality and enhance documentation --- api/apps/sdk/powerrag_proxy.py | 45 ++++- powerrag/sdk/README.md | 195 ++++++++++++++++++++- powerrag/sdk/modules/chunk_manager.py | 168 +++++++++++++++++- powerrag/sdk/tests/test_chunk.py | 57 +++++- powerrag/sdk/tests/test_document.py | 5 + powerrag/server/routes/powerrag_routes.py | 203 +++++++++++++++++++++- powerrag/server/services/split_service.py | 177 ++++++++++++++++++- 7 files changed, 834 insertions(+), 16 deletions(-) diff --git a/api/apps/sdk/powerrag_proxy.py b/api/apps/sdk/powerrag_proxy.py index 3bafdf867..f0fffb811 100644 --- a/api/apps/sdk/powerrag_proxy.py +++ b/api/apps/sdk/powerrag_proxy.py @@ -91,12 +91,19 @@ async def _forward_request(method: str, endpoint: str, tenant_id: 
str = None): if files_dict: # 保留文件名信息!重要:不能直接 dict(files_dict) # 因为会丢失文件名。需要构造 httpx 期望的格式 + import asyncio + from io import BytesIO files = {} for field_name, file_storage in files_dict.items(): - # httpx 期望格式: (filename, content, content_type) + # 在线程中读取文件内容(避免阻塞事件循环) + # httpx 期望文件对象或元组格式 + # 使用 BytesIO 将 bytes 包装成文件对象 + file_content = await asyncio.to_thread(file_storage.read) + # httpx 期望格式: (filename, file_object, content_type) 或 (filename, file_object) + file_obj = BytesIO(file_content) files[field_name] = ( file_storage.filename, - file_storage.read(), + file_obj, file_storage.content_type or 'application/octet-stream' ) except Exception: @@ -592,3 +599,37 @@ async def parse_to_md_upload_proxy(tenant_id): """ return await _forward_request("POST", "/parse_to_md/upload", tenant_id) + +@manager.route("/powerrag/split/file", methods=["POST"]) # noqa: F821 +@token_required +async def split_file_proxy(tenant_id): + """ + 代理 split/file API 请求到 PowerRAG server + + 支持所有ParserType方法对文件进行切片(使用文件路径或URL) + + --- + tags: + - PowerRAG Proxy + security: + - ApiKeyAuth: [] + """ + return await _forward_request("POST", "/split/file", tenant_id) + + +@manager.route("/powerrag/split/file/upload", methods=["POST"]) # noqa: F821 +@token_required +async def split_file_upload_proxy(tenant_id): + """ + 代理 split/file/upload API 请求到 PowerRAG server + + 上传文件并切片,支持所有ParserType方法 + + --- + tags: + - PowerRAG Proxy + security: + - ApiKeyAuth: [] + """ + return await _forward_request("POST", "/split/file/upload", tenant_id) + diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md index f59868396..6f4e37a4b 100644 --- a/powerrag/sdk/README.md +++ b/powerrag/sdk/README.md @@ -290,14 +290,123 @@ status = client.extraction.get_struct_extract_status(task['task_id']) ### 文本切片 -无需上传文档即可对文本进行切片: +无需上传文档即可对文本进行切片。 + +**注意**: `split_text` 方法仅支持以下三种解析器: +- `title`: 基于标题的切片 +- `regex`: 基于正则表达式的切片 +- `smart`: 智能切片 + +对于其他解析器(如 `naive`, `book`, `qa` 等),请使用 `split_file` 或 `split_file_upload` 方法。 
```python +# 使用 title 解析器 result = client.chunk.split_text( text="# Title\n\nContent...", parser_id="title", config={"chunk_token_num": 512} ) + +# 使用 regex 解析器 +result = client.chunk.split_text( + text="Section 1\n\nContent...", + parser_id="regex", + config={ + "chunk_token_num": 256, + "regex_pattern": r"Section \d+" + } +) + +# 使用 smart 解析器 +result = client.chunk.split_text( + text="Long text content...", + parser_id="smart", + config={"chunk_token_num": 512} +) + +print(f"Total chunks: {result['total_chunks']}") +for chunk in result['chunks']: + print(chunk) +``` + +### 文件切片 + +文件切片支持所有 ParserType 方法,提供三种使用方式: + +#### 方式 1: 使用本地文件路径 + +```python +result = client.chunk.split_file( + file_path="/path/to/document.pdf", + parser_id="book", # 支持所有 ParserType + config={ + "chunk_token_num": 512, + "delimiter": "\n。.;;!!??", + "lang": "Chinese", + "from_page": 0, + "to_page": 100000 + } +) +``` + +#### 方式 2: 使用文件 URL + +```python +result = client.chunk.split_file( + file_url="https://example.com/doc.pdf", + parser_id="naive", + config={ + "chunk_token_num": 256, + "max_file_size": 128 * 1024 * 1024, # 128MB + "download_timeout": 300, # 5分钟 + "head_request_timeout": 30 # 30秒 + } +) +``` + +#### 方式 3: 上传文件并切片 + +```python +result = client.chunk.split_file_upload( + file_path="/path/to/document.pdf", + parser_id="book", + config={ + "chunk_token_num": 512, + "delimiter": "\n。.;;!!??", + "lang": "Chinese" + } +) + +print(f"Total chunks: {result['total_chunks']}") +print(f"Filename: {result['filename']}") +for chunk in result['chunks']: + print(chunk) +``` + +**支持的 ParserType 方法:** +- 基础方法: `naive`, `title`, `regex`, `smart` +- 专业方法: `qa`, `book`, `laws`, `paper`, `manual`, `presentation` +- 特殊格式: `table`, `resume`, `picture`, `one`, `email` +- 高级方法: `knowledge_graph` + +**配置参数说明:** +- `chunk_token_num` (int): 目标分块大小(tokens),默认 512 +- `delimiter` (str): 分隔符字符串,默认 `"\n。.;;!!??"` +- `lang` (str): 语言,默认 `"Chinese"` +- `from_page` (int): PDF 起始页码,默认 0 +- `to_page` (int): PDF 
结束页码,默认 100000 +- `max_file_size` (int): URL 下载的最大文件大小(字节),仅用于 `file_url` 方式 +- `download_timeout` (int): 下载超时时间(秒),仅用于 `file_url` 方式 +- `head_request_timeout` (int): HEAD 请求超时时间(秒),仅用于 `file_url` 方式 + +**返回值结构:** +```python +{ + "parser_id": "book", + "chunks": ["chunk1", "chunk2", ...], # 字符串列表 + "total_chunks": 10, + "filename": "document.pdf" +} ``` ## 核心模块 @@ -558,15 +667,49 @@ client.chunk.delete(kb_id, doc_id, [chunk_id]) # 删除文档的所有切片 client.chunk.delete(kb_id, doc_id, None) -# 文本切片(无需上传文档) +# 文本切片(仅支持 title, regex, smart) result = client.chunk.split_text( text="# Title\n\nLong text to be chunked...", - parser_id="title", # 解析器ID - config={"chunk_token_num": 512} # 自定义配置 + parser_id="title", # 仅支持: title, regex, smart + config={"chunk_token_num": 512} ) print(f"Total chunks: {result['total_chunks']}") for chunk in result['chunks']: - print(chunk['content']) + print(chunk) + +# 文件切片(支持所有ParserType方法) +# 方式1: 使用本地文件路径 +result = client.chunk.split_file( + file_path="/path/to/document.pdf", + parser_id="book", # 支持所有 ParserType + config={ + "chunk_token_num": 512, + "delimiter": "\n。.;;!!??", + "lang": "Chinese" + } +) + +# 方式2: 使用文件URL +result = client.chunk.split_file( + file_url="https://example.com/doc.pdf", + parser_id="naive", + config={ + "chunk_token_num": 256, + "max_file_size": 128 * 1024 * 1024, # 128MB + "download_timeout": 300 + } +) + +# 方式3: 上传文件并切片 +result = client.chunk.split_file_upload( + file_path="/path/to/document.pdf", + parser_id="book", + config={"chunk_token_num": 512} +) +print(f"Total chunks: {result['total_chunks']}") +print(f"Filename: {result['filename']}") +for chunk in result['chunks']: + print(chunk) ``` ### 4. 信息抽取 (Extraction) @@ -894,6 +1037,7 @@ SDK 包含完整的测试套件,覆盖所有功能模块。 # 设置环境变量 export HOST_ADDRESS="http://127.0.0.1:9380" export POWERRAG_API_KEY="your-api-key" +export PYTHONPATH=$(pwd) # 运行测试 pytest powerrag/sdk/tests/ @@ -1203,6 +1347,47 @@ for result in results: # 重新解析或删除 ``` +### Q: 文本切片和文件切片有什么区别?应该使用哪个? 
+ +A: +- **`split_text`**: 仅支持 `title`, `regex`, `smart` 三种解析器,适用于纯文本内容(Markdown格式) +- **`split_file`**: 支持所有 ParserType 方法,适用于文件(通过路径或URL) +- **`split_file_upload`**: 支持所有 ParserType 方法,适用于文件上传 + +**使用建议:** +- 如果只有文本内容且需要使用 `title`/`regex`/`smart`,使用 `split_text` +- 如果有文件且需要使用其他解析器(如 `book`, `qa`, `naive` 等),使用 `split_file` 或 `split_file_upload` +- 如果文件在本地,使用 `split_file(file_path=...)` 或 `split_file_upload` +- 如果文件在远程URL,使用 `split_file(file_url=...)` + +**示例:** +```python +# 文本切片(仅支持 title, regex, smart) +result = client.chunk.split_text( + text="# Title\n\nContent...", + parser_id="title" +) + +# 文件切片(支持所有解析器) +# 本地文件 +result = client.chunk.split_file( + file_path="/path/to/doc.pdf", + parser_id="book" # 可以使用任何解析器 +) + +# 远程文件 +result = client.chunk.split_file( + file_url="https://example.com/doc.pdf", + parser_id="naive" +) + +# 文件上传 +result = client.chunk.split_file_upload( + file_path="/path/to/doc.pdf", + parser_id="qa" +) +``` + ### Q: 如何解析无扩展名的文件? A: 使用 `parse_to_md_binary` 方法并使用 `input_type='auto'`(默认值): diff --git a/powerrag/sdk/modules/chunk_manager.py b/powerrag/sdk/modules/chunk_manager.py index 61acccd1b..84bbdf643 100644 --- a/powerrag/sdk/modules/chunk_manager.py +++ b/powerrag/sdk/modules/chunk_manager.py @@ -14,7 +14,8 @@ # limitations under the License. # -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Union +from pathlib import Path from .chunk import ChunkInfo @@ -268,4 +269,169 @@ def split_text( raise Exception(res_json.get("message", "Split text failed")) return res_json.get("data", {}) + + def split_file( + self, + file_path: Optional[str] = None, + file_url: Optional[str] = None, + parser_id: str = "naive", + config: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """ + 文件切片(支持所有ParserType方法) + + 支持三种方式: + 1. 本地文件路径:file_path + 2. 文件URL:file_url + 3. 
文件上传:使用 split_file_upload 方法 + + Args: + file_path: 本地文件路径(可选,与file_url二选一) + file_url: 文件URL(可选,与file_path二选一) + parser_id: 解析器ID,支持所有ParserType: + - naive, qa, book, laws, paper, manual, presentation + - table, resume, picture, one, audio, email, tag + - knowledge_graph, title, regex, smart + 默认"naive" + config: 解析配置(可选) + - chunk_token_num: 目标分块大小(tokens),默认512 + - delimiter: 分隔符字符串,默认"\n。.;;!!??" + - lang: 语言,默认"Chinese" + - from_page: 起始页码,默认0 + - to_page: 结束页码,默认100000 + - max_file_size: URL下载的最大文件大小(字节) + - download_timeout: 下载超时时间(秒) + - head_request_timeout: HEAD请求超时时间(秒) + + Returns: + 切片结果,包含chunks列表、total_chunks数量和filename + + Raises: + Exception: API调用失败 + ValueError: file_path和file_url都未提供 + + Example: + ```python + # 使用本地文件路径 + result = client.chunk.split_file( + file_path="/path/to/document.pdf", + parser_id="book", + config={"chunk_token_num": 512} + ) + + # 使用文件URL + result = client.chunk.split_file( + file_url="https://example.com/doc.pdf", + parser_id="naive", + config={"chunk_token_num": 256} + ) + ``` + """ + if not file_path and not file_url: + raise ValueError("Either file_path or file_url must be provided") + + payload = { + "parser_id": parser_id, + } + + if file_path: + payload["file_path"] = file_path + if file_url: + payload["file_url"] = file_url + + if config: + payload["config"] = config + + url = "/powerrag/split/file" + res = self.client.post(url, json=payload) + + # 检查响应状态码 + if res.status_code != 200: + try: + error_json = res.json() + error_msg = error_json.get("message", f"HTTP {res.status_code}") + except Exception: + error_msg = f"HTTP {res.status_code}: {res.text[:200]}" + raise Exception(error_msg) + + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", "Split file failed")) + + return res_json.get("data", {}) + + def split_file_upload( + self, + file_path: Union[str, Path], + parser_id: str = "naive", + config: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """ + 
上传文件并切片(支持所有ParserType方法) + + Args: + file_path: 本地文件路径 + parser_id: 解析器ID,支持所有ParserType,默认"naive" + config: 解析配置(可选) + - chunk_token_num: 目标分块大小(tokens),默认512 + - delimiter: 分隔符字符串,默认"\n。.;;!!??" + - lang: 语言,默认"Chinese" + - from_page: 起始页码,默认0 + - to_page: 结束页码,默认100000 + + Returns: + 切片结果,包含chunks列表、total_chunks数量和filename + + Raises: + Exception: API调用失败 + FileNotFoundError: 文件不存在 + + Example: + ```python + result = client.chunk.split_file_upload( + file_path="/path/to/document.pdf", + parser_id="book", + config={"chunk_token_num": 512} + ) + print(f"Total chunks: {result['total_chunks']}") + for chunk in result['chunks']: + print(chunk) + ``` + """ + path = Path(file_path) + if not path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + # 准备文件 + with open(path, "rb") as f: + files = [("file", (path.name, f.read()))] + + # 准备表单数据 + form_data = { + "parser_id": parser_id, + } + + if config: + import json + form_data["config"] = json.dumps(config) + + url = "/powerrag/split/file/upload" + res = self.client.post(url, json=None, files=files, data=form_data) + + # 检查响应状态码 + if res.status_code != 200: + try: + error_json = res.json() + error_msg = error_json.get("message", f"HTTP {res.status_code}") + except Exception: + error_msg = f"HTTP {res.status_code}: {res.text[:200]}" + raise Exception(error_msg) + + res_json = res.json() + + if res_json.get("code") != 0: + raise Exception(res_json.get("message", "Split file upload failed")) + + return res_json.get("data", {}) diff --git a/powerrag/sdk/tests/test_chunk.py b/powerrag/sdk/tests/test_chunk.py index 940460201..84f649d6f 100644 --- a/powerrag/sdk/tests/test_chunk.py +++ b/powerrag/sdk/tests/test_chunk.py @@ -156,8 +156,63 @@ def test_split_text_with_config(self, client: PowerRAGClient): text = "This is a test document with multiple paragraphs." 
result = client.chunk.split_text( text=text, - parser_id="naive", + parser_id="regex", config={"chunk_token_num": 128} ) assert "chunks" in result or "total_chunks" in result + + def test_split_text_unsupported_parser(self, client: PowerRAGClient): + """测试不支持的parser_id应该抛出错误""" + text = "Test text" + # 使用一个真正不支持的 parser_id(如 "paper") + # 注意:naive 实际上是被支持的(通过 RAGFlow 代理) + with pytest.raises(Exception) as exc_info: + client.chunk.split_text( + text=text, + parser_id="paper", # paper 不支持纯文本切片,需要文件处理 + config={"chunk_token_num": 128} + ) + assert "not supported" in str(exc_info.value).lower() or "unknown" in str(exc_info.value).lower() or "failed" in str(exc_info.value).lower() + + +class TestChunkSplitFile: + """测试文件切片""" + + def test_split_file_upload(self, client: PowerRAGClient, test_file_path: str): + """测试上传文件并切片""" + result = client.chunk.split_file_upload( + file_path=test_file_path, + parser_id="naive", + config={"chunk_token_num": 512} + ) + assert "chunks" in result + assert "total_chunks" in result + assert "filename" in result + assert isinstance(result["chunks"], list) + assert result["total_chunks"] >= 0 + + def test_split_file_upload_with_different_parsers(self, client: PowerRAGClient, test_file_path: str): + """测试使用不同parser_id的文件切片""" + parsers = ["naive", "book", "title"] + for parser_id in parsers: + try: + result = client.chunk.split_file_upload( + file_path=test_file_path, + parser_id=parser_id, + config={"chunk_token_num": 256} + ) + assert "chunks" in result + assert result["total_chunks"] >= 0 + except Exception as e: + # 某些parser可能不支持特定文件类型,这是正常的 + if "not supported" not in str(e).lower(): + raise + + def test_split_file_upload_nonexistent_file(self, client: PowerRAGClient): + """测试不存在的文件应该抛出错误""" + with pytest.raises(FileNotFoundError): + client.chunk.split_file_upload( + file_path="/nonexistent/file.pdf", + parser_id="naive" + ) diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py index 173b1ddde..947c385d4 
100644 --- a/powerrag/sdk/tests/test_document.py +++ b/powerrag/sdk/tests/test_document.py @@ -217,13 +217,18 @@ def test_parse_to_chunk_async(self, client: PowerRAGClient, kb_id: str, test_fil def test_cancel_parse(self, client: PowerRAGClient, kb_id: str, test_file_path: str): """测试取消解析""" + import time uploaded_docs = client.document.upload(kb_id, test_file_path) doc_id = uploaded_docs[0]["id"] try: client.document.parse_to_chunk(kb_id, [doc_id], wait=False) + # Wait a bit for parsing to start + time.sleep(0.5) client.document.cancel_parse(kb_id, [doc_id]) + # Wait a bit for status update + time.sleep(0.5) doc = client.document.get(kb_id, doc_id) assert doc["run"] in ["CANCEL", "UNSTART"] finally: diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py index b7d7f3221..03c7d08fb 100644 --- a/powerrag/server/routes/powerrag_routes.py +++ b/powerrag/server/routes/powerrag_routes.py @@ -1187,11 +1187,13 @@ async def parse_to_md_upload(tenant_id): "message": f"Request timeout while downloading file from URL. Please try again or increase timeout." }), 408 except requests.exceptions.ConnectionError as e: + # ConnectionError includes DNS resolution failures, invalid URLs, etc. + # Return 400 (Bad Request) instead of 503 (Service Unavailable) for invalid URLs logger.error(f"Connection error downloading file from URL: {file_url}. Error: {e}") return jsonify({ - "code": 503, - "message": f"Failed to connect to file URL. Please check the URL and try again." - }), 503 + "code": 400, + "message": f"Failed to download file from URL: {str(e)}" + }), 400 except requests.exceptions.HTTPError as e: logger.error(f"HTTP error downloading file from URL: {file_url}. 
Error: {e}") return jsonify({ @@ -1402,6 +1404,201 @@ async def split_text(tenant_id): }), 500 +@powerrag_bp.route("/split/file", methods=["POST"]) +@apikey_required +async def split_file(tenant_id): + """ + Split file into chunks using rag/app chunking methods + + Supports all ParserType methods: naive, qa, book, laws, paper, manual, + presentation, table, resume, picture, one, audio, email, tag, knowledge_graph, + title, regex, smart + + Request JSON: + { + "file_path": "/path/to/document.pdf", # or use file_url + "file_url": "https://example.com/doc.pdf", # optional + "parser_id": "naive", + "config": { + "chunk_token_num": 512, + "delimiter": "\n。.;;!!??", + "lang": "Chinese", + "from_page": 0, + "to_page": 100000 + } + } + + Response: + { + "code": 0, + "data": { + "parser_id": "naive", + "chunks": ["chunk1", "chunk2", ...], + "total_chunks": 10, + "filename": "document.pdf" + } + } + """ + try: + data = await request.get_json() + + if not data: + return jsonify({ + "code": 400, + "message": "No JSON data provided" + }), 400 + + file_path = data.get("file_path") + file_url = data.get("file_url") + parser_id = data.get("parser_id", "naive") + config = data.get("config", {}) + + if not file_path and not file_url: + return jsonify({ + "code": 400, + "message": "Either file_path or file_url is required" + }), 400 + + # Handle file URL download + if file_url: + max_file_size = config.get('max_file_size', settings.DOC_MAXIMUM_SIZE) + download_timeout = config.get('download_timeout', DEFAULT_DOWNLOAD_TIMEOUT) + head_timeout = config.get('head_request_timeout', DEFAULT_HEAD_REQUEST_TIMEOUT) + + logger.info(f"Downloading file from URL: {file_url}") + try: + binary, error_msg = download_file_with_validation( + file_url, max_file_size, download_timeout, head_timeout + ) + if error_msg: + return jsonify({ + "code": 400, + "message": f"Failed to download file: {error_msg}" + }), 400 + + # Extract filename from URL or use provided filename + filename = 
config.get('filename') or file_url.split('/')[-1].split('?')[0] + if not filename: + filename = "downloaded_file" + except Exception as e: + logger.error(f"Error downloading file from URL: {e}", exc_info=True) + return jsonify({ + "code": 500, + "message": f"Failed to download file: {str(e)}" + }), 500 + else: + # Use file path + filename = file_path + binary = None + + service = PowerRAGSplitService() + result = service.split_file(filename=filename, binary=binary, parser_id=parser_id, config=config) + + return jsonify({ + "code": 0, + "data": result, + "message": "success" + }), 200 + + except Exception as e: + logger.error(f"Split file error: {e}", exc_info=True) + return jsonify({ + "code": 500, + "message": str(e) + }), 500 + + +@powerrag_bp.route("/split/file/upload", methods=["POST"]) +@apikey_required +async def split_file_upload(tenant_id): + """ + Split uploaded file into chunks using rag/app chunking methods + + Supports all ParserType methods: naive, qa, book, laws, paper, manual, + presentation, table, resume, picture, one, audio, email, tag, knowledge_graph, + title, regex, smart + + Request (multipart/form-data): + - file: File to split (required) + - parser_id: Parser ID (optional, default: "naive") + - config: JSON string of parser config (optional) + + Config parameters: + - chunk_token_num (int): Target chunk size in tokens (default: 512) + - delimiter (str): Delimiter string for splitting (default: "\n。.;;!!??") + - lang (str): Language (default: "Chinese") + - from_page (int): Start page number (default: 0) + - to_page (int): End page number (default: 100000) + + Response: + { + "code": 0, + "data": { + "parser_id": "naive", + "chunks": ["chunk1", "chunk2", ...], + "total_chunks": 10, + "filename": "document.pdf" + } + } + """ + try: + # Check if file is present + files = await request.files + if 'file' not in files: + return jsonify({ + "code": 400, + "message": "No file provided" + }), 400 + + file = files['file'] + if file.filename == '': + 
return jsonify({ + "code": 400, + "message": "No file selected" + }), 400 + + # Get parameters + form = await request.form + parser_id = form.get('parser_id', 'naive') + + # Parse config from JSON string if provided + import json + config_str = form.get('config', '{}') + try: + config = json.loads(config_str) + except json.JSONDecodeError: + return jsonify({ + "code": 400, + "message": "Invalid JSON in config parameter" + }), 400 + + filename = file.filename + + # Read file binary (file.read() is synchronous in Quart) + binary = file.read() + if not binary: + return jsonify({ + "code": 400, + "message": "File is empty" + }), 400 + + service = PowerRAGSplitService() + result = service.split_file(filename=filename, binary=binary, parser_id=parser_id, config=config) + + return jsonify({ + "code": 0, + "data": result, + "message": "success" + }), 200 + + except Exception as e: + logger.error(f"Split file upload error: {e}", exc_info=True) + return jsonify({ + "code": 500, + "message": str(e) + }), 500 + + # ============================================================================ # 信息抽取接口 # ============================================================================ diff --git a/powerrag/server/services/split_service.py b/powerrag/server/services/split_service.py index e4b535938..76b8b6ef4 100644 --- a/powerrag/server/services/split_service.py +++ b/powerrag/server/services/split_service.py @@ -50,12 +50,13 @@ class PowerRAGSplitService: def __init__(self): # 初始化时动态导入chunker,避免循环导入 self._init_chunker_factory() + self._init_file_chunker_factory() def _init_chunker_factory(self): """动态导入chunker模块,避免循环导入""" global CHUNKER_FACTORY if not CHUNKER_FACTORY: - # 直接引用同一模块中定义的函数 + # PowerRAG 专门的 chunker(仅支持文本切分) CHUNKER_FACTORY.update({ ParserType.TITLE.value: title_based_chunking, # PowerRAG Title Chunker ParserType.REGEX.value: regex_based_chunking, # PowerRAG regex Chunker @@ -158,9 +159,14 @@ def dummy(prog=None, msg=""): # Smart chunking returns a list of chunks directly 
chunks = chunker(text, parser_config=parser_config) else: - # Use config as-is for other parsers - chunks=[] - raise ValueError(f"Chunker not found for parser_id: {parser_id}") + # Other parser types (naive, qa, book, laws, etc.) are not supported for text splitting + # Use split_file method instead for file-based chunking + raise ValueError( + f"Parser '{parser_id}' is not supported for text splitting. " + f"Supported parsers for text splitting are: {ParserType.TITLE.value}, " + f"{ParserType.REGEX.value}, {ParserType.SMART.value}. " + f"For other parser types, please use split_file() method instead." + ) # Ensure all chunks are strings and handle encoding processed_chunks = [] @@ -196,6 +202,169 @@ def dummy(prog=None, msg=""): logger.error(f"Error splitting text with parser '{parser_id}': {e}", exc_info=True) raise + def _init_file_chunker_factory(self): + """初始化文件 chunker factory,映射 ParserType 到 rag/app 模块""" + # 延迟导入,避免循环导入 + if not hasattr(self, '_file_chunker_factory'): + self._file_chunker_factory = {} + try: + # 导入 rag/app 模块 + from rag.app import ( + laws, paper, presentation, manual, qa, table, book, resume, + picture, naive, one, audio, email, tag + ) + # 导入 powerrag/app 模块 + from powerrag.app import title as powerrag_title, regex as powerrag_regex, smart as powerrag_smart + + # 映射 ParserType 到对应的 chunk 模块 + self._file_chunker_factory = { + ParserType.NAIVE.value: naive, + ParserType.PAPER.value: paper, + ParserType.BOOK.value: book, + ParserType.PRESENTATION.value: presentation, + ParserType.MANUAL.value: manual, + ParserType.LAWS.value: laws, + ParserType.QA.value: qa, + ParserType.TABLE.value: table, + ParserType.RESUME.value: resume, + ParserType.PICTURE.value: picture, + ParserType.ONE.value: one, + ParserType.EMAIL.value: email, + ParserType.KG.value: naive, # knowledge_graph 使用 naive + ParserType.TAG.value: tag, + ParserType.TITLE.value: powerrag_title, # PowerRAG Title Parser + ParserType.REGEX.value: powerrag_regex, # PowerRAG Regex Parser + 
ParserType.SMART.value: powerrag_smart, # PowerRAG Smart Parser + } + except ImportError as e: + logger.warning(f"Failed to import some rag/app modules: {e}") + # 如果导入失败,至少提供基本的 naive chunker + try: + from rag.app import naive + self._file_chunker_factory = {ParserType.NAIVE.value: naive} + except ImportError: + logger.error("Failed to import naive chunker, file splitting will not work") + self._file_chunker_factory = {} + + def split_file(self, filename: str = None, binary: bytes = None, parser_id: str = "naive", + config: Dict[str, Any] = None) -> Dict[str, Any]: + """ + Split file into chunks using rag/app chunking methods + + Args: + filename: File path (optional if binary is provided) + binary: File binary content (optional if filename is provided) + parser_id: Parser/chunker ID (e.g., "naive", "book", "title") + config: Chunking configuration (optional) + + Returns: + Dict containing chunks and metadata + + Example: + ```python + service = PowerRAGSplitService() + + # Using file path + result = service.split_file( + filename="/path/to/document.pdf", + parser_id="book", + config={"chunk_token_num": 512} + ) + + # Using binary + with open("document.pdf", "rb") as f: + binary = f.read() + result = service.split_file( + filename="document.pdf", + binary=binary, + parser_id="naive", + config={"chunk_token_num": 256} + ) + ``` + """ + if not filename and not binary: + raise ValueError("Either filename or binary must be provided") + + if filename and not binary: + # Read file from path + with open(filename, "rb") as f: + binary = f.read() + + if not filename: + # Generate a temporary filename from binary or use default + filename = "temp_file" + + if config is None: + config = {} + + # Get chunker module + chunker_module = self._file_chunker_factory.get(parser_id.lower()) + if not chunker_module: + logger.warning(f"Chunker '{parser_id}' not found in file chunker factory, using naive") + chunker_module = self._file_chunker_factory.get(ParserType.NAIVE.value) + if not 
chunker_module: + raise ValueError(f"Chunker '{parser_id}' not found and naive chunker not available") + + # Prepare callback + def dummy(prog=None, msg=""): + """Dummy callback for progress""" + pass + + # Build parser_config from config + parser_config = config.copy() + parser_config.setdefault("chunk_token_num", 512) + parser_config.setdefault("delimiter", "\n。.;;!!??") + + # Build kwargs + kwargs = { + "lang": config.get("lang", "Chinese"), + "callback": dummy, + "parser_config": parser_config, + "from_page": config.get("from_page", 0), + "to_page": config.get("to_page", 100000), + } + + # Add optional fields + if config.get("tenant_id"): + kwargs["tenant_id"] = config["tenant_id"] + if config.get("kb_id"): + kwargs["kb_id"] = config["kb_id"] + if config.get("doc_id"): + kwargs["doc_id"] = config["doc_id"] + + try: + # Call chunk function + logger.info(f"Calling chunk function for parser '{parser_id}' on file '{filename}'") + tokenized_chunks = chunker_module.chunk(filename, binary=binary, **kwargs) + + # Extract text content from tokenized chunks + chunks = [] + for chunk_dict in tokenized_chunks: + if isinstance(chunk_dict, dict): + # Extract content_with_weight or content field + content = chunk_dict.get("content_with_weight") or chunk_dict.get("content", "") + if content: + chunks.append(content) + elif isinstance(chunk_dict, str): + chunks.append(chunk_dict) + + logger.info(f"Split file '{filename}' with parser '{parser_id}': {len(chunks)} chunks") + + return { + "parser_id": parser_id, + "chunks": chunks, + "total_chunks": len(chunks), + "filename": filename, + "metadata": { + "chunker": "rag/app", + "config": config + } + } + + except Exception as e: + logger.error(f"Error splitting file '{filename}' with parser '{parser_id}': {e}", exc_info=True) + raise + # ============================================== # Shared utility functions for chunking From 537e40acdbcbd4966b860a0d935297852c5aa92c Mon Sep 17 00:00:00 2001 From: zhanggan7723 Date: Tue, 24 Feb 
2026 20:15:11 +0800 Subject: [PATCH 18/19] Update powerrag/server/services/split_service.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- powerrag/server/services/split_service.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/powerrag/server/services/split_service.py b/powerrag/server/services/split_service.py index 76b8b6ef4..832287546 100644 --- a/powerrag/server/services/split_service.py +++ b/powerrag/server/services/split_service.py @@ -287,8 +287,13 @@ def split_file(self, filename: str = None, binary: bytes = None, parser_id: str if filename and not binary: # Read file from path - with open(filename, "rb") as f: - binary = f.read() + try: + with open(filename, "rb") as f: + binary = f.read() + except FileNotFoundError as e: + raise FileNotFoundError( + f"Failed to open file '{filename}' for splitting" + ) from e if not filename: # Generate a temporary filename from binary or use default From 0cb7666c8369a49b32ee0d683dd0d1ed6f6994a4 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Tue, 24 Feb 2026 20:50:29 +0800 Subject: [PATCH 19/19] feat(sdk): enhance file splitting functionality and improve error handling --- api/apps/sdk/powerrag_proxy.py | 7 ++- powerrag/sdk/README.md | 1 - powerrag/sdk/modules/chunk_manager.py | 4 +- powerrag/sdk/tests/test_chunk.py | 73 +++++++++++++++++++++-- powerrag/sdk/tests/test_document.py | 41 +++++++++---- powerrag/server/routes/powerrag_routes.py | 53 +++++++++------- powerrag/server/services/split_service.py | 32 +++++----- 7 files changed, 151 insertions(+), 60 deletions(-) diff --git a/api/apps/sdk/powerrag_proxy.py b/api/apps/sdk/powerrag_proxy.py index f0fffb811..065071c34 100644 --- a/api/apps/sdk/powerrag_proxy.py +++ b/api/apps/sdk/powerrag_proxy.py @@ -21,8 +21,11 @@ 这样 SDK 可以通过主 RAGFlow 服务访问 PowerRAG 功能,无需直接连接到 PowerRAG server """ -import os +import asyncio import logging +import os +from io import BytesIO + import httpx from quart import request, jsonify from 
api.utils.api_utils import token_required, get_error_data_result @@ -91,8 +94,6 @@ async def _forward_request(method: str, endpoint: str, tenant_id: str = None): if files_dict: # 保留文件名信息!重要:不能直接 dict(files_dict) # 因为会丢失文件名。需要构造 httpx 期望的格式 - import asyncio - from io import BytesIO files = {} for field_name, file_storage in files_dict.items(): # 在线程中读取文件内容(避免阻塞事件循环) diff --git a/powerrag/sdk/README.md b/powerrag/sdk/README.md index 6f4e37a4b..825a134f5 100644 --- a/powerrag/sdk/README.md +++ b/powerrag/sdk/README.md @@ -387,7 +387,6 @@ for chunk in result['chunks']: - 基础方法: `naive`, `title`, `regex`, `smart` - 专业方法: `qa`, `book`, `laws`, `paper`, `manual`, `presentation` - 特殊格式: `table`, `resume`, `picture`, `one`, `email` -- 高级方法: `knowledge_graph` **配置参数说明:** - `chunk_token_num` (int): 目标分块大小(tokens),默认 512 diff --git a/powerrag/sdk/modules/chunk_manager.py b/powerrag/sdk/modules/chunk_manager.py index 84bbdf643..45c41b64e 100644 --- a/powerrag/sdk/modules/chunk_manager.py +++ b/powerrag/sdk/modules/chunk_manager.py @@ -14,8 +14,9 @@ # limitations under the License. 
# -from typing import Optional, List, Dict, Any, Union +import json from pathlib import Path +from typing import Optional, List, Dict, Any, Union from .chunk import ChunkInfo @@ -413,7 +414,6 @@ def split_file_upload( } if config: - import json form_data["config"] = json.dumps(config) url = "/powerrag/split/file/upload" diff --git a/powerrag/sdk/tests/test_chunk.py b/powerrag/sdk/tests/test_chunk.py index 84f649d6f..7b7861e90 100644 --- a/powerrag/sdk/tests/test_chunk.py +++ b/powerrag/sdk/tests/test_chunk.py @@ -165,19 +165,81 @@ def test_split_text_unsupported_parser(self, client: PowerRAGClient): """测试不支持的parser_id应该抛出错误""" text = "Test text" # 使用一个真正不支持的 parser_id(如 "paper") - # 注意:naive 实际上是被支持的(通过 RAGFlow 代理) with pytest.raises(Exception) as exc_info: client.chunk.split_text( text=text, parser_id="paper", # paper 不支持纯文本切片,需要文件处理 config={"chunk_token_num": 128} ) - assert "not supported" in str(exc_info.value).lower() or "unknown" in str(exc_info.value).lower() or "failed" in str(exc_info.value).lower() + error_msg = str(exc_info.value).lower() + # PowerRAG: "not supported for text splitting" + "split_file" + # RAGFlow proxy: "unknown chunker" + "supported text chunkers" + assert ( + ("not supported for text splitting" in error_msg and "split_file" in error_msg) + or ("unknown chunker" in error_msg and "paper" in error_msg) + ) class TestChunkSplitFile: """测试文件切片""" - + + def test_split_file_with_file_path(self, client: PowerRAGClient, test_file_path: str): + """测试使用 file_path 参数的文件切片(服务器需能访问该路径)""" + result = client.chunk.split_file( + file_path=test_file_path, + parser_id="naive", + config={"chunk_token_num": 512}, + ) + assert "chunks" in result + assert "total_chunks" in result + assert "filename" in result + assert isinstance(result["chunks"], list) + assert result["total_chunks"] >= 0 + + def test_split_file_with_file_url(self, client: PowerRAGClient): + """测试使用 file_url 参数的文件切片""" + # 使用 httpbin.org(HTTP 避免 SSL 证书问题,返回 HTML) + result = 
client.chunk.split_file( + file_url="http://httpbin.org/html", + parser_id="naive", + config={"chunk_token_num": 512, "filename": "example.html"}, + ) + assert "chunks" in result + assert "total_chunks" in result + assert "filename" in result + assert isinstance(result["chunks"], list) + assert result["total_chunks"] >= 0 + + def test_split_file_missing_both(self, client: PowerRAGClient): + """测试 file_path 和 file_url 都未提供时应抛出错误""" + with pytest.raises(ValueError) as exc_info: + client.chunk.split_file(parser_id="naive") + assert "file_path" in str(exc_info.value).lower() or "file_url" in str(exc_info.value).lower() + assert "must be provided" in str(exc_info.value).lower() + + def test_split_file_invalid_url(self, client: PowerRAGClient): + """测试无效 URL 应抛出错误""" + with pytest.raises(Exception) as exc_info: + client.chunk.split_file( + file_url="http://invalid-domain-does-not-exist-12345.example.com/file.pdf", + parser_id="naive", + config={"download_timeout": 5}, + ) + error_msg = str(exc_info.value).lower() + assert "download" in error_msg or "failed" in error_msg or "connection" in error_msg + + def test_split_file_size_limit_exceeded(self, client: PowerRAGClient): + """测试超过大小限制的 URL 应抛出错误""" + # httpbin.org/bytes/1000000 返回 1MB,max_file_size=1024 应触发限制 + with pytest.raises(Exception) as exc_info: + client.chunk.split_file( + file_url="https://httpbin.org/bytes/1000000", + parser_id="naive", + config={"max_file_size": 1024}, + ) + error_msg = str(exc_info.value).lower() + assert "exceeds" in error_msg or "size" in error_msg or "limit" in error_msg + def test_split_file_upload(self, client: PowerRAGClient, test_file_path: str): """测试上传文件并切片""" result = client.chunk.split_file_upload( @@ -210,9 +272,12 @@ def test_split_file_upload_with_different_parsers(self, client: PowerRAGClient, def test_split_file_upload_nonexistent_file(self, client: PowerRAGClient): """测试不存在的文件应该抛出错误""" - with pytest.raises(FileNotFoundError): + with pytest.raises((FileNotFoundError, 
Exception)) as exc_info: client.chunk.split_file_upload( file_path="/nonexistent/file.pdf", parser_id="naive" ) + # SDK raises FileNotFoundError locally; API may wrap in generic Exception + error_msg = str(exc_info.value).lower() + assert "not found" in error_msg or "no such file" in error_msg diff --git a/powerrag/sdk/tests/test_document.py b/powerrag/sdk/tests/test_document.py index 947c385d4..69dac99ff 100644 --- a/powerrag/sdk/tests/test_document.py +++ b/powerrag/sdk/tests/test_document.py @@ -14,6 +14,8 @@ # limitations under the License. # +import time + import pytest from powerrag.sdk import PowerRAGClient @@ -217,20 +219,33 @@ def test_parse_to_chunk_async(self, client: PowerRAGClient, kb_id: str, test_fil def test_cancel_parse(self, client: PowerRAGClient, kb_id: str, test_file_path: str): """测试取消解析""" - import time uploaded_docs = client.document.upload(kb_id, test_file_path) doc_id = uploaded_docs[0]["id"] - + try: client.document.parse_to_chunk(kb_id, [doc_id], wait=False) - # Wait a bit for parsing to start - time.sleep(0.5) + + # Poll for parsing to start (RUNNING or SCHEDULE), timeout 10s + for _ in range(100): + doc = client.document.get(kb_id, doc_id) + if doc["run"] in ["RUNNING", "1", "SCHEDULE"]: + break + time.sleep(0.1) + else: + pytest.fail("Parsing did not start within 10s") + client.document.cancel_parse(kb_id, [doc_id]) - - # Wait a bit for status update - time.sleep(0.5) - doc = client.document.get(kb_id, doc_id) - assert doc["run"] in ["CANCEL", "UNSTART"] + + # Poll for cancel to propagate, timeout 5s + for _ in range(50): + doc = client.document.get(kb_id, doc_id) + if doc["run"] in ["CANCEL", "UNSTART", "2", "0"]: + break + time.sleep(0.1) + else: + pytest.fail("Cancel did not propagate within 5s") + + assert doc["run"] in ["CANCEL", "UNSTART", "2", "0"] finally: client.document.delete(kb_id, [doc_id]) @@ -959,12 +974,12 @@ def test_parse_from_invalid_url(self, client: PowerRAGClient): } ) - # 应该返回 400 错误 - assert 
response.status_code == 400 + # 无效 URL(连接失败)返回 502 (Bad Gateway) + assert response.status_code == 502 result = response.json() - assert result["code"] == 400 + assert result["code"] == 502 assert "Failed to download" in result["message"] - + def test_parse_cannot_provide_both_file_and_url(self, client: PowerRAGClient, tmp_path): """测试不能同时提供 file 和 file_url""" import requests diff --git a/powerrag/server/routes/powerrag_routes.py b/powerrag/server/routes/powerrag_routes.py index 03c7d08fb..c6acc82cb 100644 --- a/powerrag/server/routes/powerrag_routes.py +++ b/powerrag/server/routes/powerrag_routes.py @@ -1187,13 +1187,13 @@ async def parse_to_md_upload(tenant_id): "message": f"Request timeout while downloading file from URL. Please try again or increase timeout." }), 408 except requests.exceptions.ConnectionError as e: - # ConnectionError includes DNS resolution failures, invalid URLs, etc. - # Return 400 (Bad Request) instead of 503 (Service Unavailable) for invalid URLs + # ConnectionError: DNS failure, invalid URL, network unreachable, connection refused. + # 502 (Bad Gateway) indicates we could not reach the upstream URL. logger.error(f"Connection error downloading file from URL: {file_url}. Error: {e}") return jsonify({ - "code": 400, + "code": 502, "message": f"Failed to download file from URL: {str(e)}" - }), 400 + }), 502 except requests.exceptions.HTTPError as e: logger.error(f"HTTP error downloading file from URL: {file_url}. Error: {e}") return jsonify({ @@ -1201,11 +1201,12 @@ async def parse_to_md_upload(tenant_id): "message": f"HTTP error while downloading file: {str(e)}" }), 502 except requests.exceptions.RequestException as e: + # Catch-all for other request errors (e.g. TooManyRedirects) logger.error(f"Request error downloading file from URL: {file_url}. 
Error: {e}") return jsonify({ - "code": 400, + "code": 502, "message": f"Failed to download file from URL: {str(e)}" - }), 400 + }), 502 except Exception as e: logger.error(f"Unexpected error downloading file from URL: {file_url}. Error: {e}", exc_info=True) return jsonify({ @@ -1409,11 +1410,11 @@ async def split_text(tenant_id): async def split_file(tenant_id): """ Split file into chunks using rag/app chunking methods - - Supports all ParserType methods: naive, qa, book, laws, paper, manual, + + Supports all ParserType methods: naive, qa, book, laws, paper, manual, presentation, table, resume, picture, one, audio, email, tag, knowledge_graph, - title, regex, smart - + title, regex, smart. + Request JSON: { "file_path": "/path/to/document.pdf", # or use file_url @@ -1492,14 +1493,20 @@ async def split_file(tenant_id): binary = None service = PowerRAGSplitService() - result = service.split_file(filename=filename, binary=binary, parser_id=parser_id, config=config) - + result = service.split_file( + filename=filename, + binary=binary, + parser_id=parser_id, + config=config, + tenant_id=tenant_id, + ) + return jsonify({ "code": 0, "data": result, "message": "success" }), 200 - + except Exception as e: logger.error(f"Split file error: {e}", exc_info=True) return jsonify({ @@ -1513,11 +1520,11 @@ async def split_file(tenant_id): async def split_file_upload(tenant_id): """ Split uploaded file into chunks using rag/app chunking methods - - Supports all ParserType methods: naive, qa, book, laws, paper, manual, + + Supports all ParserType methods: naive, qa, book, laws, paper, manual, presentation, table, resume, picture, one, audio, email, tag, knowledge_graph, - title, regex, smart - + title, regex, smart. 
+ Request (multipart/form-data): - file: File to split (required) - parser_id: Parser ID (optional, default: "naive") @@ -1583,14 +1590,20 @@ async def split_file_upload(tenant_id): }), 400 service = PowerRAGSplitService() - result = service.split_file(filename=filename, binary=binary, parser_id=parser_id, config=config) - + result = service.split_file( + filename=filename, + binary=binary, + parser_id=parser_id, + config=config, + tenant_id=tenant_id, + ) + return jsonify({ "code": 0, "data": result, "message": "success" }), 200 - + except Exception as e: logger.error(f"Split file upload error: {e}", exc_info=True) return jsonify({ diff --git a/powerrag/server/services/split_service.py b/powerrag/server/services/split_service.py index 76b8b6ef4..5b32f707b 100644 --- a/powerrag/server/services/split_service.py +++ b/powerrag/server/services/split_service.py @@ -34,6 +34,12 @@ logger = logging.getLogger(__name__) + +def _dummy_callback(prog=None, msg=""): + """No-op callback for parser progress; used when progress reporting is not needed.""" + pass + + # Chunker Factory - mapping parser_id to chunking module CHUNKER_FACTORY = {} @@ -118,11 +124,6 @@ def split_text(self, text: str, parser_id: str = "title", config: Dict[str, Any] chunker = self._get_chunker(parser_id) logger.info(f"Using chunker: {parser_id} for text splitting") - # Prepare callback - def dummy(prog=None, msg=""): - """Dummy callback for progress""" - pass - # Build parser_config based on parser_id if parser_id == ParserType.TITLE.value: # Title parser specific config @@ -246,16 +247,17 @@ def _init_file_chunker_factory(self): logger.error("Failed to import naive chunker, file splitting will not work") self._file_chunker_factory = {} - def split_file(self, filename: str = None, binary: bytes = None, parser_id: str = "naive", - config: Dict[str, Any] = None) -> Dict[str, Any]: + def split_file(self, filename: str = None, binary: bytes = None, parser_id: str = "naive", + config: Dict[str, Any] = None, 
tenant_id: str = None) -> Dict[str, Any]: """ Split file into chunks using rag/app chunking methods - + Args: filename: File path (optional if binary is provided) binary: File binary content (optional if filename is provided) parser_id: Parser/chunker ID (e.g., "naive", "book", "title") config: Chunking configuration (optional) + tenant_id: Tenant ID (required for audio and picture parsers; used for LLM model lookup) Returns: Dict containing chunks and metadata @@ -305,11 +307,6 @@ def split_file(self, filename: str = None, binary: bytes = None, parser_id: str if not chunker_module: raise ValueError(f"Chunker '{parser_id}' not found and naive chunker not available") - # Prepare callback - def dummy(prog=None, msg=""): - """Dummy callback for progress""" - pass - # Build parser_config from config parser_config = config.copy() parser_config.setdefault("chunk_token_num", 512) @@ -318,15 +315,16 @@ def dummy(prog=None, msg=""): # Build kwargs kwargs = { "lang": config.get("lang", "Chinese"), - "callback": dummy, + "callback": _dummy_callback, "parser_config": parser_config, "from_page": config.get("from_page", 0), "to_page": config.get("to_page", 100000), } - + + if tenant_id: + kwargs["tenant_id"] = tenant_id + # Add optional fields - if config.get("tenant_id"): - kwargs["tenant_id"] = config["tenant_id"] if config.get("kb_id"): kwargs["kb_id"] = config["kb_id"] if config.get("doc_id"):