Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
352fa8f
feat:add PowerRAG SDK and API Proxy
Zhangg7723 Dec 31, 2025
1f60049
Merge branch 'oceanbase:main' into powerrag_sdk_api
Zhangg7723 Jan 4, 2026
8b735e4
Merge branch 'oceanbase:main' into powerrag_sdk_api
Zhangg7723 Jan 5, 2026
f8d2bd5
feat: add GitHub Actions workflow for Python package publishing and i…
Zhangg7723 Jan 5, 2026
35ba7e2
chore: update GitHub Actions workflow for SDK publishing and refine p…
Zhangg7723 Jan 6, 2026
6325328
chore: update Python version requirement in pyproject.toml to support…
Zhangg7723 Jan 6, 2026
79d1294
chore: add environment configuration for PyPI in GitHub Actions workflow
Zhangg7723 Jan 6, 2026
3dbcd5d
Merge branch 'oceanbase:main' into powerrag_sdk_api
Zhangg7723 Jan 6, 2026
16a0245
docs: update SDK README.md
Zhangg7723 Jan 6, 2026
e160ce0
Merge branch 'oceanbase:main' into powerrag_sdk_api
Zhangg7723 Jan 12, 2026
f15d0ef
feat(document): add binary file parsing to Markdown method
Zhangg7723 Jan 12, 2026
7732022
refactor(document_manager): centralize parse to markdown upload logic
Zhangg7723 Jan 13, 2026
33a0520
refactor(init): remove module docstring and __all__ exports
Zhangg7723 Jan 13, 2026
23dde0b
Merge branch 'oceanbase:main' into powerrag_sdk_api
Zhangg7723 Jan 20, 2026
399d17a
chore(docker): add GOTENBERG server environment variables
Zhangg7723 Jan 20, 2026
f28c272
Update docker/.env.example
whhe Jan 21, 2026
d92c91d
Merge branch 'oceanbase:main' into powerrag_sdk_api
Zhangg7723 Jan 23, 2026
e9780ed
feat(document): add input_type parameter for file type detection
Zhangg7723 Jan 23, 2026
045ad0a
feat(document): add support for file_url to parse documents from URL
Zhangg7723 Jan 23, 2026
5cd36ad
Update powerrag/utils/file_utils.py
Zhangg7723 Jan 26, 2026
5635866
Update powerrag/utils/file_utils.py
Zhangg7723 Jan 26, 2026
2b3496a
refactor(document): update README and code to use 'content' instead o…
Zhangg7723 Jan 27, 2026
6a1564d
Merge branch 'oceanbase:main' into powerrag_sdk_api
Zhangg7723 Jan 28, 2026
15d378b
feat(sdk): implement file splitting functionality and enhance documen…
Zhangg7723 Jan 28, 2026
537e40a
Update powerrag/server/services/split_service.py
Zhangg7723 Feb 24, 2026
0cb7666
feat(sdk): enhance file splitting functionality and improve error han…
Zhangg7723 Feb 24, 2026
372e38d
Merge branch 'powerrag_sdk_api' of https://github.com/Zhangg7723/powe…
Zhangg7723 Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 45 additions & 3 deletions api/apps/sdk/powerrag_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@
这样 SDK 可以通过主 RAGFlow 服务访问 PowerRAG 功能,无需直接连接到 PowerRAG server
"""

import os
import asyncio
import logging
import os
from io import BytesIO

import httpx
from quart import request, jsonify
from api.utils.api_utils import token_required, get_error_data_result
Expand Down Expand Up @@ -93,10 +96,15 @@ async def _forward_request(method: str, endpoint: str, tenant_id: str = None):
# 因为会丢失文件名。需要构造 httpx 期望的格式
files = {}
for field_name, file_storage in files_dict.items():
# httpx 期望格式: (filename, content, content_type)
# 在线程中读取文件内容(避免阻塞事件循环)
# httpx 期望文件对象或元组格式
# 使用 BytesIO 将 bytes 包装成文件对象
file_content = await asyncio.to_thread(file_storage.read)
# httpx 期望格式: (filename, file_object, content_type) 或 (filename, file_object)
file_obj = BytesIO(file_content)
files[field_name] = (
file_storage.filename,
file_storage.read(),
file_obj,
file_storage.content_type or 'application/octet-stream'
)
except Exception:
Expand Down Expand Up @@ -592,3 +600,37 @@ async def parse_to_md_upload_proxy(tenant_id):
"""
return await _forward_request("POST", "/parse_to_md/upload", tenant_id)


@manager.route("/powerrag/split/file", methods=["POST"])  # noqa: F821
@token_required
async def split_file_proxy(tenant_id):
    """
    Proxy a split/file API request to the PowerRAG server.

    Splits a file (referenced by local path or URL) into chunks; every
    ParserType method is supported on this endpoint.

    ---
    tags:
      - PowerRAG Proxy
    security:
      - ApiKeyAuth: []
    """
    # Delegate entirely to the shared forwarding helper.
    response = await _forward_request("POST", "/split/file", tenant_id)
    return response


@manager.route("/powerrag/split/file/upload", methods=["POST"])  # noqa: F821
@token_required
async def split_file_upload_proxy(tenant_id):
    """
    Proxy a split/file/upload API request to the PowerRAG server.

    Accepts an uploaded file and splits it into chunks; every ParserType
    method is supported on this endpoint.

    ---
    tags:
      - PowerRAG Proxy
    security:
      - ApiKeyAuth: []
    """
    # Delegate entirely to the shared forwarding helper.
    response = await _forward_request("POST", "/split/file/upload", tenant_id)
    return response

194 changes: 189 additions & 5 deletions powerrag/sdk/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -290,14 +290,122 @@ status = client.extraction.get_struct_extract_status(task['task_id'])

### 文本切片

无需上传文档即可对文本进行切片:
无需上传文档即可对文本进行切片。

**注意**: `split_text` 方法仅支持以下三种解析器:
- `title`: 基于标题的切片
- `regex`: 基于正则表达式的切片
- `smart`: 智能切片

对于其他解析器(如 `naive`, `book`, `qa` 等),请使用 `split_file` 或 `split_file_upload` 方法。

```python
# 使用 title 解析器
result = client.chunk.split_text(
text="# Title\n\nContent...",
parser_id="title",
config={"chunk_token_num": 512}
)

# 使用 regex 解析器
result = client.chunk.split_text(
text="Section 1\n\nContent...",
parser_id="regex",
config={
"chunk_token_num": 256,
"regex_pattern": r"Section \d+"
}
)

# 使用 smart 解析器
result = client.chunk.split_text(
text="Long text content...",
parser_id="smart",
config={"chunk_token_num": 512}
)

print(f"Total chunks: {result['total_chunks']}")
for chunk in result['chunks']:
print(chunk)
```

### 文件切片

文件切片支持所有 ParserType 方法,提供三种使用方式:

#### 方式 1: 使用本地文件路径

```python
result = client.chunk.split_file(
file_path="/path/to/document.pdf",
parser_id="book", # 支持所有 ParserType
config={
"chunk_token_num": 512,
"delimiter": "\n。.;;!!??",
"lang": "Chinese",
"from_page": 0,
"to_page": 100000
}
)
```

#### 方式 2: 使用文件 URL

```python
result = client.chunk.split_file(
file_url="https://example.com/doc.pdf",
parser_id="naive",
config={
"chunk_token_num": 256,
"max_file_size": 128 * 1024 * 1024, # 128MB
"download_timeout": 300, # 5分钟
"head_request_timeout": 30 # 30秒
}
)
```

#### 方式 3: 上传文件并切片

```python
result = client.chunk.split_file_upload(
file_path="/path/to/document.pdf",
parser_id="book",
config={
"chunk_token_num": 512,
"delimiter": "\n。.;;!!??",
"lang": "Chinese"
}
)

print(f"Total chunks: {result['total_chunks']}")
print(f"Filename: {result['filename']}")
for chunk in result['chunks']:
print(chunk)
```

**支持的 ParserType 方法:**
- 基础方法: `naive`, `title`, `regex`, `smart`
- 专业方法: `qa`, `book`, `laws`, `paper`, `manual`, `presentation`
- 特殊格式: `table`, `resume`, `picture`, `one`, `email`

**配置参数说明:**
- `chunk_token_num` (int): 目标分块大小(tokens),默认 512
- `delimiter` (str): 分隔符字符串,默认 `"\n。.;;!!??"`
- `lang` (str): 语言,默认 `"Chinese"`
- `from_page` (int): PDF 起始页码,默认 0
- `to_page` (int): PDF 结束页码,默认 100000
- `max_file_size` (int): URL 下载的最大文件大小(字节),仅用于 `file_url` 方式
- `download_timeout` (int): 下载超时时间(秒),仅用于 `file_url` 方式
- `head_request_timeout` (int): HEAD 请求超时时间(秒),仅用于 `file_url` 方式

**返回值结构:**
```python
{
"parser_id": "book",
"chunks": ["chunk1", "chunk2", ...], # 字符串列表
"total_chunks": 10,
"filename": "document.pdf"
}
```

## 核心模块
Expand Down Expand Up @@ -558,15 +666,49 @@ client.chunk.delete(kb_id, doc_id, [chunk_id])
# 删除文档的所有切片
client.chunk.delete(kb_id, doc_id, None)

# 文本切片(无需上传文档
# 文本切片(仅支持 title, regex, smart)
result = client.chunk.split_text(
text="# Title\n\nLong text to be chunked...",
parser_id="title", # 解析器ID
config={"chunk_token_num": 512} # 自定义配置
parser_id="title", # 仅支持: title, regex, smart
config={"chunk_token_num": 512}
)
print(f"Total chunks: {result['total_chunks']}")
for chunk in result['chunks']:
print(chunk['content'])
print(chunk)

# 文件切片(支持所有ParserType方法)
# 方式1: 使用本地文件路径
result = client.chunk.split_file(
file_path="/path/to/document.pdf",
parser_id="book", # 支持所有 ParserType
config={
"chunk_token_num": 512,
"delimiter": "\n。.;;!!??",
"lang": "Chinese"
}
)

# 方式2: 使用文件URL
result = client.chunk.split_file(
file_url="https://example.com/doc.pdf",
parser_id="naive",
config={
"chunk_token_num": 256,
"max_file_size": 128 * 1024 * 1024, # 128MB
"download_timeout": 300
}
)

# 方式3: 上传文件并切片
result = client.chunk.split_file_upload(
file_path="/path/to/document.pdf",
parser_id="book",
config={"chunk_token_num": 512}
)
print(f"Total chunks: {result['total_chunks']}")
print(f"Filename: {result['filename']}")
for chunk in result['chunks']:
print(chunk)
```

### 4. 信息抽取 (Extraction)
Expand Down Expand Up @@ -894,6 +1036,7 @@ SDK 包含完整的测试套件,覆盖所有功能模块。
# 设置环境变量
export HOST_ADDRESS="http://127.0.0.1:9380"
export POWERRAG_API_KEY="your-api-key"
export PYTHONPATH=$(pwd)

# 运行测试
pytest powerrag/sdk/tests/
Expand Down Expand Up @@ -1203,6 +1346,47 @@ for result in results:
# 重新解析或删除
```

### Q: 文本切片和文件切片有什么区别?应该使用哪个?

A:
- **`split_text`**: 仅支持 `title`, `regex`, `smart` 三种解析器,适用于纯文本内容(Markdown格式)
- **`split_file`**: 支持所有 ParserType 方法,适用于文件(通过路径或URL)
- **`split_file_upload`**: 支持所有 ParserType 方法,适用于文件上传

**使用建议:**
- 如果只有文本内容且需要使用 `title`/`regex`/`smart`,使用 `split_text`
- 如果有文件且需要使用其他解析器(如 `book`, `qa`, `naive` 等),使用 `split_file` 或 `split_file_upload`
- 如果文件在本地,使用 `split_file(file_path=...)` 或 `split_file_upload`
- 如果文件在远程URL,使用 `split_file(file_url=...)`

**示例:**
```python
# 文本切片(仅支持 title, regex, smart)
result = client.chunk.split_text(
text="# Title\n\nContent...",
parser_id="title"
)

# 文件切片(支持所有解析器)
# 本地文件
result = client.chunk.split_file(
file_path="/path/to/doc.pdf",
parser_id="book" # 可以使用任何解析器
)

# 远程文件
result = client.chunk.split_file(
file_url="https://example.com/doc.pdf",
parser_id="naive"
)

# 文件上传
result = client.chunk.split_file_upload(
file_path="/path/to/doc.pdf",
parser_id="qa"
)
```

### Q: 如何解析无扩展名的文件?

A: 使用 `parse_to_md_binary` 方法并使用 `input_type='auto'`(默认值):
Expand Down
Loading