Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
762abe5
refactor(backend): fix async blocking, double commits, and exception …
sylvanding Mar 17, 2026
d3e041f
refactor(backend): unify config, centralize prompts, and optimize RAG…
sylvanding Mar 17, 2026
91bd9e1
test(backend): add comprehensive API endpoint tests covering 141 new …
sylvanding Mar 17, 2026
6c29e7c
docs(backend): add API endpoint catalog, brainstorms, plans, and rese…
sylvanding Mar 17, 2026
0080cc9
test(backend): add E2E live server tests with real LLM (25 passed, 1 …
sylvanding Mar 17, 2026
23e9574
docs(backend): mark E2E acceptance criteria as completed
sylvanding Mar 17, 2026
5722af7
feat(backend): enable MinerU PDF parsing + GPU parallel OCR + compreh…
sylvanding Mar 17, 2026
d7cff6f
fix(backend): resolve E2E test skips and CUDA OOM failures
sylvanding Mar 18, 2026
87f65b2
feat(backend): add GPU_MODE preset system for resource scheduling
sylvanding Mar 18, 2026
bb4800a
refactor(backend): comprehensive backend optimization — 21 improvemen…
sylvanding Mar 18, 2026
e4c52e6
refactor(backend): code quality improvements and comprehensive testin…
sylvanding Mar 18, 2026
c0feaff
feat(backend): GPU resource auto-management with TTL and MinerU subpr…
sylvanding Mar 18, 2026
e4be0d0
fix(backend): remove unsupported --no-banner flag from conda run command
sylvanding Mar 18, 2026
70fb96f
fix(backend): P0 bug fixes — pipeline data loss, async blocking, secu…
sylvanding Mar 18, 2026
6110c0b
refactor(backend): data integrity, pipeline persistence, code quality…
sylvanding Mar 18, 2026
72e7ccf
test(backend): add pdf_metadata tests and extend paper API test coverage
sylvanding Mar 18, 2026
7ede569
refactor(backend): P2 improvements — OpenAPI docs, SSE errors, rate l…
sylvanding Mar 18, 2026
17ece0f
fix(backend): update tests for unique constraint, search body, SSRF, …
sylvanding Mar 18, 2026
7d0e37c
docs(backend): mark backend comprehensive review plan as completed
sylvanding Mar 18, 2026
7fc088b
feat(backend): auto-release GPU resources on program exit
sylvanding Mar 18, 2026
c1d449a
docs: update README and .env.example for GPU management and MinerU fe…
sylvanding Mar 18, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 44 additions & 9 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# --- Application ---
APP_ENV=development
# Set to true for development only. Production MUST use false.
APP_DEBUG=false
APP_DEBUG=true
APP_HOST=0.0.0.0
APP_PORT=8000
# SECURITY: Change this to a random secret key in production!
Expand Down Expand Up @@ -59,31 +59,66 @@ OLLAMA_MODEL=llama3
# --- Embedding ---
# Provider: local (HuggingFace) | api (OpenAI) | mock
EMBEDDING_PROVIDER=local
EMBEDDING_MODEL=BAAI/bge-m3
EMBEDDING_MODEL=Qwen/Qwen3-Embedding-8B
EMBEDDING_API_KEY=
RERANKER_MODEL=BAAI/bge-reranker-v2-m3
RERANKER_MODEL=tomaarsen/Qwen3-Reranker-8B-seq-cls

# --- OCR ---
# PaddleOCR language: ch (Chinese+English) | en (English only)
OCR_LANG=ch

# --- PDF Parsing ---
# --- PDF Parsing / MinerU ---
# Parser selection: auto (pdfplumber first, fallback to MinerU) | mineru | pdfplumber
PDF_PARSER=auto
PDF_PARSER=mineru
# MinerU independent API service URL
MINERU_API_URL=http://localhost:8010
# MinerU backend: pipeline | hybrid-auto-engine | vlm-auto-engine
MINERU_BACKEND=pipeline
# Timeout per PDF in seconds
MINERU_TIMEOUT=300
MINERU_TIMEOUT=8000
# Auto start/stop MinerU subprocess (true = Omelette manages MinerU lifecycle)
MINERU_AUTO_MANAGE=true
# Conda environment name for MinerU (used with conda run)
MINERU_CONDA_ENV=mineru
# Stop MinerU after N seconds idle (0 = never auto-stop)
MINERU_TTL_SECONDS=600
# MinerU startup timeout in seconds
MINERU_STARTUP_TIMEOUT=120
# GPU IDs for MinerU (empty = inherit CUDA_VISIBLE_DEVICES)
MINERU_GPU_IDS=

# --- GPU ---
# Comma-separated GPU IDs for OCR/embedding tasks
CUDA_VISIBLE_DEVICES=0,3
CUDA_VISIBLE_DEVICES=

# Auto-unload GPU models after N seconds idle (0 = never auto-unload)
MODEL_TTL_SECONDS=300
# TTL check interval in seconds
MODEL_TTL_CHECK_INTERVAL=30

# GPU preset mode: conservative | balanced | aggressive
# conservative: batch=1, parallel=1, safe for small VRAM / debugging
# balanced: batch=8/16, auto parallel, good default
# aggressive: batch=32/50, parallel=GPU*2, max throughput (32G+ VRAM)
GPU_MODE=balanced

# Per-service overrides (0 = follow GPU_MODE preset)
# EMBED_BATCH_SIZE=0
# RERANK_BATCH_SIZE=0

# Pin models to specific GPU index (-1 = auto-select by free memory)
# EMBED_GPU_ID=-1
# RERANK_GPU_ID=-1

# Comma-separated GPU IDs for OCR workers (empty = use all visible GPUs)
# OCR_GPU_IDS=

# Max parallel OCR tasks. 0=auto (GPU count, or GPU*2 in aggressive mode)
# OCR_PARALLEL_LIMIT=0

# --- Network Proxy ---
HTTP_PROXY=http://127.0.0.1:20171/
HTTPS_PROXY=http://127.0.0.1:20171/
# HTTP_PROXY=http://your-proxy:port
# HTTPS_PROXY=http://your-proxy:port

# --- HuggingFace Mirror ---
# For users in China, set to https://hf-mirror.com to speed up model downloads
Expand Down
55 changes: 46 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ Omelette automates the full research literature pipeline — from keyword manage
Multi-channel download via Unpaywall, arXiv, and direct URL fallback strategies.

**📝 OCR Processing**
Native text extraction with PaddleOCR GPU fallback for scanned documents.
Native text extraction via MinerU (auto-managed subprocess) or PaddleOCR GPU fallback.

**🧠 RAG Knowledge Base**
LlamaIndex engine with ChromaDB, GPU-aware embeddings, hybrid retrieval, and cited answers.
Expand All @@ -69,7 +69,10 @@ Omelette automates the full research literature pipeline — from keyword manage
Summarization, citation generation (GB/T 7714, APA, MLA), review outlines, and gap analysis.

**🔄 LangGraph Pipeline**
Pipeline orchestration with human-in-the-loop interrupt and resume.
Pipeline orchestration with HITL interrupt/resume and persistent checkpointing.

**⚡ GPU Resource Management**
TTL-based auto-unload for GPU models, MinerU subprocess auto-management, monitoring API, and exit cleanup watchdog.

**🔗 MCP Integration**
Model Context Protocol server for AI IDE clients (Cursor, Claude Code, etc.).
Expand Down Expand Up @@ -103,7 +106,7 @@ Keywords ─→ Search ─→ Dedup ─→ Crawler ─→ OCR ─→ RAG ─→
| **RAG** | LlamaIndex with GPU-aware embeddings |
| **LLM** | LangChain (OpenAI, Anthropic, Aliyun, Volcengine, Ollama) |
| **Orchestration** | LangGraph with HITL interrupt/resume |
| **OCR** | pdfplumber (native) + PaddleOCR (scanned, optional) |
| **OCR** | MinerU (auto-managed) + pdfplumber (native) + PaddleOCR (scanned) |
| **MCP** | Model Context Protocol server |
| **Docs** | VitePress (bilingual EN/ZH) |

Expand Down Expand Up @@ -147,6 +150,10 @@ cp .env.example .env
| `ALIYUN_API_KEY` | Aliyun Bailian API key |
| `VOLCENGINE_API_KEY` | Volcengine Doubao API key |
| `SEMANTIC_SCHOLAR_API_KEY` | Optional; increases Semantic Scholar rate limit |
| `GPU_MODE` | GPU preset: `conservative`, `balanced` (default), `aggressive` |
| `MODEL_TTL_SECONDS` | Auto-unload GPU models after N seconds idle (default: 300) |
| `MINERU_AUTO_MANAGE` | Auto start/stop MinerU subprocess (default: true) |
| `PDF_PARSER` | `auto`, `mineru`, or `pdfplumber` |

See [`.env.example`](.env.example) for the full list.

Expand All @@ -156,10 +163,31 @@ See [`.env.example`](.env.example) for the full list.

```bash
cd backend

# Run database migrations
alembic upgrade head

# Start server
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
```

### 4. Start frontend
On startup, the backend automatically:
- Writes a PID file to `DATA_DIR/omelette.pid`
- Starts a GPU model TTL monitor (auto-unloads idle models)
- If `MINERU_AUTO_MANAGE=true`, manages MinerU subprocess lifecycle
- Registers cleanup handlers (`atexit` + `SIGHUP`) so GPU resources are released even if the process exits unexpectedly

### 4. (Optional) GPU watchdog

For extra safety against `kill -9` or crashes, run the external watchdog:

```bash
python backend/scripts/gpu_watchdog.py --daemon
```

The watchdog monitors the Omelette process and cleans up GPU resources if it terminates abnormally.

### 5. Start frontend

```bash
cd frontend
Expand All @@ -169,13 +197,19 @@ npm run dev

Open [http://localhost:3000](http://localhost:3000) in your browser.

### 5. (Optional) OCR & Embeddings
### 6. (Optional) MinerU setup

If using MinerU for PDF parsing (`PDF_PARSER=mineru`):

```bash
cd backend
pip install -e ".[ocr,ml]"
# Create a separate conda env for MinerU
conda create -n mineru python=3.10
conda activate mineru
pip install magic-pdf[full]
```

Set `MINERU_CONDA_ENV=mineru` in `.env`. Omelette will auto-start MinerU when needed.

> **Troubleshooting:** If you get `ModuleNotFoundError: No module named 'fastapi'`, ensure the conda environment is activated: `conda activate omelette`.

## 📂 Project Layout
Expand All @@ -194,7 +228,8 @@ omelette/
│ │ └── main.py # App entry, lifespan, CORS
│ ├── mcp_server.py # MCP (Model Context Protocol) server
│ ├── alembic/ # Database migrations
│ ├── tests/ # pytest-asyncio tests (178 tests)
│ ├── scripts/ # Utilities (gpu_watchdog.py)
│ ├── tests/ # pytest-asyncio tests (526 tests)
│ └── pyproject.toml # Python dependencies
├── frontend/ # React SPA
│ └── src/
Expand Down Expand Up @@ -230,7 +265,7 @@ make dev # Start both backend and frontend
### Running Tests

```bash
# Backend (178 tests)
# Backend (526 tests)
cd backend && pytest tests/ -v

# Frontend unit tests (28 tests — Vitest + Testing Library + MSW)
Expand Down Expand Up @@ -269,6 +304,8 @@ REST APIs under `/api/v1/`:
| `GET/POST /subscriptions` | Subscription management |
| `GET/POST /settings` | Settings and health |
| `GET /settings/health` | Health check |
| `GET /gpu/status` | GPU model and memory status |
| `POST /gpu/unload` | Manually unload GPU models |

MCP server: `/mcp` (WebSocket/SSE for AI IDE clients)

Expand Down
51 changes: 42 additions & 9 deletions README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ Omelette 覆盖科研文献全流程自动化 — 从关键词管理、多源检
Unpaywall、arXiv、直链多通道下载,智能回退策略。

**📝 OCR 解析**
pdfplumber 原生文本提取,PaddleOCR GPU 加速处理扫描件。
MinerU(自动管理子进程)或 pdfplumber 原生提取,PaddleOCR GPU 加速处理扫描件。

**🧠 RAG 知识库**
LlamaIndex 引擎,ChromaDB 向量存储,GPU 感知嵌入,混合检索,带引用回答。
Expand All @@ -69,7 +69,10 @@ Omelette 覆盖科研文献全流程自动化 — 从关键词管理、多源检
论文摘要、引用生成(GB/T 7714、APA、MLA)、综述提纲、缺口分析。

**🔄 LangGraph 流水线**
流水线编排,支持人机协同中断与恢复。
流水线编排,支持人机协同中断/恢复与持久化检查点。

**⚡ GPU 资源管理**
TTL 自动卸载 GPU 模型、MinerU 子进程自动管理、监控 API、退出清理看门狗。

**🔗 MCP 集成**
Model Context Protocol 服务端,面向 AI IDE 客户端(Cursor、Claude Code 等)。
Expand Down Expand Up @@ -103,7 +106,7 @@ Keywords ─→ Search ─→ Dedup ─→ Crawler ─→ OCR ─→ RAG ─→
| **RAG** | LlamaIndex,GPU 感知嵌入 |
| **LLM** | LangChain(OpenAI、Anthropic、阿里云、火山引擎、Ollama) |
| **编排** | LangGraph,支持人机协同中断与恢复 |
| **OCR** | pdfplumber(原生)+ PaddleOCR(扫描件,可选) |
| **OCR** | MinerU(自动管理)+ pdfplumber(原生)+ PaddleOCR(扫描件) |
| **MCP** | Model Context Protocol 服务端 |
| **文档** | VitePress(中英双语) |

Expand Down Expand Up @@ -156,10 +159,31 @@ cp .env.example .env

```bash
cd backend

# 执行数据库迁移
alembic upgrade head

# 启动服务
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
```

### 4. 启动前端
启动时后端自动完成以下操作:
- 写入 PID 文件到 `DATA_DIR/omelette.pid`
- 启动 GPU 模型 TTL 监控(自动卸载空闲模型)
- 若 `MINERU_AUTO_MANAGE=true`,自动管理 MinerU 子进程生命周期
- 注册清理钩子(`atexit` + `SIGHUP`),即使进程意外退出也会释放 GPU 资源

### 4.(可选)GPU 看门狗

为防止 `kill -9` 或崩溃导致资源泄漏,可运行外部看门狗:

```bash
python backend/scripts/gpu_watchdog.py --daemon
```

看门狗会监控 Omelette 进程,在其异常终止后自动清理 GPU 资源。

### 5. 启动前端

```bash
cd frontend
Expand All @@ -169,13 +193,19 @@ npm run dev

在浏览器中打开 [http://localhost:3000](http://localhost:3000)。

### 5.(可选)OCR 与嵌入
### 6.(可选)MinerU 配置

若使用 MinerU 解析 PDF(`PDF_PARSER=mineru`):

```bash
cd backend
pip install -e ".[ocr,ml]"
# 为 MinerU 创建独立 conda 环境
conda create -n mineru python=3.10
conda activate mineru
pip install magic-pdf[full]
```

在 `.env` 中设置 `MINERU_CONDA_ENV=mineru`,Omelette 将在需要时自动启动 MinerU。

> **常见问题:** 若出现 `ModuleNotFoundError: No module named 'fastapi'`,请确认已激活 conda 环境:`conda activate omelette`。

## 📂 项目结构
Expand All @@ -194,7 +224,8 @@ omelette/
│ │ └── main.py # App entry, lifespan, CORS
│ ├── mcp_server.py # MCP (Model Context Protocol) server
│ ├── alembic/ # Database migrations
│ ├── tests/ # pytest-asyncio 测试(178 个)
│ ├── scripts/ # 工具脚本(gpu_watchdog.py)
│ ├── tests/ # pytest-asyncio 测试(526 个)
│ └── pyproject.toml # Python dependencies
├── frontend/ # React SPA
│ └── src/
Expand Down Expand Up @@ -230,7 +261,7 @@ make dev # Start both backend and frontend
### 运行测试

```bash
# 后端(178 个测试)
# 后端(526 个测试)
cd backend && pytest tests/ -v

# 前端单元测试(28 个测试 — Vitest + Testing Library + MSW)
Expand Down Expand Up @@ -266,6 +297,8 @@ REST API 位于 `/api/v1/` 下:
| `GET/POST /subscriptions` | 订阅管理 |
| `GET/POST /settings` | 设置与健康状态 |
| `GET /settings/health` | 健康检查 |
| `GET /gpu/status` | GPU 模型与显存状态 |
| `POST /gpu/unload` | 手动卸载 GPU 模型 |

MCP 服务端:`/mcp`(WebSocket/SSE,面向 AI IDE 客户端)

Expand Down
26 changes: 26 additions & 0 deletions backend/alembic/versions/a1b2c3d4e5f6_add_composite_indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""add composite indexes for paper and task tables

Revision ID: a1b2c3d4e5f6
Revises: f2bee250c39f
Create Date: 2026-03-18 10:00:00.000000

"""

from collections.abc import Sequence

from alembic import op

# Revision identifiers, used by Alembic.
revision: str = "a1b2c3d4e5f6"
# NOTE(review): the sibling migration cb8130e58f92 annotates down_revision as
# str | Sequence[str] | None; widened here for consistency across migrations.
down_revision: str | Sequence[str] | None = "f2bee250c39f"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


def upgrade() -> None:
    """Create composite (project_id, status) indexes on papers and tasks.

    Both tables are filtered by project and status together in list/query
    endpoints, so a composite index serves those lookups in one pass.
    """
    # Creation order matches the original migration: papers first, then tasks.
    for index_name, table_name in (
        ("ix_paper_project_status", "papers"),
        ("ix_task_project_status", "tasks"),
    ):
        op.create_index(index_name, table_name, ["project_id", "status"])


def downgrade() -> None:
    """Drop the composite indexes, reversing the order used in upgrade()."""
    for index_name, table_name in (
        ("ix_task_project_status", "tasks"),
        ("ix_paper_project_status", "papers"),
    ):
        op.drop_index(index_name, table_name=table_name)
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""add paper project_doi unique constraint

Revision ID: cb8130e58f92
Revises: a1b2c3d4e5f6
Create Date: 2026-03-18 22:54:13.519198

"""

from collections.abc import Sequence

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "cb8130e58f92"
# Chains onto the composite-index migration.
down_revision: str | Sequence[str] | None = "a1b2c3d4e5f6"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


def upgrade() -> None:
    """Add a unique constraint so each DOI appears at most once per project."""
    # NOTE(review): batch_alter_table is presumably used for SQLite's limited
    # ALTER TABLE support — confirm the deployment target.
    with op.batch_alter_table("papers", schema=None) as batch:
        batch.create_unique_constraint("uq_paper_project_doi", ["project_id", "doi"])


def downgrade() -> None:
    """Remove the per-project DOI uniqueness constraint from papers."""
    with op.batch_alter_table("papers", schema=None) as batch:
        batch.drop_constraint("uq_paper_project_doi", type_="unique")
Loading
Loading