Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 42 additions & 10 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@

# --- Application ---
APP_ENV=development
APP_DEBUG=true
# Set to true for development only. Production MUST use false.
APP_DEBUG=false
APP_HOST=0.0.0.0
APP_PORT=8000
# SECURITY: Change this to a random secret key in production!
APP_SECRET_KEY=change-me-to-a-random-secret-key

# --- API Authentication ---
Expand All @@ -24,27 +26,57 @@ PDF_DIR=${DATA_DIR}/pdfs
OCR_OUTPUT_DIR=${DATA_DIR}/ocr_output
CHROMA_DB_DIR=${DATA_DIR}/chroma_db

# --- LLM: Aliyun Bailian (Coding Plan) ---
# OpenAI-compatible endpoint
# --- LLM: Default Provider ---
# Options: aliyun, volcengine, openai, anthropic, ollama, mock
LLM_PROVIDER=mock
LLM_TEMPERATURE=0.7
LLM_MAX_TOKENS=4096

# --- LLM: Aliyun Bailian ---
# OpenAI-compatible endpoint (general: dashscope.aliyuncs.com, coding: coding.dashscope.aliyuncs.com)
ALIYUN_API_KEY=sk-sp-xxxxx
ALIYUN_BASE_URL=https://coding.dashscope.aliyuncs.com/v1
ALIYUN_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
ALIYUN_MODEL=qwen3.5-plus

# --- LLM: Volcengine (Doubao) ---
# OpenAI-compatible endpoint
VOLCENGINE_API_KEY=your-volcengine-api-key
VOLCENGINE_BASE_URL=https://ark.cn-beijing.volces.com/api/v3
VOLCENGINE_MODEL=doubao-seed-1-6-flash-250828

# --- Default LLM Provider ---
# Options: aliyun, volcengine, mock
LLM_PROVIDER=mock
# --- LLM: OpenAI ---
OPENAI_API_KEY=
OPENAI_BASE_URL=https://api.openai.com/v1
OPENAI_MODEL=gpt-4o-mini

# --- LLM: Anthropic ---
ANTHROPIC_API_KEY=
ANTHROPIC_MODEL=claude-sonnet-4-20250514

# --- Embedding Model ---
# Local model name (downloaded from HuggingFace)
# --- LLM: Ollama (local) ---
OLLAMA_BASE_URL=http://localhost:11434
OLLAMA_MODEL=llama3

# --- Embedding ---
# Provider: local (HuggingFace) | api (OpenAI) | mock
EMBEDDING_PROVIDER=local
EMBEDDING_MODEL=BAAI/bge-m3
EMBEDDING_API_KEY=
RERANKER_MODEL=BAAI/bge-reranker-v2-m3

# --- OCR ---
# PaddleOCR language: ch (Chinese+English) | en (English only)
OCR_LANG=ch

# --- PDF Parsing ---
# Parser selection: auto (pdfplumber first, fallback to MinerU) | mineru | pdfplumber
PDF_PARSER=auto
# MinerU independent API service URL
MINERU_API_URL=http://localhost:8010
# MinerU backend: pipeline | hybrid-auto-engine | vlm-auto-engine
MINERU_BACKEND=pipeline
# Timeout per PDF in seconds
MINERU_TIMEOUT=300

# --- GPU ---
# Comma-separated GPU IDs for OCR/embedding tasks
CUDA_VISIBLE_DEVICES=0,3
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,10 @@ REST APIs under `/api/v1/`:
| `POST /projects/{id}/rag/index` | Build vector index |
| `POST /projects/{id}/rag/query` | RAG retrieval |
| `POST /projects/{id}/writing/assist` | Writing assistance |
| `POST /projects/{id}/writing/review-draft/stream` | Streaming literature review (SSE) |
| `POST /chat` | Chat messages (playground) |
| `POST /chat/complete` | Smart autocomplete suggestions |
| `GET /projects/{id}/papers/{paper_id}/citation-graph` | Citation graph (Semantic Scholar) |
| `GET/POST /conversations` | Conversation CRUD |
| `GET/POST /pipelines` | Pipeline management |
| `GET/POST /subscriptions` | Subscription management |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""add has_formula and figure_path to paper_chunks

Revision ID: f2bee250c39f
Revises: e8f2a3b1c4d5
Create Date: 2026-03-15 17:30:22.425479

"""

from collections.abc import Sequence

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision: str = "f2bee250c39f"
down_revision: str | Sequence[str] | None = "e8f2a3b1c4d5"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


def upgrade() -> None:
    """Add `has_formula` and `figure_path` columns to paper_chunks."""
    # NOTE(review): server_default=sa.text("0") for a Boolean assumes a dialect
    # that accepts 0/1 as boolean literals (SQLite/MySQL) — confirm for PostgreSQL.
    op.add_column(
        "paper_chunks",
        sa.Column("has_formula", sa.Boolean(), server_default=sa.text("0"), nullable=False),
    )
    op.add_column(
        "paper_chunks",
        sa.Column("figure_path", sa.String(length=500), server_default=sa.text("''"), nullable=False),
    )


def downgrade() -> None:
    """Drop the columns added by this revision, in reverse order of creation."""
    for column_name in ("figure_path", "has_formula"):
        op.drop_column("paper_chunks", column_name)
3 changes: 0 additions & 3 deletions backend/app/api/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@
api_router.include_router(projects.router, prefix="/projects")
api_router.include_router(papers.router, prefix="/projects/{project_id}/papers")
api_router.include_router(upload.router, prefix="/projects/{project_id}/papers")
api_router.include_router(projects.router, prefix="/knowledge-bases", tags=["knowledge-bases"])
api_router.include_router(papers.router, prefix="/knowledge-bases/{project_id}/papers", tags=["knowledge-bases"])
api_router.include_router(upload.router, prefix="/knowledge-bases/{project_id}/papers", tags=["knowledge-bases"])
api_router.include_router(keywords.router)
api_router.include_router(search.router)
api_router.include_router(dedup.router)
Expand Down
29 changes: 29 additions & 0 deletions backend/app/api/v1/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from fastapi import APIRouter, Depends
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession

from app.api.deps import get_db
Expand All @@ -19,13 +20,26 @@
format_finish,
format_start,
)
from app.schemas.common import ApiResponse
from app.schemas.conversation import ChatStreamRequest

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/chat", tags=["chat"])


class CompletionRequest(BaseModel):
    """Request payload for the /chat/complete autocomplete endpoint."""

    # Text before the cursor; length bounds keep the LLM prompt reasonable.
    prefix: str = Field(..., min_length=10, max_length=2000)
    # Optional conversation to draw context from.
    conversation_id: int | None = None
    # Knowledge bases to ground the suggestion in (empty = none).
    knowledge_base_ids: list[int] = Field(default_factory=list)
    # Prior chat turns — presumably role/content dicts; confirm against CompletionService.
    recent_messages: list[dict] = Field(default_factory=list)


class CompletionResponse(BaseModel):
    """Autocomplete suggestion returned by /chat/complete."""

    # Suggested continuation text.
    completion: str
    # Confidence score — presumably 0..1; verify in CompletionService.
    confidence: float


async def _init_services(db: AsyncSession) -> dict:
"""Create LLM + RAG services from user settings."""
from app.services.llm.client import get_llm_client
Expand Down Expand Up @@ -102,3 +116,18 @@ async def chat_stream(
"X-Vercel-AI-UI-Message-Stream": "v1",
},
)


@router.post("/complete", response_model=ApiResponse[CompletionResponse])
async def complete(request: CompletionRequest):
"""Return a short text completion suggestion for autocomplete."""
from app.services.completion_service import CompletionService

svc = CompletionService()
result = await svc.complete(
prefix=request.prefix,
conversation_id=request.conversation_id,
knowledge_base_ids=request.knowledge_base_ids or [],
recent_messages=request.recent_messages or [],
)
return ApiResponse(data=CompletionResponse(**result))
48 changes: 48 additions & 0 deletions backend/app/api/v1/papers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
"""Paper CRUD and management API endpoints."""

from pathlib import Path

from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import FileResponse
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession

from app.api.deps import get_db, get_project
from app.config import settings
from app.models import Paper, Project
from app.schemas.common import ApiResponse, PaginatedData
from app.schemas.paper import PaperBulkImport, PaperCreate, PaperRead, PaperUpdate
Expand Down Expand Up @@ -141,3 +145,47 @@ async def delete_paper(
raise HTTPException(status_code=404, detail="Paper not found")
await db.delete(paper)
return ApiResponse(message="Paper deleted")


@router.get("/{paper_id}/pdf")
async def serve_pdf(
project_id: int,
paper_id: int,
db: AsyncSession = Depends(get_db),
project: Project = Depends(get_project),
):
"""Serve the PDF file for a paper."""
paper = await db.get(Paper, paper_id)
if not paper or paper.project_id != project_id:
raise HTTPException(status_code=404, detail="Paper not found")
if not paper.pdf_path or not Path(paper.pdf_path).exists():
raise HTTPException(status_code=404, detail="PDF file not available")

pdf_path = Path(paper.pdf_path).resolve()
pdf_dir = Path(settings.pdf_dir).resolve()
if not str(pdf_path).startswith(str(pdf_dir)):
raise HTTPException(status_code=403, detail="Access denied")

with open(pdf_path, "rb") as f:
magic = f.read(5)
if magic != b"%PDF-":
raise HTTPException(status_code=400, detail="Invalid PDF file")

return FileResponse(str(pdf_path), media_type="application/pdf", filename=f"{paper.title[:80]}.pdf")


@router.get("/{paper_id}/citation-graph", response_model=ApiResponse)
async def get_citation_graph(
project_id: int,
paper_id: int,
depth: int = Query(1, ge=1, le=2),
max_nodes: int = Query(50, ge=10, le=200),
db: AsyncSession = Depends(get_db),
project: Project = Depends(get_project),
):
"""Get citation relationship graph for a paper via Semantic Scholar."""
from app.services.citation_graph_service import CitationGraphService

svc = CitationGraphService(db)
graph = await svc.get_citation_graph(paper_id, project_id, depth=depth, max_nodes=max_nodes)
return ApiResponse(data=graph)
32 changes: 32 additions & 0 deletions backend/app/api/v1/writing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Writing assistance API endpoints."""

from fastapi import APIRouter, Depends
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession

Expand Down Expand Up @@ -163,3 +164,34 @@ async def analyze_gaps(
research_topic=body.research_topic,
)
return ApiResponse(data=result)


class ReviewDraftRequest(BaseModel):
    """Request body for the streaming literature-review draft endpoint."""

    # Free-text review topic; empty string leaves topic selection to the service.
    topic: str = ""
    # Review structure style.
    style: str = Field(default="narrative", pattern=r"^(narrative|systematic|thematic)$")
    # Citation rendering; gb_t_7714 is the Chinese national citation standard.
    citation_format: str = Field(default="numbered", pattern=r"^(numbered|apa|gb_t_7714)$")
    # Output language: Chinese or English.
    language: str = Field(default="zh", pattern=r"^(zh|en)$")


@router.post("/review-draft/stream")
async def stream_review_draft(
project_id: int,
body: ReviewDraftRequest,
svc: WritingService = Depends(get_writing_service),
project: Project = Depends(get_project),
):
"""Stream a structured literature review draft via SSE."""
return StreamingResponse(
svc.generate_literature_review(
project_id=project_id,
topic=body.topic,
style=body.style,
citation_format=body.citation_format,
language=body.language,
),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"X-Accel-Buffering": "no",
},
)
20 changes: 19 additions & 1 deletion backend/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class Settings(BaseSettings):

# Application
app_env: Literal["development", "production", "testing"] = "development"
app_debug: bool = True
app_debug: bool = False
app_host: str = "0.0.0.0"
app_port: int = 8000
app_secret_key: str = "change-me-to-a-random-secret-key"
Expand Down Expand Up @@ -69,6 +69,22 @@ class Settings(BaseSettings):
embedding_api_key: str = ""
reranker_model: str = "BAAI/bge-reranker-v2-m3"

# OCR
ocr_lang: str = "ch" # PaddleOCR language: ch (Chinese+English) | en (English only)

# PDF Parsing / MinerU
pdf_parser: str = "auto" # auto | mineru | pdfplumber
mineru_api_url: str = "http://localhost:8010"
mineru_backend: str = "pipeline" # pipeline | hybrid-auto-engine | vlm-auto-engine
mineru_timeout: int = 300

# Dedup thresholds
dedup_title_hard_threshold: float = 0.90
dedup_title_llm_threshold: float = 0.80

# LangGraph
langgraph_checkpoint_dir: str = ""

# GPU
cuda_visible_devices: str = "0,3"

Expand All @@ -95,6 +111,8 @@ def __init__(self, **kwargs):
self.ocr_output_dir = f"{self.data_dir}/ocr_output"
if not self.chroma_db_dir:
self.chroma_db_dir = f"{self.data_dir}/chroma_db"
if not self.langgraph_checkpoint_dir:
self.langgraph_checkpoint_dir = f"{self.data_dir}/langgraph_checkpoints"

@property
def cors_origin_list(self) -> list[str]:
Expand Down
17 changes: 16 additions & 1 deletion backend/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
import logging
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

from app.api.v1 import api_router
from app.config import settings
from app.database import init_db
from app.middleware.auth import ApiKeyMiddleware
from app.middleware.rate_limit import setup_rate_limiting
from app.schemas.common import ApiResponse

logging.basicConfig(
Expand Down Expand Up @@ -48,8 +50,21 @@ async def lifespan(app: FastAPI):
allow_headers=["*"],
)

setup_rate_limiting(app)
app.include_router(api_router)


@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Catch-all handler: log the full traceback, hide internals unless debugging."""
    logger.exception("Unhandled exception on %s %s", request.method, request.url.path)
    # Production responses never leak exception text to clients.
    message = "Internal server error"
    if settings.app_debug:
        message = str(exc)
    return JSONResponse(
        status_code=500,
        content={"code": 500, "message": message, "data": None},
    )


# MCP Server — expose tools and resources to AI IDEs
try:
from app.mcp_server import mcp as mcp_server
Expand Down
Loading
Loading