From 51e592228518d3114bacba12b46a1f8f5f17318c Mon Sep 17 00:00:00 2001 From: onemorelight2024 <1249767952@qq.com> Date: Tue, 31 Mar 2026 18:19:55 +0800 Subject: [PATCH] feat: add GraphRAG pipeline --- fastapi_app/config/settings.py | 20 + fastapi_app/main.py | 3 + fastapi_app/routers/graphrag_kb.py | 347 ++++++++ fastapi_app/source_manager.py | 130 ++- .../workflow_adapters/wa_graphrag_kb.py | 152 ++++ frontend_en/package-lock.json | 172 ++++ frontend_en/package.json | 1 + .../graphrag-kb/GraphRAGKbPanel.tsx | 757 +++++++++++++++++ frontend_en/src/config/api.ts | 2 + frontend_en/src/pages/NotebookView.tsx | 72 +- frontend_en/src/services/graphragKbService.ts | 84 ++ frontend_en/src/types/graphragKb.ts | 77 ++ frontend_en/src/types/index.ts | 12 +- .../src/utils/graphragMarkdownHighlight.ts | 20 + frontend_en/src/vite-env.d.ts | 2 + frontend_en/vite.config.ts | 4 +- frontend_zh/package-lock.json | 172 ++++ frontend_zh/package.json | 1 + .../graphrag-kb/GraphRAGKbPanel.tsx | 760 ++++++++++++++++++ frontend_zh/src/config/api.ts | 3 + frontend_zh/src/pages/NotebookView.tsx | 76 +- frontend_zh/src/services/graphragKbService.ts | 93 +++ frontend_zh/src/types/graphragKb.ts | 77 ++ frontend_zh/src/types/index.ts | 12 +- .../src/utils/graphragMarkdownHighlight.ts | 20 + frontend_zh/src/vite-env.d.ts | 2 + frontend_zh/vite.config.ts | 4 +- requirements-base.txt | 30 +- .../toolkits/graphrag_ms_tool/__init__.py | 21 + .../toolkits/graphrag_ms_tool/indexer.py | 284 +++++++ .../toolkits/graphrag_ms_tool/judge.py | 140 ++++ .../toolkits/graphrag_ms_tool/querier.py | 449 +++++++++++ .../graphrag_ms_tool/subgraph_pruner.py | 185 +++++ .../toolkits/kggen_tool/__init__.py | 12 + .../toolkits/kggen_tool/kg_extractor.py | 219 +++++ .../toolkits/kggen_tool/kg_merger.py | 87 ++ workflow_engine/workflow/wf_graphrag_kb.py | 390 +++++++++ 37 files changed, 4859 insertions(+), 33 deletions(-) create mode 100644 fastapi_app/routers/graphrag_kb.py create mode 100644 fastapi_app/workflow_adapters/wa_graphrag_kb.py create mode 100644 frontend_en/src/components/graphrag-kb/GraphRAGKbPanel.tsx create mode 100644 frontend_en/src/services/graphragKbService.ts create mode 100644 frontend_en/src/types/graphragKb.ts create mode 100644 frontend_en/src/utils/graphragMarkdownHighlight.ts create mode 100644 frontend_zh/src/components/graphrag-kb/GraphRAGKbPanel.tsx create mode 100644 frontend_zh/src/services/graphragKbService.ts create mode 100644 frontend_zh/src/types/graphragKb.ts create mode 100644 frontend_zh/src/utils/graphragMarkdownHighlight.ts create mode 100644 workflow_engine/toolkits/graphrag_ms_tool/__init__.py create mode 100644 workflow_engine/toolkits/graphrag_ms_tool/indexer.py create mode 100644 workflow_engine/toolkits/graphrag_ms_tool/judge.py create mode 100644 workflow_engine/toolkits/graphrag_ms_tool/querier.py create mode 100644 workflow_engine/toolkits/graphrag_ms_tool/subgraph_pruner.py create mode 100644 workflow_engine/toolkits/kggen_tool/__init__.py create mode 100644 workflow_engine/toolkits/kggen_tool/kg_extractor.py create mode 100644 workflow_engine/toolkits/kggen_tool/kg_merger.py create mode 100644 workflow_engine/workflow/wf_graphrag_kb.py diff --git a/fastapi_app/config/settings.py b/fastapi_app/config/settings.py index 1bdeead..51f2d37 100644 --- a/fastapi_app/config/settings.py +++ b/fastapi_app/config/settings.py @@ -75,6 +75,26 @@ class AppSettings(BaseSettings): LOCAL_EMBEDDING_CUDA_VISIBLE_DEVICES: Optional[str] = None LOCAL_EMBEDDING_GPU_MEMORY_UTILIZATION: float = 0.3 + # ── GraphRAG ────────────────────────────────────────────────────────────── + GRAPHRAG_LLM_MODEL: str = "gpt-5" # chat model for entity/community extraction + GRAPHRAG_EMBEDDING_MODEL: str = "text-embedding-3-small" + GRAPHRAG_OUTPUT_DIR: str = "outputs/graphrag_kb" # workspace root, layout: {dir}/{email}/{nb_id}/ + GRAPHRAG_CMD: str = "" # graphrag CLI path; auto-detected from PATH if empty + GRAPHRAG_CHUNK_SIZE: int = 512 # chars per chunk; also written to settings.yaml chunks.size + GRAPHRAG_CHUNK_OVERLAP: int = 64 + GRAPHRAG_RESPONSE_TYPE: str = "Single Paragraph" # passed to local/global_search response_type + GRAPHRAG_SUBGRAPH_PRUNE_ENABLED: bool = True # run LLM subgraph pruning after each query + GRAPHRAG_SUBGRAPH_PRUNE_MAX_EDGES_INPUT: int = 80 # truncate input to pruner to this many edges + GRAPHRAG_MAX_HIGHLIGHT_HINTS: int = 10 # max highlight_hints returned (0 = unlimited) + + # ── KGGen (optional triple extraction, disabled by default) ─────────────── + KGGEN_MODEL: str = "deepseek-v3.2" + KGGEN_PER_CHUNK: bool = True # True = per-chunk calls; False = full-text single call + KGGEN_LOG_CHUNK_INTERVAL: int = 10 # log every N chunks (0 = first/last only) + + # ── Judge (answer confidence scoring) ───────────────────────────────────── + JUDGE_MODEL: str = "gpt-5" # returns judge_score [0,1] and judge_rationale + class Config: env_file = ".env" env_file_encoding = "utf-8" diff --git a/fastapi_app/main.py b/fastapi_app/main.py index fdbd9b1..b2d1047 100644 --- a/fastapi_app/main.py +++ b/fastapi_app/main.py @@ -41,6 +41,7 @@ from fastapi.responses import FileResponse from fastapi_app.routers import auth, data_extract, files, kb, kb_embedding, paper2drawio, paper2ppt +from fastapi_app.routers import graphrag_kb from fastapi_app.middleware.api_key import APIKeyMiddleware from fastapi_app.middleware.logging import LoggingMiddleware from workflow_engine.utils import get_project_root @@ -476,6 +477,8 @@ def create_app() -> FastAPI: app.include_router(paper2drawio.router, prefix="/api/v1", tags=["Paper2Drawio"]) app.include_router(paper2ppt.router, prefix="/api/v1", tags=["Paper2PPT"]) app.include_router(auth.router, prefix="/api/v1", tags=["Auth"]) + # GraphRAG 知识库:/api/v1/graphrag-kb/{index,query,merge,chunk-snippet} → wa_graphrag_kb → wf_graphrag_kb + app.include_router(graphrag_kb.router, prefix="/api/v1", tags=["GraphRAG KB"]) # 静态文件:/outputs 下的文件(兼容 URL 中 %40 与 磁盘 @ 两种路径) project_root = get_project_root() diff --git a/fastapi_app/routers/graphrag_kb.py b/fastapi_app/routers/graphrag_kb.py new file mode 100644 index 0000000..24a1a7d --- /dev/null +++ b/fastapi_app/routers/graphrag_kb.py @@ -0,0 +1,347 @@ +"""GraphRAG 知识库 HTTP 路由(前缀在 ``main`` 中与 ``/api/v1`` 拼接)。 + +【端点与数据流】 + POST ``/graphrag-kb/index`` → ``wa_graphrag_kb.run_index`` → 建索引 → ``IndexResponse`` + POST ``/graphrag-kb/query`` → ``run_query`` → 检索 + Judge(+ 子图 CoT)→ ``QueryResponse`` + POST ``/graphrag-kb/merge`` → ``run_merge`` → 合并两 workspace → ``MergeResponse`` + POST ``/graphrag-kb/chunk-snippet`` → 按 ``chunk_id`` 从 workspace ``input/*.txt`` 抽取 ``[chunk:…]`` 块正文(供前端阅读器高亮) + +【安全】 + ``_safe_workspace_dir`` 将路径解析到项目根目录下,防止目录穿越。 + +【说明】 + 请求体携带与其它路由一致的 LLM 凭证;前端不直连 ``workflow_engine``,仅调本路由。 +""" +from __future__ import annotations + +import json +import re +from pathlib import Path +from typing import Any, Dict, List, Optional + +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel, Field + +from fastapi_app.config import settings +from fastapi_app.workflow_adapters.wa_graphrag_kb import run_index, run_query, run_merge +from workflow_engine.logger import get_logger +from workflow_engine.utils import get_project_root + +log = get_logger(__name__) + +# 匹配 GraphRAG input 文件中每个 chunk 段的起始行(后跟该段正文直至下一 chunk 或 EOF) +_CHUNK_HEAD = re.compile(r"\[chunk:([a-f0-9]+)\]\s*\n", re.IGNORECASE) + + +def _extract_chunk_block_from_input_text(text: str, chunk_id: str) -> str: + """在整份 ``input/.txt`` 文本中,定位 ``[chunk:目标id]`` 之后到下一 ``[chunk:`` 之前的正文。""" + want = chunk_id.strip().lower() + matches = list(_CHUNK_HEAD.finditer(text)) + for i, m in enumerate(matches): + if m.group(1).lower() != want: + continue + start = m.end() + end = matches[i + 1].start() if i + 1 < len(matches) else len(text) + return text[start:end].strip() + return "" + + +def _safe_workspace_dir(raw: str) -> Path: + """将 *raw* 解析为项目根目录下的绝对路径;越界则抛 ``HTTPException(400)``。""" + root = get_project_root().resolve() + p = Path(raw.strip()) + if not p.is_absolute(): + p = (root / p).resolve() + else: + p = p.resolve() + try: + p.relative_to(root) + except ValueError as exc: + raise HTTPException(status_code=400, detail="workspace_dir must be under project root") from exc + return p + +router = APIRouter(prefix="/graphrag-kb", tags=["GraphRAG KB"]) + +# --------------------------------------------------------------------------- +# Pydantic request/response models +# --------------------------------------------------------------------------- + +class _LLMBase(BaseModel): + api_url: str = Field(default_factory=lambda: settings.DEFAULT_LLM_API_URL) + api_key: str = "" + model: str = Field(default_factory=lambda: settings.GRAPHRAG_LLM_MODEL) + + +class IndexRequest(_LLMBase): + notebook_id: str + notebook_title: str = "" + email: str = "" + source_stems: Optional[List[str]] = None + workspace_dir: str = "" + force_reindex: bool = False + # Run MinerU on un-parsed PDFs before chunk extraction. + # Set to False if MinerU was already triggered via /kb/upload. + parse_pdfs: bool = True + # Default True: do not run KGGen (user-facing path is GraphRAG-only). + skip_kggen: bool = True + + +class IndexResponse(BaseModel): + workspace_dir: str + num_chunks: int + kg_entities: int + kg_relations: int + + +class QueryRequest(_LLMBase): + notebook_id: str + notebook_title: str = "" + email: str = "" + question: str + search_method: str = Field(default="local", pattern="^(local|global)$") + workspace_dir: str = "" + + +class QueryResponse(BaseModel): + answer: str + context_data: Dict[str, Any] = Field(default_factory=dict) + reasoning_subgraph: List[Dict[str, Any]] = Field(default_factory=list) + source_chunks: List[str] = Field(default_factory=list) + highlight_hints: List[Dict[str, Any]] = Field(default_factory=list) + judge_score: float = 0.0 + judge_rationale: str = "" + reasoning_subgraph_cot: str = "" + + +class MergeRequest(_LLMBase): + notebook_id: str = "" + notebook_title: str = "" + email: str = "" + workspace_dir_a: str + workspace_dir_b: str + dedupe: bool = False + + +class MergeResponse(BaseModel): + merged_workspace_dir: str + num_chunks: int + + +class ChunkSnippetRequest(BaseModel): + """Resolve *chunk_id* to raw text inside GraphRAG ``input/.txt`` markers.""" + + workspace_dir: str = Field(..., description="GraphRAG workspace root (contains chunk_meta.json + input/)") + chunk_id: str = Field(..., min_length=8, description="Hex chunk id from chunk_meta / query") + # Optional: pass reasoning_subgraph triples so the backend can ask an LLM to pick + # the exact sentence from the chunk that best expresses one of these relationships. + triples: Optional[List[Dict[str, Any]]] = None + + +class ChunkSnippetResponse(BaseModel): + text: str = "" + source_stem: str = "" + found: bool = False + # LLM-extracted verbatim sentence from the chunk that best matches the triples. + # Empty string if triples were not provided or LLM extraction failed. + highlighted_sentence: str = "" + + +# --------------------------------------------------------------------------- +# Endpoints +# --------------------------------------------------------------------------- + + +def _extract_sentence_for_triples( + chunk_text: str, + triples: List[Dict[str, Any]], +) -> str: + """Ask the configured LLM to pick the verbatim sentence from chunk_text that best + expresses one of the given knowledge-graph triples. Returns empty string on failure. + """ + if not chunk_text.strip() or not triples: + return "" + try: + from openai import OpenAI + except ImportError: + log.debug("[ChunkSnippet] openai not installed; skipping sentence extraction") + return "" + + triple_lines = "\n".join( + f" ({t.get('source', '?')}) --[{t.get('relation', '?')}]--> ({t.get('target', '?')})" + for t in triples[:20] + ) + system_prompt = ( + "You are a precise text extraction assistant. " + "Return ONLY the verbatim sentence or short phrase from the provided chunk " + "that best expresses one of the given relationships. " + "Do NOT paraphrase, add explanation, or include any other text." + ) + user_msg = ( + f"Knowledge graph relationships:\n{triple_lines}\n\n" + f"Chunk text:\n{chunk_text}\n\n" + "Extract the EXACT sentence or phrase from the chunk that best matches " + "one of the relationships above. Return only that text." + ) + try: + api_base = settings.DEFAULT_LLM_API_URL.rstrip("/") + import os + api_key = os.getenv("DF_API_KEY", "") or "none" + client = OpenAI(api_key=api_key, base_url=api_base) + resp = client.chat.completions.create( + model=settings.GRAPHRAG_LLM_MODEL, + max_tokens=256, + temperature=0, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_msg}, + ], + ) + sentence = (resp.choices[0].message.content or "").strip() + # Sanity check: LLM must return something that actually appears in the chunk + if sentence and sentence in chunk_text: + return sentence + log.debug("[ChunkSnippet] LLM sentence not found verbatim in chunk; discarding") + return "" + except Exception as exc: + log.warning("[ChunkSnippet] LLM extraction failed: %s", exc) + return "" + + +@router.post("/chunk-snippet", response_model=ChunkSnippetResponse, summary="Extract [chunk:…] text from GraphRAG input") +async def chunk_snippet_endpoint(req: ChunkSnippetRequest) -> ChunkSnippetResponse: + """Used by the notebook reader to show the exact indexed chunk, not the full MinerU MD.""" + ws = _safe_workspace_dir(req.workspace_dir) + meta_path = ws / "chunk_meta.json" + if not meta_path.is_file(): + return ChunkSnippetResponse() + try: + meta = json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + return ChunkSnippetResponse() + cid = req.chunk_id.strip().lower() + entry = meta.get(req.chunk_id.strip()) or meta.get(cid) + if not isinstance(entry, dict): + return ChunkSnippetResponse() + stem = str(entry.get("source_stem") or "").strip() + if not stem: + return ChunkSnippetResponse() + txt_path = ws / "input" / f"{stem}.txt" + if not txt_path.is_file(): + return ChunkSnippetResponse(source_stem=stem, found=False) + try: + raw = txt_path.read_text(encoding="utf-8", errors="replace") + except Exception: + return ChunkSnippetResponse(source_stem=stem, found=False) + block = _extract_chunk_block_from_input_text(raw, cid) + if not block: + return ChunkSnippetResponse(source_stem=stem, found=False) + highlighted_sentence = "" + if req.triples: + highlighted_sentence = _extract_sentence_for_triples(block, req.triples) + return ChunkSnippetResponse(text=block, source_stem=stem, found=True, highlighted_sentence=highlighted_sentence) + + +# --------------------------------------------------------------------------- +# Index / query / merge +# --------------------------------------------------------------------------- + +@router.post("/index", response_model=IndexResponse, summary="Build GraphRAG index from notebook sources") +async def index_endpoint(req: IndexRequest): + """Chunk notebook sources and run GraphRAG index (KGGen off by default). + + Requires that sources have already been imported into the notebook + (via the ``/kb`` upload endpoint) so that MinerU output exists. + """ + try: + result = await run_index( + notebook_id=req.notebook_id, + notebook_title=req.notebook_title, + email=req.email, + api_url=req.api_url, + api_key=req.api_key, + model=req.model, + source_stems=req.source_stems, + workspace_dir=req.workspace_dir, + force_reindex=req.force_reindex, + parse_pdfs=req.parse_pdfs, + skip_kggen=req.skip_kggen, + ) + return IndexResponse( + workspace_dir=result.get("workspace_dir", ""), + num_chunks=result.get("num_chunks", 0), + kg_entities=result.get("kg_entities", 0), + kg_relations=result.get("kg_relations", 0), + ) + except Exception as exc: + log.exception("[Router] /graphrag-kb/index error: %s", exc) + raise HTTPException(status_code=500, detail=str(exc)) + + +@router.post("/query", response_model=QueryResponse, summary="Query GraphRAG index with Judge scoring") +async def query_endpoint(req: QueryRequest): + """Run a local or global GraphRAG search and return a structured result. + + Returns: + - ``answer`` — model answer text + - ``context_data`` — serialised evidence tables (entities, relations, sources…) + - ``reasoning_subgraph`` — edge list induced from context_data + - ``source_chunks`` — chunk_ids that contributed to the answer + - ``highlight_hints`` — page/bbox hints for PDF highlighting + - ``judge_score`` — confidence score in [0.0, 1.0] + - ``judge_rationale`` — one-sentence judge explanation + - ``reasoning_subgraph_cot`` — LLM chain-of-thought for minimal subgraph (hop analysis) + """ + try: + result = await run_query( + notebook_id=req.notebook_id, + notebook_title=req.notebook_title, + email=req.email, + api_url=req.api_url, + api_key=req.api_key, + model=req.model, + question=req.question, + search_method=req.search_method, + workspace_dir=req.workspace_dir, + ) + return QueryResponse( + answer=result.get("answer", ""), + context_data=result.get("context_data", {}), + reasoning_subgraph=result.get("reasoning_subgraph", []), + source_chunks=result.get("source_chunks", []), + highlight_hints=result.get("highlight_hints", []), + judge_score=float(result.get("judge_score", 0.0)), + judge_rationale=result.get("judge_rationale", ""), + reasoning_subgraph_cot=result.get("reasoning_subgraph_cot", ""), + ) + except Exception as exc: + log.exception("[Router] /graphrag-kb/query error: %s", exc) + raise HTTPException(status_code=500, detail=str(exc)) + + +@router.post("/merge", response_model=MergeResponse, summary="Merge two GraphRAG KG workspaces") +async def merge_endpoint(req: MergeRequest): + """Merge two GraphRAG workspaces using KGGen aggregate and re-index. + + Both ``workspace_dir_a`` and ``workspace_dir_b`` must be absolute paths to + valid, previously indexed workspaces. The merged workspace is written to + ``{workspace_dir_a}_merged/``. + """ + try: + result = await run_merge( + notebook_id=req.notebook_id, + notebook_title=req.notebook_title, + email=req.email, + api_url=req.api_url, + api_key=req.api_key, + model=req.model, + workspace_dir_a=req.workspace_dir_a, + workspace_dir_b=req.workspace_dir_b, + dedupe=req.dedupe, + ) + return MergeResponse( + merged_workspace_dir=result.get("merged_workspace_dir", ""), + num_chunks=result.get("num_chunks", 0), + ) + except Exception as exc: + log.exception("[Router] /graphrag-kb/merge error: %s", exc) + raise HTTPException(status_code=500, detail=str(exc)) diff --git a/fastapi_app/source_manager.py b/fastapi_app/source_manager.py index 5778163..2645cbc 100644 --- a/fastapi_app/source_manager.py +++ b/fastapi_app/source_manager.py @@ -7,16 +7,19 @@ - Generating unified markdown for every source type - Reading back markdown / MinerU data for feature cards - Fallback to legacy kb_data / kb_mineru paths +- Structured chunk extraction with chunk_id / page_index / order / bbox (for GraphRAG) """ from __future__ import annotations import asyncio +import hashlib +import json import re import shutil import time from dataclasses import dataclass, field from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple from workflow_engine.logger import get_logger from workflow_engine.utils import get_project_root @@ -237,6 +240,85 @@ def ensure_sam3_dir(self, source_stem: str) -> Path: sam3_dir.mkdir(parents=True, exist_ok=True) return sam3_dir + def get_chunks_with_meta( + self, + source_stem: str, + chunk_size: int = 512, # 默认值与 settings.GRAPHRAG_CHUNK_SIZE 一致 + chunk_overlap: int = 64, # 默认值与 settings.GRAPHRAG_CHUNK_OVERLAP 一致 + ) -> List[Dict[str, Any]]: + """Return structured chunks for a single source, used by GraphRAG indexing. + + Each dict has keys: chunk_id, text, page_index, order, bbox, source_stem. + chunk_id = SHA1("{stem}:{order}")[:16], embedded as [chunk:ID] in input/*.txt. + Priority: MinerU content_list.json (exact page+bbox) → MinerU MD (estimated page) + → unified MD (page_index=-1). + """ + chunks: List[Dict[str, Any]] = [] + + # 1) MinerU content_list.json — exact page + bbox per block + mineru_root = self.get_mineru_root(source_stem) + if mineru_root: + content_list_path = None + # rglob to handle varying MinerU output directory layouts + for candidate in mineru_root.parent.rglob("*_content_list.json"): + content_list_path = candidate + break + if content_list_path and content_list_path.exists(): + try: + raw_blocks = json.loads( + content_list_path.read_text(encoding="utf-8") + ) + order = 0 + for block in raw_blocks: + # MinerU uses "text" or "content" depending on version + text = (block.get("text") or block.get("content") or "").strip() + if not text: + continue # skip image / formula blocks + # MinerU uses "page_idx" or "page_index" depending on version + page_idx = int(block.get("page_idx", block.get("page_index", -1))) + bbox = block.get("bbox") # [x1,y1,x2,y2] normalized, may be None + # chunk_id = SHA1("{stem}:{order}")[:16], embedded as [chunk:ID] in input/*.txt + chunk_id = hashlib.sha1( + f"{source_stem}:{order}".encode() + ).hexdigest()[:16] + chunks.append( + { + "chunk_id": chunk_id, + "text": text, + "page_index": page_idx, + "order": order, + "bbox": bbox, + "source_stem": source_stem, + } + ) + order += 1 + if chunks: + return chunks + except Exception as e: + log.debug( + "[SourceManager] content_list.json parse failed for %s: %s", + source_stem, + e, + ) + + # 2) MinerU markdown fallback — sliding window, estimated page_index + mineru_md = self.get_mineru_md(source_stem) + if mineru_md.strip(): + chunks = self._split_text_to_chunks( + mineru_md, source_stem, chunk_size, chunk_overlap, estimate_pages=True + ) + if chunks: + return chunks + + # 3) Unified markdown fallback — no page info (Word/PPT/TXT) + md = self.get_markdown(source_stem) + if md.strip(): + return self._split_text_to_chunks( + md, source_stem, chunk_size, chunk_overlap, estimate_pages=False + ) + + return [] + def get_all_markdowns(self) -> List[Tuple[str, str]]: """Return [(stem, markdown_text), ...] for all sources.""" results: List[Tuple[str, str]] = [] @@ -393,3 +475,49 @@ def _find_in_sources(self, source_stem: str, subdir: str, pattern: str) -> str: except Exception: continue return "" + + @staticmethod + def _split_text_to_chunks( + text: str, + source_stem: str, + chunk_size: int, + chunk_overlap: int, + estimate_pages: bool, + ) -> List[Dict[str, Any]]: + """Sliding-window character chunking fallback when content_list is unavailable. + + estimate_pages=True roughly estimates page_index at ~2000 chars/page. + """ + chunks: List[Dict[str, Any]] = [] + text = text.strip() + if not text: + return chunks + + total_chars = len(text) + step = max(1, chunk_size - chunk_overlap) + order = 0 + pos = 0 + chars_per_page = 2000 # rough estimate: ~2000 chars per page + + while pos < total_chars: + end = min(pos + chunk_size, total_chars) + snippet = text[pos:end].strip() + if snippet: + page_idx = int(pos / chars_per_page) if estimate_pages else -1 + chunk_id = hashlib.sha1( + f"{source_stem}:{order}".encode() + ).hexdigest()[:16] + chunks.append( + { + "chunk_id": chunk_id, + "text": snippet, + "page_index": page_idx, + "order": order, + "bbox": None, + "source_stem": source_stem, + } + ) + order += 1 + pos += step + + return chunks diff --git a/fastapi_app/workflow_adapters/wa_graphrag_kb.py b/fastapi_app/workflow_adapters/wa_graphrag_kb.py new file mode 100644 index 0000000..f15cdbf --- /dev/null +++ b/fastapi_app/workflow_adapters/wa_graphrag_kb.py @@ -0,0 +1,152 @@ +"""GraphRAG KB 管线的工作流适配层。 + +【职责】 + 在 FastAPI 路由(Pydantic 请求体)与 ``wf_graphrag_kb``(``GraphRAGKBState`` 数据类)之间做转换, + 统一调用 ``run_workflow("graphrag_kb", state)``,再从 ``agent_results`` / ``temp_data.errors`` 取结果。 + +【数据流】 + ``run_index`` / ``run_query`` / ``run_merge`` → 组装 ``GraphRAGKBRequest.action`` → + ``GraphRAGKBState`` → LangGraph 执行 → 成功则返回对应 ``agent_results`` 字典;失败则 ``RuntimeError``(携带首条错误信息)。 + +【约定】 + 与 ``wa_paper2ppt.py`` 类似:``_workflow_outcome`` 兼容 LangGraph 返回 dataclass 或 dict。 +""" +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Tuple + +from workflow_engine.logger import get_logger +from workflow_engine.workflow import run_workflow +from workflow_engine.workflow.wf_graphrag_kb import GraphRAGKBRequest, GraphRAGKBState + +log = get_logger(__name__) + + +def _workflow_outcome(state: Any) -> Tuple[Dict[str, Any], Optional[list]]: + """统一解析工作流终态:得到 ``(agent_results, errors)``,兼容 dict 与 dataclass 两种返回形式。""" + if isinstance(state, dict): + td = state.get("temp_data") + td = td if isinstance(td, dict) else {} + errors = td.get("errors") + ar = state.get("agent_results") + ar = ar if isinstance(ar, dict) else {} + return ar, errors + td = getattr(state, "temp_data", None) + td = td if isinstance(td, dict) else {} + errors = td.get("errors") + ar = getattr(state, "agent_results", None) + ar = ar if isinstance(ar, dict) else {} + return ar, errors + + +# --------------------------------------------------------------------------- +# Public adapter functions (called by routers) +# --------------------------------------------------------------------------- + +async def run_index( + *, + notebook_id: str, + notebook_title: str, + email: str, + api_url: str, + api_key: str, + model: str, + source_stems: Optional[List[str]] = None, + workspace_dir: str = "", + force_reindex: bool = False, + parse_pdfs: bool = True, + skip_kggen: bool = True, +) -> Dict[str, Any]: + """Run indexing workflow; returns ``agent_results["index"]`` dict on success.""" + req = GraphRAGKBRequest( + action="index", + notebook_id=notebook_id, + notebook_title=notebook_title, + email=email, + chat_api_url=api_url, + api_key=api_key, + model=model, + source_stems=source_stems or [], + workspace_dir=workspace_dir, + force_reindex=force_reindex, + parse_pdfs=parse_pdfs, + skip_kggen=skip_kggen, + ) + state = GraphRAGKBState(request=req) + state = await run_workflow("graphrag_kb", state) + + agent_results, errors = _workflow_outcome(state) + if errors: + raise RuntimeError(f"Indexing failed: {errors[0]}") + + return agent_results.get("index", {}) + + +async def run_query( + *, + notebook_id: str, + notebook_title: str, + email: str, + api_url: str, + api_key: str, + model: str, + question: str, + search_method: str = "local", + workspace_dir: str = "", +) -> Dict[str, Any]: + """Run query workflow; returns ``agent_results["query"]`` dict on success.""" + req = GraphRAGKBRequest( + action="query", + notebook_id=notebook_id, + notebook_title=notebook_title, + email=email, + chat_api_url=api_url, + api_key=api_key, + model=model, + question=question, + search_method=search_method, + workspace_dir=workspace_dir, + ) + state = GraphRAGKBState(request=req) + state = await run_workflow("graphrag_kb", state) + + agent_results, errors = _workflow_outcome(state) + if errors: + raise RuntimeError(f"Query failed: {errors[0]}") + + return agent_results.get("query", {}) + + +async def run_merge( + *, + notebook_id: str, + notebook_title: str, + email: str, + api_url: str, + api_key: str, + model: str, + workspace_dir_a: str, + workspace_dir_b: str, + dedupe: bool = False, +) -> Dict[str, Any]: + """Merge two GraphRAG workspaces and re-index; returns ``agent_results["merge"]``.""" + req = GraphRAGKBRequest( + action="merge", + notebook_id=notebook_id, + notebook_title=notebook_title, + email=email, + chat_api_url=api_url, + api_key=api_key, + model=model, + workspace_dir=workspace_dir_a, + workspace_dir_b=workspace_dir_b, + dedupe=dedupe, + ) + state = GraphRAGKBState(request=req) + state = await run_workflow("graphrag_kb", state) + + agent_results, errors = _workflow_outcome(state) + if errors: + raise RuntimeError(f"Merge failed: {errors[0]}") + + return agent_results.get("merge", {}) diff --git a/frontend_en/package-lock.json b/frontend_en/package-lock.json index 873bc08..df09701 100644 --- a/frontend_en/package-lock.json +++ b/frontend_en/package-lock.json @@ -19,6 +19,7 @@ "react-dom": "^18.2.0", "react-markdown": "^9.1.0", "react-pdf": "^10.3.0", + "rehype-raw": "^7.0.0", "tailwind-merge": "^2.0.0", "zustand": "^4.4.7" }, @@ -2632,6 +2633,17 @@ "integrity": "sha512-f/ZeWvW/BCXbhGEf1Ujp29EASo/lk1FDnETgNKwJrsVvGZhUWCZyg3xLJjAsxfOmt8KjswHmI5EwCQcPMpOYhQ==", "license": "EPL-2.0" }, + "node_modules/entities": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/esbuild": { "version": "0.21.5", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.21.5.tgz", @@ -2849,6 +2861,71 @@ "node": ">= 0.4" } }, + "node_modules/hast-util-from-parse5": { + "version": "8.0.3", + "resolved": "https://registry.npmjs.org/hast-util-from-parse5/-/hast-util-from-parse5-8.0.3.tgz", + "integrity": "sha512-3kxEVkEKt0zvcZ3hCRYI8rqrgwtlIOFMWkbclACvjlDw8Li9S2hk/d51OI0nr/gIpdMHNepwgOKqZ/sy0Clpyg==", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "devlop": "^1.0.0", + "hastscript": "^9.0.0", + "property-information": "^7.0.0", + "vfile": "^6.0.0", + "vfile-location": "^5.0.0", + "web-namespaces": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-parse5/node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", + "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==" + }, + "node_modules/hast-util-parse-selector": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz", + "integrity": "sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==", + "dependencies": { + "@types/hast": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/hast-util-raw/-/hast-util-raw-9.1.0.tgz", + "integrity": "sha512-Y8/SBAHkZGoNkpzqqfCldijcuUKh7/su31kEBp67cFY09Wy0mTRgtsLYsiIxMJxlu0f6AA5SUTbDR8K0rxnbUw==", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "@ungap/structured-clone": "^1.0.0", + "hast-util-from-parse5": "^8.0.0", + "hast-util-to-parse5": "^8.0.0", + "html-void-elements": "^3.0.0", + "mdast-util-to-hast": "^13.0.0", + "parse5": "^7.0.0", + "unist-util-position": "^5.0.0", + "unist-util-visit": "^5.0.0", + "vfile": "^6.0.0", + "web-namespaces": "^2.0.0", + "zwitch": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", + "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==" + }, "node_modules/hast-util-to-jsx-runtime": { "version": "2.3.6", "resolved": "https://registry.npmjs.org/hast-util-to-jsx-runtime/-/hast-util-to-jsx-runtime-2.3.6.tgz", @@ -2882,6 +2959,24 @@ "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", "license": "MIT" }, + "node_modules/hast-util-to-parse5": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/hast-util-to-parse5/-/hast-util-to-parse5-8.0.1.tgz", + "integrity": "sha512-MlWT6Pjt4CG9lFCjiz4BH7l9wmrMkfkJYCxFwKQic8+RTZgWPuWxwAfjJElsXkex7DJjfSJsQIt931ilUgmwdA==", + "dependencies": { + "@types/hast": "^3.0.0", + "comma-separated-tokens": "^2.0.0", + "devlop": "^1.0.0", + "property-information": "^7.0.0", + "space-separated-tokens": "^2.0.0", + "web-namespaces": "^2.0.0", + "zwitch": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-whitespace": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz", @@ -2895,6 +2990,22 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/hastscript": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-9.0.1.tgz", + "integrity": "sha512-g7df9rMFX/SPi34tyGCyUBREQoKkapwdY/T04Qn9TDWfHhAYt4/I0gMVirzK5wEzeUqIjEB+LXC/ypb7Aqno5w==", + "dependencies": { + "@types/hast": "^3.0.0", + "comma-separated-tokens": "^2.0.0", + "hast-util-parse-selector": "^4.0.0", + "property-information": "^7.0.0", + "space-separated-tokens": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/html-url-attributes": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz", @@ -2905,6 +3016,15 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/html-void-elements": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/html-void-elements/-/html-void-elements-3.0.0.tgz", + "integrity": "sha512-bEqo66MRXsUGxWHV5IP0PUiAWwoEjba4VCzg0LjFJBpchPaTfyfCKTG6bc5F8ucKec3q5y6qOdGyYTSBEvhCrg==", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/iceberg-js": { "version": "0.8.1", "resolved": "https://registry.npmjs.org/iceberg-js/-/iceberg-js-0.8.1.tgz", @@ -5773,6 +5893,17 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/parse5": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz", + "integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==", + "dependencies": { + "entities": "^6.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/path-parse": { "version": "1.0.7", "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", @@ -6149,6 +6280,20 @@ "node": ">=8.10.0" } }, + "node_modules/rehype-raw": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/rehype-raw/-/rehype-raw-7.0.0.tgz", + "integrity": "sha512-/aE8hCfKlQeA8LmyeyQvQF3eBiLRGNlfBJEvWH7ivp9sBqs7TNqBL5X3v157rM4IFETqDnIOO+z5M/biZbo9Ww==", + "dependencies": { + "@types/hast": "^3.0.0", + "hast-util-raw": "^9.0.0", + "vfile": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/remark-parse": { "version": "11.0.0", "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz", @@ -7361,6 +7506,24 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/vfile-location": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/vfile-location/-/vfile-location-5.0.3.tgz", + "integrity": "sha512-5yXvWDEgqeiYiBe1lbxYF7UMAIm/IcopxMHrMQDq3nvKcjPKIhZklUKL+AE7J7uApI4kwe2snsK+eI6UTj9EHg==", + "dependencies": { + "@types/unist": "^3.0.0", + "vfile": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile-location/node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", + "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==" + }, "node_modules/vfile-message": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.3.tgz", @@ -7469,6 +7632,15 @@ "loose-envify": "^1.0.0" } }, + "node_modules/web-namespaces": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/web-namespaces/-/web-namespaces-2.0.1.tgz", + "integrity": "sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ==", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/web-worker": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/web-worker/-/web-worker-1.5.0.tgz", diff --git a/frontend_en/package.json b/frontend_en/package.json index b6d9db4..d2b2fac 100644 --- a/frontend_en/package.json +++ b/frontend_en/package.json @@ -20,6 +20,7 @@ "react-dom": "^18.2.0", "react-markdown": "^9.1.0", "react-pdf": "^10.3.0", + "rehype-raw": "^7.0.0", "tailwind-merge": "^2.0.0", "zustand": "^4.4.7" }, diff --git a/frontend_en/src/components/graphrag-kb/GraphRAGKbPanel.tsx b/frontend_en/src/components/graphrag-kb/GraphRAGKbPanel.tsx new file mode 100644 index 0000000..98b148d --- /dev/null +++ b/frontend_en/src/components/graphrag-kb/GraphRAGKbPanel.tsx @@ -0,0 +1,757 @@ +/** + * GraphRAG 知识库侧栏 UI:索引构建、Local/Global 查询、推理子图展示、文档定位卡片、合并工作区。 + * + * 数据流:用户操作 → ``graphragKbService`` → 后端管线 → ``queryResult`` 状态; + * 「在知识库中打开」通过 ``onOpenGraphragSource`` 回调把 sourceStem、chunkId、workspaceDir 交给 NotebookView,联动阅读器与高亮。 + */ +import React, { useCallback, useEffect, useMemo, useState } from 'react'; +import ReactMarkdown from 'react-markdown'; +import { Loader2, Copy, Download, ChevronDown, ChevronRight, Network, ExternalLink } from 'lucide-react'; +import { getApiSettings } from '../../services/apiSettingsService'; +import { + indexGraphragKb, + queryGraphragKb, + mergeGraphragKb, + defaultGraphragModel, +} from '../../services/graphragKbService'; +import type { QueryResponse, GraphragWorkspacePersist } from '../../types/graphragKb'; +import { MermaidPreview } from '../knowledge-base/tools/MermaidPreview'; + +const KNOWN_HINT_KEYS = ['page', 'page_num', 'bbox', 'sentence', 'text', 'chunk_id', 'source', 'file', 'file_name']; + +/** 与阅读器联动时传入的载荷(source_stem 对应知识库里的文件名 stem) */ +export type GraphragOpenSourcePayload = { + sourceStem: string; + pageIndex: number; + chunkId?: string; + /** 当前笔记本 GraphRAG 工作区根目录,用于拉取 ``[chunk:…]`` 原文高亮 */ + workspaceDir?: string; +}; + +function _parsePageIndex(v: unknown): number | undefined { + if (typeof v === 'number' && !Number.isNaN(v)) return v; + if (typeof v === 'string') { + const n = parseInt(v, 10); + return Number.isNaN(n) ? undefined : n; + } + return undefined; +} + +function getWorkspaceStorageKey(userId: string, notebookId: string) { + return `graphrag_workspace_${userId}_${notebookId}`; +} + +function sanitizeMermaidLabel(s: string, max = 48): string { + return s.replace(/["[\]#]/g, ' ').slice(0, max).trim() || '?'; +} + +/** 将 reasoning_subgraph 转为 Mermaid graph LR(边数上限避免卡顿) */ +export function reasoningSubgraphToMermaid(edges: Array>, maxEdges = 36): string | null { + if (!edges.length) return null; + const slice = edges.slice(0, maxEdges); + const idFor = (() => { + const m = new Map(); + let n = 0; + return (raw: string) => { + const k = raw || `_${n}`; + if (!m.has(k)) m.set(k, `N${n++}`); + return m.get(k)!; + }; + })(); + const lines: string[] = ['graph LR']; + for (let i = 0; i < slice.length; i++) { + const e = slice[i]; + const src = String(e.source ?? e.src ?? e.from ?? e.head ?? `s${i}`); + const tgt = String(e.target ?? e.tgt ?? e.to ?? e.tail ?? `t${i}`); + const rel = String(e.relation ?? e.relationship ?? e.label ?? e.predicate ?? ''); + const sid = idFor(src); + const tid = idFor(tgt); + const sl = sanitizeMermaidLabel(src); + const tl = sanitizeMermaidLabel(tgt); + const rl = sanitizeMermaidLabel(rel, 24); + lines.push(` ${sid}["${sl}"] -->|"${rl}"| ${tid}["${tl}"]`); + } + return lines.join('\n'); +} + +const STR = { + zh: { + headerTitle: 'GraphRAG 知识库', + headerSub: '分块(MinerU)+ GraphRAG 建索引与检索(用户路径不含 KGGen)', + apiWarn: '请先在设置中配置 API URL 与 API Key', + noNotebook: '缺少笔记本 ID', + indexBtn: '构建索引', + indexing: '索引构建中…', + indexOk: '索引构建完成', + forceReindex: '强制重建', + parsePdfs: '解析 PDF(MinerU)', + summary: '上次索引摘要', + chunks: '分块数', + workspace: '工作区目录', + copy: '复制', + copied: '已复制', + queryQ: '问题', + queryPlaceholder: '输入要问的问题…', + searchLocal: 'Local', + searchGlobal: 'Global', + queryBtn: '查询', + querying: '查询中…', + answer: '回答', + judge: 'Judge 分数', + rationale: '说明', + subgraph: '推理子图', + viewTable: '表格', + viewMermaid: 'Mermaid', + viewJson: 'JSON', + noSubgraph: '无子图数据', + subgraphCot: '最小子图推理(CoT / 跳数)', + hintDoc: '文档', + hintPage: '页码', + hintBbox: '区域框', + openInKb: '在知识库中打开', + hints: '文档定位', + context: 'context_data(体积可能较大)', + downloadJson: '下载 JSON', + copyJson: '复制 JSON', + mergeTitle: '合并工作区', + mergeA: 'workspace_dir A', + mergeB: 'workspace_dir B', + dedupe: '去重合并', + mergeBtn: '合并并重建索引', + merging: '合并中…', + mergeOk: '合并完成', + modelLabel: 'LLM 模型名', + mermaidTitle: '子图(Mermaid)', + copyFailed: '复制失败', + }, + en: { + headerTitle: 'GraphRAG Knowledge Base', + headerSub: 'Chunking (MinerU) + GraphRAG index & query (no KGGen on the default path)', + apiWarn: 'Configure API URL and API Key in Settings first', + noNotebook: 'Notebook ID is missing', + indexBtn: 'Build index', + indexing: 'Indexing…', + indexOk: 'Index completed', + forceReindex: 'Force reindex', + parsePdfs: 'Parse PDFs (MinerU)', + summary: 'Last index summary', + chunks: 'Chunks', + workspace: 'Workspace directory', + copy: 'Copy', + copied: 'Copied', + queryQ: 'Question', + queryPlaceholder: 'Ask a question…', + searchLocal: 'Local', + searchGlobal: 'Global', + queryBtn: 'Query', + querying: 'Querying…', + answer: 'Answer', + judge: 'Judge score', + rationale: 'Rationale', + subgraph: 'Reasoning subgraph', + viewTable: 'Table', + viewMermaid: 'Mermaid', + viewJson: 'JSON', + noSubgraph: 'No subgraph', + subgraphCot: 'Minimal subgraph reasoning (CoT / hops)', + hintDoc: 'Document', + hintPage: 'Page', + hintBbox: 'BBox', + openInKb: 'Open in knowledge base', + hints: 'Source location', + context: 'context_data (may be large)', + downloadJson: 'Download JSON', + copyJson: 'Copy JSON', + mergeTitle: 'Merge workspaces', + mergeA: 'workspace_dir A', + mergeB: 'workspace_dir B', + dedupe: 'Deduplicate when merging', + mergeBtn: 'Merge and re-index', + merging: 'Merging…', + mergeOk: 'Merge completed', + modelLabel: 'LLM model', + mermaidTitle: 'Subgraph (Mermaid)', + copyFailed: 'Copy failed', + }, +} as const; + +export interface GraphRAGKbPanelProps { + notebook: { id?: string; title?: string; name?: string }; + userId: string | null; + email: string; + locale?: 'zh' | 'en'; + showToast: (message: string, type?: 'success' | 'error' | 'warning') => void; + /** 在笔记本侧栏打开对应来源并展示 MinerU 解析内容(按 stem 匹配文件名) */ + onOpenGraphragSource?: (payload: GraphragOpenSourcePayload) => void | Promise; +} + +export function GraphRAGKbPanel({ + notebook, + userId, + email, + locale = 'zh', + showToast, + onOpenGraphragSource, +}: GraphRAGKbPanelProps) { + const L = STR[locale]; + const notebookId = notebook?.id || ''; + const notebookTitle = notebook?.title || notebook?.name || ''; + + const [persist, setPersist] = useState(null); + const [forceReindex, setForceReindex] = useState(false); + const [parsePdfs, setParsePdfs] = useState(true); + const [indexLoading, setIndexLoading] = useState(false); + const [modelName, setModelName] = useState(defaultGraphragModel()); + + const [question, setQuestion] = useState(''); + const [searchMethod, setSearchMethod] = useState<'local' | 'global'>('local'); + const [queryLoading, setQueryLoading] = useState(false); + const [queryResult, setQueryResult] = useState(null); + const [subView, setSubView] = useState<'table' | 'mermaid' | 'json'>('table'); + const [contextOpen, setContextOpen] = useState(false); + + const [mergeA, setMergeA] = useState(''); + const [mergeB, setMergeB] = useState(''); + const [mergeDedupe, setMergeDedupe] = useState(false); + const [mergeLoading, setMergeLoading] = useState(false); + + const storageKey = useMemo(() => { + const uid = userId || 'global'; + if (!notebookId) return null; + return getWorkspaceStorageKey(uid, notebookId); + }, [userId, notebookId]); + + const loadPersist = useCallback(() => { + if (!storageKey) { + setPersist(null); + return; + } + try { + const raw = localStorage.getItem(storageKey); + if (!raw) { + setPersist(null); + return; + } + const p = JSON.parse(raw) as GraphragWorkspacePersist; + if (p?.workspace_dir) setPersist(p); + else setPersist(null); + } catch { + setPersist(null); + } + }, [storageKey]); + + useEffect(() => { + loadPersist(); + }, [loadPersist]); + + useEffect(() => { + if (persist?.workspace_dir) { + setMergeA((a) => (a ? a : persist.workspace_dir)); + } + }, [persist?.workspace_dir]); + + const llmBody = useCallback(() => { + const settings = getApiSettings(userId); + const api_url = settings?.apiUrl?.trim() || ''; + const api_key = settings?.apiKey?.trim() || ''; + const model = modelName.trim() || defaultGraphragModel(); + return { api_url, api_key, model }; + }, [userId, modelName]); + + const copyText = async (text: string, okMsg?: string) => { + try { + await navigator.clipboard.writeText(text); + showToast(okMsg || L.copied, 'success'); + } catch { + showToast(L.copyFailed, 'error'); + } + }; + + const handleIndex = async () => { + if (!notebookId) { + showToast(L.noNotebook, 'warning'); + return; + } + const { api_url, api_key, model } = llmBody(); + if (!api_url || !api_key) { + showToast(L.apiWarn, 'warning'); + return; + } + setIndexLoading(true); + try { + const res = await indexGraphragKb({ + notebook_id: notebookId, + notebook_title: notebookTitle, + email: email || '', + api_url, + api_key, + model, + source_stems: null, + workspace_dir: persist?.workspace_dir || '', + force_reindex: forceReindex, + parse_pdfs: parsePdfs, + skip_kggen: true, + }); + const next: GraphragWorkspacePersist = { + workspace_dir: res.workspace_dir, + updatedAt: Date.now(), + num_chunks: res.num_chunks, + }; + if (storageKey) { + localStorage.setItem(storageKey, JSON.stringify(next)); + } + setPersist(next); + showToast(L.indexOk, 'success'); + } catch (e: unknown) { + const msg = e instanceof Error ? e.message : String(e); + showToast(msg, 'error'); + } finally { + setIndexLoading(false); + } + }; + + const handleQuery = async () => { + if (!notebookId) { + showToast(L.noNotebook, 'warning'); + return; + } + const ws = persist?.workspace_dir?.trim(); + if (!ws) { + showToast(locale === 'zh' ? '请先构建索引或确认已持久化 workspace_dir' : 'Build index first or set workspace_dir', 'warning'); + return; + } + const q = question.trim(); + if (!q) { + showToast(locale === 'zh' ? '请输入问题' : 'Enter a question', 'warning'); + return; + } + const { api_url, api_key, model } = llmBody(); + if (!api_url || !api_key) { + showToast(L.apiWarn, 'warning'); + return; + } + setQueryLoading(true); + setQueryResult(null); + try { + const res = await queryGraphragKb({ + notebook_id: notebookId, + notebook_title: notebookTitle, + email: email || '', + api_url, + api_key, + model, + question: q, + search_method: searchMethod, + workspace_dir: ws, + }); + setQueryResult(res); + } catch (e: unknown) { + const msg = e instanceof Error ? e.message : String(e); + showToast(msg, 'error'); + } finally { + setQueryLoading(false); + } + }; + + const handleMerge = async () => { + if (!notebookId) { + showToast(L.noNotebook, 'warning'); + return; + } + const a = mergeA.trim(); + const b = mergeB.trim(); + if (!a || !b) { + showToast(locale === 'zh' ? '请填写两个 workspace 路径' : 'Enter both workspace paths', 'warning'); + return; + } + const { api_url, api_key, model } = llmBody(); + if (!api_url || !api_key) { + showToast(L.apiWarn, 'warning'); + return; + } + setMergeLoading(true); + try { + const res = await mergeGraphragKb({ + notebook_id: notebookId, + notebook_title: notebookTitle, + email: email || '', + api_url, + api_key, + model, + workspace_dir_a: a, + workspace_dir_b: b, + dedupe: mergeDedupe, + }); + const next: GraphragWorkspacePersist = { + workspace_dir: res.merged_workspace_dir, + updatedAt: Date.now(), + num_chunks: res.num_chunks, + }; + if (storageKey) localStorage.setItem(storageKey, JSON.stringify(next)); + setPersist(next); + setMergeA(res.merged_workspace_dir); + showToast(L.mergeOk, 'success'); + } catch (e: unknown) { + const msg = e instanceof Error ? e.message : String(e); + showToast(msg, 'error'); + } finally { + setMergeLoading(false); + } + }; + + const mermaidCode = useMemo(() => { + if (!queryResult?.reasoning_subgraph?.length) return null; + return reasoningSubgraphToMermaid(queryResult.reasoning_subgraph); + }, [queryResult?.reasoning_subgraph]); + + const contextJson = useMemo(() => { + if (!queryResult?.context_data) return ''; + try { + return JSON.stringify(queryResult.context_data, null, 2); + } catch { + return '{}'; + } + }, [queryResult?.context_data]); + + const downloadContext = () => { + const blob = new Blob([contextJson], { type: 'application/json' }); + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = `graphrag_context_${notebookId || 'nb'}.json`; + a.click(); + URL.revokeObjectURL(url); + }; + + const judgePct = queryResult ? Math.round(Math.max(0, Math.min(1, queryResult.judge_score)) * 100) : 0; + + return ( +
+
+ +
+
{L.headerTitle}
+
{L.headerSub}
+
+
+ +
+
+

{L.indexBtn}

+
+ + +
+
+ + setModelName(e.target.value)} + className="w-full max-w-md px-3 py-2 border border-ios-gray-200 rounded-lg text-sm" + placeholder={defaultGraphragModel()} + /> +
+ + + {persist && ( +
+
{L.summary}
+
+ {L.chunks}: {persist.num_chunks ?? '—'} +
+
+ {L.workspace}: + {persist.workspace_dir} + +
+
+ )} +
+ +
+

{L.queryBtn}

+
+ +