diff --git a/README_JUPYTER.md b/README_JUPYTER.md new file mode 100644 index 0000000..101160a --- /dev/null +++ b/README_JUPYTER.md @@ -0,0 +1,195 @@ +# web2json-agent Jupyter Guide + +这个文档专门基于 [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) 来写,目标是在 Jupyter 里直接跑完整 `jsonl -> html -> classify -> schema -> code -> data` 流水线。 + +它不覆盖项目原始 [README.md](/Users/luqing/Downloads/multiModal/web2json-agent/README.md)。 + +## 这份文档对应哪条执行链路 + +这里用的不是最简单的 `extract_data(...)` 单接口方案,而是项目里的完整脚本流水线: + +- 入口脚本: [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) +- Jupyter 包装: [jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/jupyter_helper.py) +- Notebook helper 实现: [notebooks/jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/jupyter_helper.py) +- 示例 notebook: [notebooks/web2json_quickstart.ipynb](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/web2json_quickstart.ipynb) + +## 流水线做了什么 + +脚本会按下面顺序执行: + +1. 读取 `jsonl` +2. 从每条记录里取出 `html` 字段 +3. 拆成一批 `.html` 文件,并生成 `manifest.jsonl` +4. 对 HTML 做 `classify_html_dir` +5. 对每个 cluster 执行 `extract_schema` +6. 执行 `infer_code` +7. 用生成的 parser 执行 `extract_data_with_code` +8. 输出 `pipeline_summary.json` + +适合这种输入数据: + +- 原始数据是 `jsonl` +- 每行是一条网页记录 +- 每条记录里有 `html` 字段 +- 可能还带 `url`、`track_id`、`status` + +## Jupyter 最短路径 + +### 1. 进入项目目录 + +```bash +cd /Users/luqing/Downloads/multiModal/web2json-agent +``` + +### 2. 安装项目 + +请显式使用 `python3.11`,不要用系统默认的旧版 `python3`。 + +```bash +python3.11 -m pip install . +``` + +### 3. 启动 Jupyter + +```bash +python3.11 -m notebook +``` + +或者: + +```bash +python3.11 -m jupyter lab +``` + +### 4. 
打开示例 notebook + +打开: + +`notebooks/web2json_quickstart.ipynb` + +## Notebook 最小示例 + +### Cell 1: 初始化环境 + +```python +from jupyter_helper import prepare_notebook + +prepare_notebook( + api_key="YOUR_API_KEY", + api_base="https://api.openai.com/v1", +) +``` + +### Cell 2: 运行完整 JSONL pipeline + +```python +from jupyter_helper import run_jsonl_pipeline, summarize_pipeline_result + +result = run_jsonl_pipeline( + source_jsonl="ToClassify/sample.json", + work_id="sample_run", + input_root="input_html", + output_root="output", + html_key="html", + iteration_rounds=3, + cluster_limit=1, +) + +summarize_pipeline_result(result) +``` + +### Cell 3: 查看完整结果 + +```python +result.to_dict() +``` + +## 也可以直接调用原脚本 + +如果你不想通过 helper,也可以在 notebook 里直接 import 原脚本里的函数: + +```python +from scripts.run_jsonl_web2json_pipeline import run_jsonl_pipeline + +result = run_jsonl_pipeline( + source_jsonl="ToClassify/sample.json", + work_id="sample_run", +) +``` + +这就是 [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) 里新增的 notebook-friendly 入口。 + +## 参数说明 + +`run_jsonl_pipeline(...)` 主要参数: + +- `source_jsonl`: 源 `jsonl` 路径 +- `work_id`: 这次运行的标识;为空时按文件名自动生成 +- `input_root`: 拆分后 HTML 的输出根目录,默认 `input_html` +- `output_root`: pipeline 输出根目录,默认 `output` +- `html_key`: `jsonl` 中 HTML 字段名,默认 `html` +- `iteration_rounds`: schema 学习轮数上限,默认 `3` +- `cluster_limit`: 最多处理多少个 cluster,默认 `0`,表示全部 + +## 结果会落到哪里 + +如果你设置: + +```python +result = run_jsonl_pipeline( + source_jsonl="ToClassify/sample.json", + work_id="sample_run", +) +``` + +通常会生成: + +- `input_html/sample_run/` +- `output/sample_run_pipeline/` +- `output/sample_run_pipeline/pipeline_summary.json` + +每个 cluster 下面还会有: + +- schema 输出目录 +- code 输出目录 +- data 输出目录 +- 最终 parser 文件 + +## API Key 配置 + +你可以二选一: + +### 方式 A: 在 notebook 里设置 + +```python +from jupyter_helper import prepare_notebook + +prepare_notebook( + api_key="YOUR_API_KEY", + 
api_base="https://api.openai.com/v1", +) +``` + +### 方式 B: 在项目根目录放 `.env` + +```env +OPENAI_API_KEY=YOUR_API_KEY +OPENAI_API_BASE=https://api.openai.com/v1 +DEFAULT_MODEL=gpt-4.1 +``` + +## 已知前提 + +- Python 要求 `>= 3.10` +- 当前这台机器上默认 `python3` 是旧的 `3.7.3` +- 建议始终显式使用 `python3.11` +- 这条流水线依赖模型 API,可用前需要配置好 key/base + +## 相关文件 + +- [README.md](/Users/luqing/Downloads/multiModal/web2json-agent/README.md) +- [README_JUPYTER.md](/Users/luqing/Downloads/multiModal/web2json-agent/README_JUPYTER.md) +- [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) +- [jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/jupyter_helper.py) +- [notebooks/jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/jupyter_helper.py) +- [notebooks/web2json_quickstart.ipynb](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/web2json_quickstart.ipynb) diff --git a/jupyter_helper.py b/jupyter_helper.py new file mode 100644 index 0000000..d82c824 --- /dev/null +++ b/jupyter_helper.py @@ -0,0 +1,3 @@ +"""Compatibility wrapper so notebooks can import jupyter_helper from multiple locations.""" + +from notebooks.jupyter_helper import * # noqa: F401,F403 diff --git a/notebooks/jupyter_helper.py b/notebooks/jupyter_helper.py new file mode 100644 index 0000000..801d70e --- /dev/null +++ b/notebooks/jupyter_helper.py @@ -0,0 +1,136 @@ +"""Utilities for running web2json-agent inside Jupyter notebooks.""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path +from typing import Any, Optional, Sequence + +PROJECT_ROOT = Path(__file__).resolve().parents[1] + + +def prepare_notebook( + api_key: Optional[str] = None, + api_base: Optional[str] = None, + project_root: Optional[str] = None, +) -> Path: + """Prepare the notebook process for local package imports and env loading.""" + root = Path(project_root).expanduser().resolve() if 
project_root else PROJECT_ROOT + + if str(root) not in sys.path: + sys.path.insert(0, str(root)) + + os.chdir(root) + + if api_key: + os.environ["OPENAI_API_KEY"] = api_key + + if api_base: + os.environ["OPENAI_API_BASE"] = api_base + + return root + + +def make_extract_config( + name: str, + html_path: str, + output_path: str = "output", + save: Optional[Sequence[str]] = ("schema", "code", "data"), + schema: Optional[dict[str, Any]] = None, + iteration_rounds: int = 3, + enable_schema_edit: bool = False, + remove_null_fields: bool = True, + parser_code: Optional[str] = None, +): + """Build a Web2JsonConfig with notebook-friendly path resolution.""" + prepare_notebook() + + from web2json import Web2JsonConfig + + html_target = _resolve_project_path(html_path) + output_target = _resolve_project_path(output_path) + + return Web2JsonConfig( + name=name, + html_path=str(html_target), + output_path=str(output_target), + iteration_rounds=iteration_rounds, + schema=schema, + enable_schema_edit=enable_schema_edit, + parser_code=parser_code, + save=list(save) if save is not None else None, + remove_null_fields=remove_null_fields, + ) + + +def preview_records(records: Sequence[dict[str, Any]], limit: int = 3) -> list[dict[str, Any]]: + """Return the first few parsed records so a notebook cell renders them directly.""" + return list(records[:limit]) + + +def print_schema(schema: dict[str, Any]) -> None: + """Pretty print schema content inside notebooks.""" + print(json.dumps(schema, ensure_ascii=False, indent=2)) + + +def summarize_cluster_result(cluster_result: Any) -> dict[str, Any]: + """Convert a cluster result into a compact notebook-friendly summary.""" + return { + "cluster_count": cluster_result.cluster_count, + "clusters": {name: len(files) for name, files in cluster_result.clusters.items()}, + "noise_files": len(cluster_result.noise_files), + } + + +def run_jsonl_pipeline( + source_jsonl: str, + work_id: str = "", + input_root: str = "input_html", + output_root: str 
= "output", + html_key: str = "html", + iteration_rounds: int = 3, + cluster_limit: int = 0, +): + """Run the full JSONL pipeline from a notebook and return the structured summary.""" + prepare_notebook() + + from scripts.run_jsonl_web2json_pipeline import run_jsonl_pipeline as _run_jsonl_pipeline + + return _run_jsonl_pipeline( + source_jsonl=str(_resolve_project_path(source_jsonl)), + work_id=work_id, + input_root=str(_resolve_project_path(input_root)), + output_root=str(_resolve_project_path(output_root)), + html_key=html_key, + iteration_rounds=iteration_rounds, + cluster_limit=cluster_limit, + ) + + +def summarize_pipeline_result(result: Any) -> dict[str, Any]: + """Build a compact summary view for notebook display.""" + return { + "source_jsonl": result.source_jsonl, + "pipeline_root": result.pipeline_root, + "cluster_count": result.cluster_count, + "clusters": [ + { + "cluster_name": cluster["cluster_name"], + "cluster_size": cluster["cluster_size"], + "parse_success_count": cluster["parse_success_count"], + "parse_failed_count": cluster["parse_failed_count"], + } + for cluster in result.clusters + ], + "total_token_usage": result.total_token_usage, + "summary_path": result.summary_path, + } + + +def _resolve_project_path(path_str: str) -> Path: + path = Path(path_str).expanduser() + if path.is_absolute(): + return path + return (PROJECT_ROOT / path).resolve() diff --git a/notebooks/web2json_quickstart.ipynb b/notebooks/web2json_quickstart.ipynb new file mode 100644 index 0000000..da203aa --- /dev/null +++ b/notebooks/web2json_quickstart.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# web2json-agent JSONL Pipeline Quickstart\n", + "\n", + "这个 notebook 基于 `scripts/run_jsonl_web2json_pipeline.py`,按顺序运行下面几个单元即可。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import sys\n", + "\n", + "PROJECT_ROOT = 
Path.cwd()\n", + "if PROJECT_ROOT.name == \"notebooks\":\n", + " PROJECT_ROOT = PROJECT_ROOT.parent\n", + "if str(PROJECT_ROOT) not in sys.path:\n", + " sys.path.insert(0, str(PROJECT_ROOT))\n", + "\n", + "from jupyter_helper import prepare_notebook\n", + "\n", + "PROJECT_ROOT = prepare_notebook(\n", + " api_key=\"YOUR_API_KEY\",\n", + " api_base=\"https://api.openai.com/v1\", # 如果你使用兼容网关,请替换这里\n", + ")\n", + "\n", + "PROJECT_ROOT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from jupyter_helper import run_jsonl_pipeline, summarize_pipeline_result\n", + "\n", + "result = run_jsonl_pipeline(\n", + " source_jsonl=\"ToClassify/sample.json\",\n", + " work_id=\"sample_run\",\n", + " input_root=\"input_html\",\n", + " output_root=\"output\",\n", + " html_key=\"html\",\n", + " iteration_rounds=3,\n", + " cluster_limit=1,\n", + ")\n", + "\n", + "summarize_pipeline_result(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result.to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result.summary_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_dir = PROJECT_ROOT / \"output\" / \"sample_run_pipeline\"\n", + "output_dir" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/build_html_manifest.py b/scripts/build_html_manifest.py new file mode 100644 index 0000000..8d1508f --- /dev/null +++ b/scripts/build_html_manifest.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +""" +从 crawl jsonl 拆分出 HTML 文件,并生成 manifest.jsonl 索引。 + +示例: +python scripts/build_html_manifest.py \ + --source 
ToClassify/example.jsonl \ + --output-dir input_html/example_set +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Split a crawl jsonl into numbered HTML files plus manifest.jsonl." + ) + parser.add_argument( + "--source", + required=True, + help="源 jsonl 文件路径,每行应至少包含 html 字段。", + ) + parser.add_argument( + "--output-dir", + required=True, + help="输出目录,会写入 0001.html... 和 manifest.jsonl。", + ) + parser.add_argument( + "--html-key", + default="html", + help="HTML 内容字段名,默认 html。", + ) + parser.add_argument( + "--start-index", + type=int, + default=1, + help="输出编号起始值,默认 1。", + ) + parser.add_argument( + "--width", + type=int, + default=4, + help="输出文件编号宽度,默认 4,例如 0001.html。", + ) + parser.add_argument( + "--limit", + type=int, + default=0, + help="最多处理多少条记录,0 表示不限制。", + ) + parser.add_argument( + "--skip-empty-html", + action="store_true", + help="遇到缺失或空 html 时跳过该记录;默认直接报错。", + ) + return parser.parse_args() + + +def ensure_text(value: Any) -> str: + if value is None: + return "" + if isinstance(value, str): + return value + return str(value) + + +def main() -> None: + args = parse_args() + + source = Path(args.source) + output_dir = Path(args.output_dir) + + if not source.exists(): + raise FileNotFoundError(f"Source jsonl not found: {source}") + + output_dir.mkdir(parents=True, exist_ok=True) + manifest_path = output_dir / "manifest.jsonl" + + processed_count = 0 + skipped_count = 0 + current_index = args.start_index + + with source.open("r", encoding="utf-8") as src, manifest_path.open( + "w", encoding="utf-8" + ) as manifest_fp: + for source_line, line in enumerate(src, start=1): + if args.limit and processed_count >= args.limit: + break + + line = line.strip() + if not line: + skipped_count += 1 + continue + + try: + record = json.loads(line) + except json.JSONDecodeError as exc: + raise 
ValueError(f"Invalid JSON at line {source_line}: {exc}") from exc + + html = ensure_text(record.get(args.html_key)) + if not html.strip(): + if args.skip_empty_html: + skipped_count += 1 + continue + raise ValueError( + f"Missing or empty '{args.html_key}' at line {source_line}" + ) + + filename = f"{current_index:0{args.width}d}.html" + html_path = output_dir / filename + html_path.write_text(html, encoding="utf-8") + + manifest_record = { + "sample_no": current_index, + "source_line": source_line, + "filename": filename, + "track_id": record.get("track_id"), + "url": record.get("url"), + "status": record.get("status"), + "html_len": len(html), + } + + manifest_fp.write(json.dumps(manifest_record, ensure_ascii=False) + "\n") + + processed_count += 1 + current_index += 1 + + print(f"source: {source}") + print(f"output_dir: {output_dir}") + print(f"manifest: {manifest_path}") + print(f"processed: {processed_count}") + print(f"skipped: {skipped_count}") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_jsonl_web2json_pipeline.py b/scripts/run_jsonl_web2json_pipeline.py new file mode 100644 index 0000000..bf7ff33 --- /dev/null +++ b/scripts/run_jsonl_web2json_pipeline.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +""" +对 jsonl 执行完整 web2json 流水线: +1. 拆分 html + manifest +2. classify_html_dir +3. 对每个 cluster 执行 extract_schema +4. infer_code +5. extract_data_with_code +6. 
汇总 token 使用 +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from web2json import ( + Web2JsonConfig, + classify_html_dir, + extract_schema, + infer_code, + extract_data_with_code, +) +from web2json.utils.llm_client import LLMClient + + +@dataclass +class PipelineRunResult: + source_jsonl: str + manifest: str + html_dir: str + pipeline_root: str + cluster_count: int + clusters: list[dict[str, Any]] + total_token_usage: dict[str, int] + summary_path: str + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run web2json pipeline on a crawl jsonl.") + parser.add_argument("--source-jsonl", required=True, help="源 jsonl 文件路径。") + parser.add_argument( + "--work-id", + default="", + help="输出目录标识。默认根据 jsonl 文件名自动生成。", + ) + parser.add_argument( + "--input-root", + default="input_html", + help="HTML 输出根目录,默认 input_html。", + ) + parser.add_argument( + "--output-root", + default="output", + help="结果输出根目录,默认 output。", + ) + parser.add_argument( + "--html-key", + default="html", + help="jsonl 中 HTML 字段名,默认 html。", + ) + parser.add_argument( + "--iteration-rounds", + type=int, + default=3, + help="schema 学习轮数上限,默认 3。", + ) + parser.add_argument( + "--cluster-limit", + type=int, + default=0, + help="最多处理多少个 cluster,0 表示全部处理。", + ) + return parser.parse_args() + + +def slugify(value: str) -> str: + value = re.sub(r"[^a-zA-Z0-9]+", "_", value).strip("_").lower() + return value or "run" + + +def load_jsonl(path: Path) -> list[tuple[int, dict[str, Any]]]: + rows: list[tuple[int, dict[str, Any]]] = [] + with path.open("r", encoding="utf-8") as fp: + for line_no, line in enumerate(fp, start=1): + line = 
line.strip() + if not line: + continue + try: + rows.append((line_no, json.loads(line))) + except json.JSONDecodeError as exc: + print(f"skip invalid json line {line_no}: {exc}") + return rows + + +def build_html_manifest(source_jsonl: Path, output_dir: Path, html_key: str) -> Path: + rows = load_jsonl(source_jsonl) + output_dir.mkdir(parents=True, exist_ok=True) + manifest_path = output_dir / "manifest.jsonl" + + with manifest_path.open("w", encoding="utf-8") as manifest_fp: + for idx, (source_line, row) in enumerate(rows, start=1): + html = row.get(html_key) + if not isinstance(html, str) or not html.strip(): + continue + + filename = f"{idx:04d}.html" + (output_dir / filename).write_text(html, encoding="utf-8") + + manifest_row = { + "sample_no": idx, + "source_line": source_line, + "filename": filename, + "track_id": row.get("track_id"), + "url": row.get("url"), + "status": row.get("status"), + "html_len": len(html), + } + manifest_fp.write(json.dumps(manifest_row, ensure_ascii=False) + "\n") + + return manifest_path + + +def usage_delta(before: dict[str, int], after: dict[str, int]) -> dict[str, int]: + return { + "total_input_tokens": after["total_input_tokens"] - before["total_input_tokens"], + "total_completion_tokens": after["total_completion_tokens"] - before["total_completion_tokens"], + "total_tokens": after["total_tokens"] - before["total_tokens"], + } + + +def run_jsonl_pipeline( + source_jsonl: str, + work_id: str = "", + input_root: str = "input_html", + output_root: str = "output", + html_key: str = "html", + iteration_rounds: int = 3, + cluster_limit: int = 0, +) -> PipelineRunResult: + source_jsonl_path = Path(source_jsonl).expanduser() + if not source_jsonl_path.is_absolute(): + source_jsonl_path = (PROJECT_ROOT / source_jsonl_path).resolve() + + work_id = work_id or slugify(source_jsonl_path.stem) + + input_root_path = Path(input_root).expanduser() + if not input_root_path.is_absolute(): + input_root_path = (PROJECT_ROOT / 
input_root_path).resolve() + + output_root_path = Path(output_root).expanduser() + if not output_root_path.is_absolute(): + output_root_path = (PROJECT_ROOT / output_root_path).resolve() + + html_dir = input_root_path / work_id + pipeline_root = output_root_path / f"{work_id}_pipeline" + pipeline_root.mkdir(parents=True, exist_ok=True) + + print(f"source_jsonl: {source_jsonl_path}") + print(f"work_id: {work_id}") + print(f"html_dir: {html_dir}") + print(f"pipeline_root: {pipeline_root}") + + manifest_path = build_html_manifest(source_jsonl_path, html_dir, html_key) + print(f"manifest: {manifest_path}") + + classify_config = Web2JsonConfig( + name="classify", + html_path=str(html_dir), + output_path=str(pipeline_root), + save=["report", "files"], + ) + classify_result = classify_html_dir(classify_config) + + clusters_dir = pipeline_root / "classify" / "clusters" + cluster_names = sorted(classify_result.clusters.keys()) + if cluster_limit: + cluster_names = cluster_names[:cluster_limit] + + LLMClient.reset_usage() + cluster_summaries: list[dict[str, Any]] = [] + + for cluster_name in cluster_names: + cluster_html_dir = clusters_dir / cluster_name + cluster_files = classify_result.clusters[cluster_name] + cluster_size = len(cluster_files) + rounds = min(iteration_rounds, cluster_size) + + print(f"\n=== {cluster_name} ({cluster_size} files) ===") + + before_schema = LLMClient.get_total_usage() + schema_result = extract_schema( + Web2JsonConfig( + name=f"{cluster_name}_schema", + html_path=str(cluster_html_dir), + output_path=str(pipeline_root), + iteration_rounds=rounds, + save=["schema"], + ) + ) + after_schema = LLMClient.get_total_usage() + + before_code = LLMClient.get_total_usage() + code_result = infer_code( + Web2JsonConfig( + name=f"{cluster_name}_code", + html_path=str(cluster_html_dir), + output_path=str(pipeline_root), + schema=schema_result.final_schema, + save=["schema", "code"], + ) + ) + after_code = LLMClient.get_total_usage() + + parser_path = 
pipeline_root / f"{cluster_name}_code" / "final_parser.py" + parse_result = extract_data_with_code( + Web2JsonConfig( + name=f"{cluster_name}_extract_data", + html_path=str(cluster_html_dir), + output_path=str(pipeline_root), + parser_code=str(parser_path), + save=["data"], + ) + ) + + cluster_summary = { + "cluster_name": cluster_name, + "cluster_size": cluster_size, + "html_dir": str(cluster_html_dir), + "schema_output": str(pipeline_root / f"{cluster_name}_schema"), + "code_output": str(pipeline_root / f"{cluster_name}_code"), + "data_output": str(pipeline_root / f"{cluster_name}_extract_data"), + "parser_path": str(parser_path), + "schema_fields": list(schema_result.final_schema.keys()), + "schema_token_usage": usage_delta(before_schema, after_schema), + "code_token_usage": usage_delta(before_code, after_code), + "parse_success_count": parse_result.success_count, + "parse_failed_count": parse_result.failed_count, + } + cluster_summaries.append(cluster_summary) + + total_usage = LLMClient.get_total_usage() + summary = { + "source_jsonl": str(source_jsonl_path), + "manifest": str(manifest_path), + "html_dir": str(html_dir), + "pipeline_root": str(pipeline_root), + "cluster_count": len(cluster_names), + "clusters": cluster_summaries, + "total_token_usage": total_usage, + } + + summary_path = pipeline_root / "pipeline_summary.json" + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"\nsummary: {summary_path}") + print(json.dumps(total_usage, ensure_ascii=False, indent=2)) + + return PipelineRunResult( + source_jsonl=summary["source_jsonl"], + manifest=summary["manifest"], + html_dir=summary["html_dir"], + pipeline_root=summary["pipeline_root"], + cluster_count=summary["cluster_count"], + clusters=summary["clusters"], + total_token_usage=summary["total_token_usage"], + summary_path=str(summary_path), + ) + + +def main() -> None: + args = parse_args() + run_jsonl_pipeline( + source_jsonl=args.source_jsonl, + 
work_id=args.work_id, + input_root=args.input_root, + output_root=args.output_root, + html_key=args.html_key, + iteration_rounds=args.iteration_rounds, + cluster_limit=args.cluster_limit, + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/verify_extract_alignment.py b/scripts/verify_extract_alignment.py new file mode 100644 index 0000000..1da78a8 --- /dev/null +++ b/scripts/verify_extract_alignment.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +""" +校验 source jsonl / manifest / html / result json 之间的一致性。 + +示例: +python scripts/verify_extract_alignment.py \ + --source-jsonl ToClassify/source.jsonl \ + --manifest input_html/npi_sample_2000/manifest.jsonl \ + --html-dir input_html/npi_category_detail_cluster_1 \ + --result-dir output/npi_category_detail_cluster_1_code/result \ + --output output/npi_category_detail_cluster_1_code/qa_report.json + +或者直接使用 cluster manifest: +python scripts/verify_extract_alignment.py \ + --source-jsonl ToClassify/source.jsonl \ + --cluster-manifest output/npi_category_detail_cluster_1_code/cluster_manifest.json \ + --manifest input_html/npi_sample_2000/manifest.jsonl +""" + +from __future__ import annotations + +import argparse +import json +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Any +from bs4 import BeautifulSoup + + +@dataclass +class FileReport: + filename: str + source_line: int | None + url: str | None + track_id: str | None + html_exists: bool + result_exists: bool + source_match: bool + html_len_match: bool + field_checks: dict[str, dict[str, Any]] + ok: bool + errors: list[str] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Verify alignment across source jsonl, manifest, html files, and result json." 
+ ) + parser.add_argument("--source-jsonl", required=True, help="原始 crawl jsonl 文件路径。") + parser.add_argument("--manifest", required=True, help="完整 manifest.jsonl 文件路径。") + parser.add_argument( + "--cluster-manifest", + default="", + help="cluster_manifest.json 路径。提供后会从其中自动读取 html-dir / result-dir / 文件子集 / schema-path。", + ) + parser.add_argument("--html-dir", default="", help="HTML 文件目录。") + parser.add_argument("--result-dir", default="", help="解析结果 JSON 目录。") + parser.add_argument("--schema-json", default="", help="cluster 对应的 schema.json 路径。") + parser.add_argument("--output", default="", help="QA 报告输出路径(可选)。") + parser.add_argument( + "--fields", + nargs="*", + default=None, + help="要校验是否出现在 HTML 中的结果字段。未提供时会优先从 schema 自动推断,否则回退到 title content。", + ) + return parser.parse_args() + + +def load_jsonl(path: Path) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as fp: + for line_no, line in enumerate(fp, start=1): + line = line.strip() + if not line: + continue + try: + rows.append(json.loads(line)) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON in {path} line {line_no}: {exc}") from exc + return rows + + +def load_target_files( + manifest_rows: list[dict[str, Any]], cluster_manifest_path: Path | None +) -> tuple[list[dict[str, Any]], str, str, str]: + if not cluster_manifest_path: + return manifest_rows, "", "", "" + + cluster_manifest = json.loads(cluster_manifest_path.read_text(encoding="utf-8")) + wanted = {item["filename"] for item in cluster_manifest.get("files", [])} + filtered_rows = [row for row in manifest_rows if row.get("filename") in wanted] + html_dir = cluster_manifest.get("input_dir", "") + result_dir = cluster_manifest.get("result_dir", "") + schema_path = cluster_manifest.get("schema_path", "") + return filtered_rows, html_dir, result_dir, schema_path + + +def derive_fields_from_schema(schema_path: Path) -> list[str]: + schema = 
json.loads(schema_path.read_text(encoding="utf-8")) + fields: list[str] = [] + for field_name, field_meta in schema.items(): + if not isinstance(field_meta, dict): + continue + if field_meta.get("type") == "string": + fields.append(field_name) + return fields + + +def normalize_text(value: Any) -> str: + if value is None: + return "" + if not isinstance(value, str): + value = str(value) + return " ".join(value.split()) + + +def check_field_in_html(field_value: Any, html_text: str) -> dict[str, Any]: + normalized_value = normalize_text(field_value) + normalized_html = normalize_text(html_text) + normalized_text = normalize_text(BeautifulSoup(html_text, "html.parser").get_text(" ", strip=True)) + + if not normalized_value: + return { + "value_present": False, + "raw_html_match": False, + "text_match": False, + "substring_match": False, + "value_len": 0, + } + + return { + "value_present": True, + "raw_html_match": normalized_value in normalized_html, + "text_match": normalized_value in normalized_text, + "substring_match": normalized_value in normalized_html or normalized_value in normalized_text, + "value_len": len(normalized_value), + } + + +def main() -> None: + args = parse_args() + + source_jsonl = Path(args.source_jsonl) + manifest_path = Path(args.manifest) + cluster_manifest_path = Path(args.cluster_manifest) if args.cluster_manifest else None + output_path = Path(args.output) if args.output else None + + source_rows = load_jsonl(source_jsonl) + manifest_rows = load_jsonl(manifest_path) + target_rows, cluster_html_dir, cluster_result_dir, cluster_schema_path = load_target_files( + manifest_rows, cluster_manifest_path + ) + + html_dir_str = args.html_dir or cluster_html_dir + result_dir_str = args.result_dir or cluster_result_dir + schema_json_str = args.schema_json or cluster_schema_path + if not html_dir_str or not result_dir_str: + raise ValueError("html-dir 和 result-dir 不能为空;可直接传参,或通过 cluster-manifest 提供。") + + html_dir = Path(html_dir_str) + result_dir = 
Path(result_dir_str) + schema_json_path = Path(schema_json_str) if schema_json_str else None + + if args.fields is not None: + fields_to_check = args.fields + elif schema_json_path and schema_json_path.exists(): + fields_to_check = derive_fields_from_schema(schema_json_path) + if not fields_to_check: + fields_to_check = ["title", "content"] + else: + fields_to_check = ["title", "content"] + + reports: list[FileReport] = [] + ok_count = 0 + + for manifest_row in target_rows: + filename = manifest_row["filename"] + source_line = manifest_row.get("source_line") + url = manifest_row.get("url") + track_id = manifest_row.get("track_id") + + html_path = html_dir / filename + result_path = result_dir / filename.replace(".html", ".json") + + errors: list[str] = [] + html_exists = html_path.exists() + result_exists = result_path.exists() + source_match = False + html_len_match = False + field_checks: dict[str, dict[str, Any]] = {} + + html_text = "" + if html_exists: + html_text = html_path.read_text(encoding="utf-8") + else: + errors.append(f"missing_html:{html_path}") + + result_data: dict[str, Any] = {} + if result_exists: + result_data = json.loads(result_path.read_text(encoding="utf-8")) + else: + errors.append(f"missing_result:{result_path}") + + if source_line is not None and 1 <= source_line <= len(source_rows): + source_row = source_rows[source_line - 1] + source_match = ( + source_row.get("track_id") == track_id + and source_row.get("url") == url + ) + if not source_match: + errors.append("source_manifest_mismatch") + + source_html = source_row.get("html", "") + html_len_match = len(source_html) == manifest_row.get("html_len") + if not html_len_match: + errors.append("source_manifest_html_len_mismatch") + + if html_exists and len(html_text) != len(source_html): + errors.append("source_html_file_len_mismatch") + elif html_exists and html_text != source_html: + errors.append("source_html_file_content_mismatch") + else: + errors.append("invalid_source_line") + + if 
html_exists and result_exists: + for field in fields_to_check: + field_checks[field] = check_field_in_html(result_data.get(field), html_text) + if field_checks[field]["value_present"] and not field_checks[field]["substring_match"]: + errors.append(f"field_not_found_in_html:{field}") + + ok = not errors + if ok: + ok_count += 1 + + reports.append( + FileReport( + filename=filename, + source_line=source_line, + url=url, + track_id=track_id, + html_exists=html_exists, + result_exists=result_exists, + source_match=source_match, + html_len_match=html_len_match, + field_checks=field_checks, + ok=ok, + errors=errors, + ) + ) + + summary = { + "source_jsonl": str(source_jsonl), + "manifest": str(manifest_path), + "cluster_manifest": str(cluster_manifest_path) if cluster_manifest_path else "", + "html_dir": str(html_dir), + "result_dir": str(result_dir), + "schema_json": str(schema_json_path) if schema_json_path else "", + "fields_checked": fields_to_check, + "total_files": len(reports), + "ok_files": ok_count, + "failed_files": len(reports) - ok_count, + "reports": [asdict(report) for report in reports], + } + + if output_path: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"qa_report: {output_path}") + + print(f"total_files: {summary['total_files']}") + print(f"ok_files: {summary['ok_files']}") + print(f"failed_files: {summary['failed_files']}") + + for report in reports: + status = "OK" if report.ok else "FAIL" + print(f"{status} {report.filename}") + if report.errors: + print(f" errors: {', '.join(report.errors)}") + + +if __name__ == "__main__": + main() diff --git a/start.sh b/start.sh index f2cd712..f7d735c 100755 --- a/start.sh +++ b/start.sh @@ -3,6 +3,9 @@ # Web2JSON Agent - Startup Script # Starts both the backend API and the frontend UI simultaneously +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PYTHON_BIN="${PYTHON_BIN:-python3.11}" + 
echo "🚀 Starting Web2JSON Agent..." echo "" @@ -21,14 +24,14 @@ fi # Start the backend echo "📡 Starting backend API (port 8000)..." -cd /Users/brown/Projects/AILabProject/web2json-agent +cd "$SCRIPT_DIR" # Create logs directory if it doesn't exist mkdir -p logs # Production Mode: Disable automatic reloading to avoid restarts triggered by changes in the output directory # If you need reload for development, use: --reload --reload-exclude 'output/**' --reload-exclude 'logs/**' -uvicorn web2json_api.main:app --host 0.0.0.0 --port 8000 \ +"$PYTHON_BIN" -m uvicorn web2json_api.main:app --host 0.0.0.0 --port 8000 \ --reload-exclude 'output/**' \ --reload-exclude 'logs/**' \ --reload-exclude '*.log' \ @@ -49,7 +52,7 @@ fi # Start the frontend echo "" echo "🎨 Starting frontend UI (port 5173)..." -cd web2json_ui && npm run dev > ../logs/ui.log 2>&1 & +cd "$SCRIPT_DIR/web2json_ui" && npm run dev > "$SCRIPT_DIR/logs/ui.log" 2>&1 & FRONTEND_PID=$! echo " Frontend PID: $FRONTEND_PID" @@ -70,8 +73,8 @@ echo "Or press Ctrl+C and run: pkill -f 'uvicorn|vite'" echo "" # Save PID -echo $BACKEND_PID > .backend.pid -echo $FRONTEND_PID > .frontend.pid +echo $BACKEND_PID > "$SCRIPT_DIR/.backend.pid" +echo $FRONTEND_PID > "$SCRIPT_DIR/.frontend.pid" # Wait for user interruption wait diff --git a/web2json/simple.py b/web2json/simple.py index 235d7c5..95b8ebe 100644 --- a/web2json/simple.py +++ b/web2json/simple.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Optional, Dict, List, Any from dataclasses import dataclass, asdict +import numpy as np from loguru import logger from web2json.agent import ParserAgent @@ -1050,15 +1051,39 @@ def classify_html_dir(config: Web2JsonConfig) -> ClusterResult: # 执行聚类分析 logger.info("正在进行布局聚类分析...") from web2json.tools.cluster import cluster_html_layouts_optimized + from web2json.tools.html_layout_cosin import get_feature + + valid_html_files = [] + valid_html_contents = [] + invalid_html_files = [] + for file_path, html_content in 
zip(html_files, html_contents): + try: + feature = get_feature(html_content) + except Exception as e: + logger.warning(f" 跳过布局特征提取失败页面: {file_path} ({e})") + invalid_html_files.append(file_path) + continue + if not feature: + logger.warning(f" 跳过无有效布局特征页面: {file_path}") + invalid_html_files.append(file_path) + continue + valid_html_files.append(file_path) + valid_html_contents.append(html_content) + + if not valid_html_contents: + raise Exception("聚类失败: 没有可用于布局聚类的有效HTML页面") try: labels, sim_mat, clusters = cluster_html_layouts_optimized( - html_contents, + valid_html_contents, use_knn_graph=True ) except Exception as e: raise Exception(f"聚类失败: {e}") + label_map = {file_path: int(label) for file_path, label in zip(valid_html_files, labels)} + labels = np.array([label_map.get(file_path, -1) for file_path in html_files], dtype=int) + # 统计聚类结果 unique_labels = sorted(set(labels)) noise_count = sum(1 for l in labels if l == -1) @@ -1160,4 +1185,3 @@ def classify_html_dir(config: Web2JsonConfig) -> ClusterResult: noise_files=noise_files, cluster_count=cluster_count ) - diff --git a/web2json/tools/html_simplifier.py b/web2json/tools/html_simplifier.py index 33f34ae..1a2581c 100644 --- a/web2json/tools/html_simplifier.py +++ b/web2json/tools/html_simplifier.py @@ -265,6 +265,77 @@ def remove_empty_tags( return root +def _contains_token_attr(element: html.HtmlElement, attr_name: str, patterns: List[str]) -> bool: + """检查元素指定属性中是否包含目标模式。""" + attr_value = element.get(attr_name, '') + if not attr_value: + return False + + attr_value = attr_value.lower() + return any(pattern in attr_value for pattern in patterns) + + +def is_sharepoint_html(html_str: str) -> bool: + """粗略识别 SharePoint 页面。""" + markers = [ + 'microsoft sharepoint', + '_sppagecontextinfo', + 's4-workspace', + '_layouts/15', + ] + html_lower = html_str.lower() + return any(marker in html_lower for marker in markers) + + +def remove_sharepoint_noise(root: html.HtmlElement) -> html.HtmlElement: + """ + 删除 SharePoint 
门户模板中的高噪音区域。 + + 主要清理全站导航、页眉页脚、社媒区、noindex 容器和 mega menu。 + """ + class_patterns = [ + 'noindex', + 'mega-menu', + 'mega-sub-menu', + 'mega-menu-wrap', + 'mega-menu-toggle', + 'top-header', + 'button-close-top-header', + 'social-media-header', + 'breadcrumbs', + 'breadcrumb', + 'ms-csrlistview-controldiv', + ] + id_patterns = [ + 'top-header', + 'top-menu', + 'main-menu', + 'mega-menu', + 'social-media-header', + 'navigationmenu', + 'footer', + 'ctl00_placeholdersitename', + ] + tag_names = {'header', 'footer', 'nav'} + + remove_targets = [] + for element in root.iter(): + tag = str(element.tag).lower() if hasattr(element, 'tag') else '' + if tag in tag_names: + remove_targets.append(element) + continue + if _contains_token_attr(element, 'class', class_patterns): + remove_targets.append(element) + continue + if _contains_token_attr(element, 'id', id_patterns): + remove_targets.append(element) + + # 去重,避免重复删除同一元素 + unique_targets = list(dict.fromkeys(remove_targets)) + remove_reversely(unique_targets) + return root + + def clean_attributes( root: html.HtmlElement, keep_attrs: List[str] = None @@ -440,6 +511,11 @@ def simplify_html( clean_attrs=True, keep_attrs=keep_attrs_list ) + if is_sharepoint_html(html_str): + simplified_root = html_to_element(result) + simplified_root = remove_sharepoint_noise(simplified_root) + simplified_root = remove_empty_tags(simplified_root) + result = element_to_html(simplified_root) # 根据aggressive参数选择模式 elif aggressive: # 激进模式:删除所有无用内容 diff --git a/web2json/utils/llm_client.py b/web2json/utils/llm_client.py index 59913a8..46c62e7 100644 --- a/web2json/utils/llm_client.py +++ b/web2json/utils/llm_client.py @@ -17,9 +17,9 @@ env_path = project_root / ".env" load_dotenv(env_path) -# 验证 -if not os.getenv("OPENAI_API_KEY"): - raise ValueError(f".env 文件路径: {env_path}, API Key未加载") +# 验证(延迟到实际使用时) +_api_key_missing = not os.getenv("OPENAI_API_KEY") +_env_path_for_error = env_path # 定义场景类型 ScenarioType = Literal["default", "code_gen", "agent"]
@@ -81,6 +81,11 @@ def __init__(
         if self._initialized:
             return
 
+        # NOTE(review): the lazy key check must not fire when the caller passes
+        # api_key explicitly; the env-var check is only a fallback for that case.
+        if api_key is None and _api_key_missing and not os.getenv("OPENAI_API_KEY"):
+            raise ValueError(f".env 文件路径: {_env_path_for_error}, API Key未加载")
+
         self.api_key = api_key or settings.openai_api_key
         self.api_base = api_base or settings.openai_api_base
         self.model = model or settings.default_model