From f3dd52e25b3e43899e4a9d136e009189c442a1fb Mon Sep 17 00:00:00 2001
From: ql101 <ql101@duke.edu>
Date: Tue, 14 Apr 2026 13:36:36 +0800
Subject: [PATCH 1/5] Run in Jupyter

---
 jupyter_helper.py                      |   3 +
 notebooks/jupyter_helper.py            | 136 ++++++++++++
 notebooks/web2json_quickstart.ipynb    | 100 +++++++++
 scripts/build_html_manifest.py         | 141 ++++++++++++
 scripts/run_jsonl_web2json_pipeline.py | 296 +++++++++++++++++++++++++
 scripts/verify_extract_alignment.py    | 282 +++++++++++++++++++++++
 6 files changed, 958 insertions(+)
 create mode 100644 jupyter_helper.py
 create mode 100644 notebooks/jupyter_helper.py
 create mode 100644 notebooks/web2json_quickstart.ipynb
 create mode 100644 scripts/build_html_manifest.py
 create mode 100644 scripts/run_jsonl_web2json_pipeline.py
 create mode 100644 scripts/verify_extract_alignment.py

diff --git a/jupyter_helper.py b/jupyter_helper.py
new file mode 100644
index 0000000..d82c824
--- /dev/null
+++ b/jupyter_helper.py
@@ -0,0 +1,3 @@
+"""Compatibility wrapper so notebooks can import jupyter_helper from multiple locations."""
+
+from notebooks.jupyter_helper import *  # noqa: F401,F403
diff --git a/notebooks/jupyter_helper.py b/notebooks/jupyter_helper.py
new file mode 100644
index 0000000..801d70e
--- /dev/null
+++ b/notebooks/jupyter_helper.py
@@ -0,0 +1,136 @@
+"""Utilities for running web2json-agent inside Jupyter notebooks."""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Any, Optional, Sequence
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+
+
+def prepare_notebook(
+    api_key: Optional[str] = None,
+    api_base: Optional[str] = None,
+    project_root: Optional[str] = None,
+) -> Path:
+    """Prepare the notebook process for local package imports and env loading."""
+    root = Path(project_root).expanduser().resolve() if project_root else PROJECT_ROOT
+
+    if str(root) not in sys.path:
+        sys.path.insert(0, str(root))
+
+    os.chdir(root)
+
+    if api_key:
+        os.environ["OPENAI_API_KEY"] = api_key
+
+    if api_base:
+        os.environ["OPENAI_API_BASE"] = api_base
+
+    return root
+
+
+def make_extract_config(
+    name: str,
+    html_path: str,
+    output_path: str = "output",
+    save: Optional[Sequence[str]] = ("schema", "code", "data"),
+    schema: Optional[dict[str, Any]] = None,
+    iteration_rounds: int = 3,
+    enable_schema_edit: bool = False,
+    remove_null_fields: bool = True,
+    parser_code: Optional[str] = None,
+):
+    """Build a Web2JsonConfig with notebook-friendly path resolution."""
+    prepare_notebook()
+
+    from web2json import Web2JsonConfig
+
+    html_target = _resolve_project_path(html_path)
+    output_target = _resolve_project_path(output_path)
+
+    return Web2JsonConfig(
+        name=name,
+        html_path=str(html_target),
+        output_path=str(output_target),
+        iteration_rounds=iteration_rounds,
+        schema=schema,
+        enable_schema_edit=enable_schema_edit,
+        parser_code=parser_code,
+        save=list(save) if save is not None else None,
+        remove_null_fields=remove_null_fields,
+    )
+
+
+def preview_records(records: Sequence[dict[str, Any]], limit: int = 3) -> list[dict[str, Any]]:
+    """Return the first few parsed records so a notebook cell renders them directly."""
+    return list(records[:limit])
+
+
+def print_schema(schema: dict[str, Any]) -> None:
+    """Pretty print schema content inside notebooks."""
+    print(json.dumps(schema, ensure_ascii=False, indent=2))
+
+
+def summarize_cluster_result(cluster_result: Any) -> dict[str, Any]:
+    """Convert a cluster result into a compact notebook-friendly summary."""
+    return {
+        "cluster_count": cluster_result.cluster_count,
+        "clusters": {name: len(files) for name, files in cluster_result.clusters.items()},
+        "noise_files": len(cluster_result.noise_files),
+    }
+
+
+def run_jsonl_pipeline(
+    source_jsonl: str,
+    work_id: str = "",
+    input_root: str = "input_html",
+    output_root: str = "output",
+    html_key: str = "html",
+    iteration_rounds: int = 3,
+    cluster_limit: int = 0,
+):
+    """Run the full JSONL pipeline from a notebook and return the structured summary."""
+    prepare_notebook()
+
+    from scripts.run_jsonl_web2json_pipeline import run_jsonl_pipeline as _run_jsonl_pipeline
+
+    return _run_jsonl_pipeline(
+        source_jsonl=str(_resolve_project_path(source_jsonl)),
+        work_id=work_id,
+        input_root=str(_resolve_project_path(input_root)),
+        output_root=str(_resolve_project_path(output_root)),
+        html_key=html_key,
+        iteration_rounds=iteration_rounds,
+        cluster_limit=cluster_limit,
+    )
+
+
+def summarize_pipeline_result(result: Any) -> dict[str, Any]:
+    """Build a compact summary view for notebook display."""
+    return {
+        "source_jsonl": result.source_jsonl,
+        "pipeline_root": result.pipeline_root,
+        "cluster_count": result.cluster_count,
+        "clusters": [
+            {
+                "cluster_name": cluster["cluster_name"],
+                "cluster_size": cluster["cluster_size"],
+                "parse_success_count": cluster["parse_success_count"],
+                "parse_failed_count": cluster["parse_failed_count"],
+            }
+            for cluster in result.clusters
+        ],
+        "total_token_usage": result.total_token_usage,
+        "summary_path": result.summary_path,
+    }
+
+
+def _resolve_project_path(path_str: str) -> Path:
+    """Return absolute paths (after ~ expansion) as-is; resolve relative ones under PROJECT_ROOT."""
+    candidate = Path(path_str).expanduser()
+    # Only relative paths are anchored to the project root and resolved.
+    return candidate if candidate.is_absolute() else (PROJECT_ROOT / candidate).resolve()
diff --git a/notebooks/web2json_quickstart.ipynb b/notebooks/web2json_quickstart.ipynb
new file mode 100644
index 0000000..da203aa
--- /dev/null
+++ b/notebooks/web2json_quickstart.ipynb
@@ -0,0 +1,100 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# web2json-agent JSONL Pipeline Quickstart\n",
+    "\n",
+    "这个 notebook 基于 `scripts/run_jsonl_web2json_pipeline.py`，按顺序运行下面几个单元即可。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import sys\n",
+    "\n",
+    "PROJECT_ROOT = Path.cwd()\n",
+    "if PROJECT_ROOT.name == \"notebooks\":\n",
+    "    PROJECT_ROOT = PROJECT_ROOT.parent\n",
+    "if str(PROJECT_ROOT) not in sys.path:\n",
+    "    sys.path.insert(0, str(PROJECT_ROOT))\n",
+    "\n",
+    "from jupyter_helper import prepare_notebook\n",
+    "\n",
+    "PROJECT_ROOT = prepare_notebook(\n",
+    "    api_key=\"YOUR_API_KEY\",\n",
+    "    api_base=\"https://api.openai.com/v1\",  # 如果你使用兼容网关，请替换这里\n",
+    ")\n",
+    "\n",
+    "PROJECT_ROOT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from jupyter_helper import run_jsonl_pipeline, summarize_pipeline_result\n",
+    "\n",
+    "result = run_jsonl_pipeline(\n",
+    "    source_jsonl=\"ToClassify/sample.json\",\n",
+    "    work_id=\"sample_run\",\n",
+    "    input_root=\"input_html\",\n",
+    "    output_root=\"output\",\n",
+    "    html_key=\"html\",\n",
+    "    iteration_rounds=3,\n",
+    "    cluster_limit=1,\n",
+    ")\n",
+    "\n",
+    "summarize_pipeline_result(result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result.to_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result.summary_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_dir = PROJECT_ROOT / \"output\" / \"sample_run_pipeline\"\n",
+    "output_dir"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/scripts/build_html_manifest.py b/scripts/build_html_manifest.py
new file mode 100644
index 0000000..8d1508f
--- /dev/null
+++ b/scripts/build_html_manifest.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""
+Split a crawl jsonl into individual HTML files and generate a manifest.jsonl index.
+
+Example:
+python scripts/build_html_manifest.py \
+  --source ToClassify/example.jsonl \
+  --output-dir input_html/example_set
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Split a crawl jsonl into numbered HTML files plus manifest.jsonl."
+    )
+    parser.add_argument(
+        "--source",
+        required=True,
+        help="源 jsonl 文件路径，每行应至少包含 html 字段。",
+    )
+    parser.add_argument(
+        "--output-dir",
+        required=True,
+        help="输出目录，会写入 0001.html... 和 manifest.jsonl。",
+    )
+    parser.add_argument(
+        "--html-key",
+        default="html",
+        help="HTML 内容字段名，默认 html。",
+    )
+    parser.add_argument(
+        "--start-index",
+        type=int,
+        default=1,
+        help="输出编号起始值，默认 1。",
+    )
+    parser.add_argument(
+        "--width",
+        type=int,
+        default=4,
+        help="输出文件编号宽度，默认 4，例如 0001.html。",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=0,
+        help="最多处理多少条记录，0 表示不限制。",
+    )
+    parser.add_argument(
+        "--skip-empty-html",
+        action="store_true",
+        help="遇到缺失或空 html 时跳过该记录；默认直接报错。",
+    )
+    return parser.parse_args()
+
+
+def ensure_text(value: Any) -> str:
+    """Return *value* as text: None becomes "", a str is returned unchanged, anything else goes through str()."""
+    if value is None:
+        return ""
+    # Strings pass through untouched; every other type is stringified.
+    return value if isinstance(value, str) else str(value)
+
+
+def main() -> None:
+    args = parse_args()
+
+    source = Path(args.source)
+    output_dir = Path(args.output_dir)
+
+    if not source.exists():
+        raise FileNotFoundError(f"Source jsonl not found: {source}")
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    manifest_path = output_dir / "manifest.jsonl"
+
+    processed_count = 0
+    skipped_count = 0
+    current_index = args.start_index
+
+    with source.open("r", encoding="utf-8") as src, manifest_path.open(
+        "w", encoding="utf-8"
+    ) as manifest_fp:
+        for source_line, line in enumerate(src, start=1):
+            if args.limit and processed_count >= args.limit:
+                break
+
+            line = line.strip()
+            if not line:
+                skipped_count += 1
+                continue
+
+            try:
+                record = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise ValueError(f"Invalid JSON at line {source_line}: {exc}") from exc
+
+            html = ensure_text(record.get(args.html_key))
+            if not html.strip():
+                if args.skip_empty_html:
+                    skipped_count += 1
+                    continue
+                raise ValueError(
+                    f"Missing or empty '{args.html_key}' at line {source_line}"
+                )
+
+            filename = f"{current_index:0{args.width}d}.html"
+            html_path = output_dir / filename
+            html_path.write_text(html, encoding="utf-8")
+
+            manifest_record = {
+                "sample_no": current_index,
+                "source_line": source_line,
+                "filename": filename,
+                "track_id": record.get("track_id"),
+                "url": record.get("url"),
+                "status": record.get("status"),
+                "html_len": len(html),
+            }
+
+            manifest_fp.write(json.dumps(manifest_record, ensure_ascii=False) + "\n")
+
+            processed_count += 1
+            current_index += 1
+
+    print(f"source: {source}")
+    print(f"output_dir: {output_dir}")
+    print(f"manifest: {manifest_path}")
+    print(f"processed: {processed_count}")
+    print(f"skipped: {skipped_count}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_jsonl_web2json_pipeline.py b/scripts/run_jsonl_web2json_pipeline.py
new file mode 100644
index 0000000..bf7ff33
--- /dev/null
+++ b/scripts/run_jsonl_web2json_pipeline.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+"""
+Run the full web2json pipeline on a jsonl file:
+1. Split into html files + manifest
+2. classify_html_dir
+3. Run extract_schema for each cluster
+4. infer_code
+5. extract_data_with_code
+6. Summarize token usage
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
+from web2json import (
+    Web2JsonConfig,
+    classify_html_dir,
+    extract_schema,
+    infer_code,
+    extract_data_with_code,
+)
+from web2json.utils.llm_client import LLMClient
+
+
+@dataclass
+class PipelineRunResult:
+    source_jsonl: str
+    manifest: str
+    html_dir: str
+    pipeline_root: str
+    cluster_count: int
+    clusters: list[dict[str, Any]]
+    total_token_usage: dict[str, int]
+    summary_path: str
+
+    def to_dict(self) -> dict[str, Any]:
+        return asdict(self)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Run web2json pipeline on a crawl jsonl.")
+    parser.add_argument("--source-jsonl", required=True, help="源 jsonl 文件路径。")
+    parser.add_argument(
+        "--work-id",
+        default="",
+        help="输出目录标识。默认根据 jsonl 文件名自动生成。",
+    )
+    parser.add_argument(
+        "--input-root",
+        default="input_html",
+        help="HTML 输出根目录，默认 input_html。",
+    )
+    parser.add_argument(
+        "--output-root",
+        default="output",
+        help="结果输出根目录，默认 output。",
+    )
+    parser.add_argument(
+        "--html-key",
+        default="html",
+        help="jsonl 中 HTML 字段名，默认 html。",
+    )
+    parser.add_argument(
+        "--iteration-rounds",
+        type=int,
+        default=3,
+        help="schema 学习轮数上限，默认 3。",
+    )
+    parser.add_argument(
+        "--cluster-limit",
+        type=int,
+        default=0,
+        help="最多处理多少个 cluster，0 表示全部处理。",
+    )
+    return parser.parse_args()
+
+
+def slugify(value: str) -> str:
+    """Collapse non-alphanumeric runs to "_", trim the ends, lowercase; fall back to "run" when empty."""
+    return re.sub(r"[^a-zA-Z0-9]+", "_", value).strip("_").lower() or "run"
+
+
+def load_jsonl(path: Path) -> list[tuple[int, dict[str, Any]]]:
+    rows: list[tuple[int, dict[str, Any]]] = []
+    with path.open("r", encoding="utf-8") as fp:
+        for line_no, line in enumerate(fp, start=1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                rows.append((line_no, json.loads(line)))
+            except json.JSONDecodeError as exc:
+                print(f"skip invalid json line {line_no}: {exc}")
+    return rows
+
+
+def build_html_manifest(source_jsonl: Path, output_dir: Path, html_key: str) -> Path:
+    rows = load_jsonl(source_jsonl)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    manifest_path = output_dir / "manifest.jsonl"
+
+    with manifest_path.open("w", encoding="utf-8") as manifest_fp:
+        for idx, (source_line, row) in enumerate(rows, start=1):
+            html = row.get(html_key)
+            if not isinstance(html, str) or not html.strip():
+                continue
+
+            filename = f"{idx:04d}.html"
+            (output_dir / filename).write_text(html, encoding="utf-8")
+
+            manifest_row = {
+                "sample_no": idx,
+                "source_line": source_line,
+                "filename": filename,
+                "track_id": row.get("track_id"),
+                "url": row.get("url"),
+                "status": row.get("status"),
+                "html_len": len(html),
+            }
+            manifest_fp.write(json.dumps(manifest_row, ensure_ascii=False) + "\n")
+
+    return manifest_path
+
+
+def usage_delta(before: dict[str, int], after: dict[str, int]) -> dict[str, int]:
+    """Return ``after - before`` for each of the three token counters."""
+    counters = ("total_input_tokens", "total_completion_tokens", "total_tokens")
+    # Direct indexing on purpose: a snapshot missing a counter raises KeyError.
+    deltas = {name: after[name] - before[name] for name in counters}
+    return deltas
+
+
+def run_jsonl_pipeline(
+    source_jsonl: str,
+    work_id: str = "",
+    input_root: str = "input_html",
+    output_root: str = "output",
+    html_key: str = "html",
+    iteration_rounds: int = 3,
+    cluster_limit: int = 0,
+) -> PipelineRunResult:
+    source_jsonl_path = Path(source_jsonl).expanduser()
+    if not source_jsonl_path.is_absolute():
+        source_jsonl_path = (PROJECT_ROOT / source_jsonl_path).resolve()
+
+    work_id = work_id or slugify(source_jsonl_path.stem)
+
+    input_root_path = Path(input_root).expanduser()
+    if not input_root_path.is_absolute():
+        input_root_path = (PROJECT_ROOT / input_root_path).resolve()
+
+    output_root_path = Path(output_root).expanduser()
+    if not output_root_path.is_absolute():
+        output_root_path = (PROJECT_ROOT / output_root_path).resolve()
+
+    html_dir = input_root_path / work_id
+    pipeline_root = output_root_path / f"{work_id}_pipeline"
+    pipeline_root.mkdir(parents=True, exist_ok=True)
+
+    print(f"source_jsonl: {source_jsonl_path}")
+    print(f"work_id: {work_id}")
+    print(f"html_dir: {html_dir}")
+    print(f"pipeline_root: {pipeline_root}")
+
+    manifest_path = build_html_manifest(source_jsonl_path, html_dir, html_key)
+    print(f"manifest: {manifest_path}")
+
+    classify_config = Web2JsonConfig(
+        name="classify",
+        html_path=str(html_dir),
+        output_path=str(pipeline_root),
+        save=["report", "files"],
+    )
+    classify_result = classify_html_dir(classify_config)
+
+    clusters_dir = pipeline_root / "classify" / "clusters"
+    cluster_names = sorted(classify_result.clusters.keys())
+    if cluster_limit:
+        cluster_names = cluster_names[:cluster_limit]
+
+    LLMClient.reset_usage()
+    cluster_summaries: list[dict[str, Any]] = []
+
+    for cluster_name in cluster_names:
+        cluster_html_dir = clusters_dir / cluster_name
+        cluster_files = classify_result.clusters[cluster_name]
+        cluster_size = len(cluster_files)
+        rounds = min(iteration_rounds, cluster_size)
+
+        print(f"\n=== {cluster_name} ({cluster_size} files) ===")
+
+        before_schema = LLMClient.get_total_usage()
+        schema_result = extract_schema(
+            Web2JsonConfig(
+                name=f"{cluster_name}_schema",
+                html_path=str(cluster_html_dir),
+                output_path=str(pipeline_root),
+                iteration_rounds=rounds,
+                save=["schema"],
+            )
+        )
+        after_schema = LLMClient.get_total_usage()
+
+        before_code = LLMClient.get_total_usage()
+        code_result = infer_code(
+            Web2JsonConfig(
+                name=f"{cluster_name}_code",
+                html_path=str(cluster_html_dir),
+                output_path=str(pipeline_root),
+                schema=schema_result.final_schema,
+                save=["schema", "code"],
+            )
+        )
+        after_code = LLMClient.get_total_usage()
+
+        parser_path = pipeline_root / f"{cluster_name}_code" / "final_parser.py"
+        parse_result = extract_data_with_code(
+            Web2JsonConfig(
+                name=f"{cluster_name}_extract_data",
+                html_path=str(cluster_html_dir),
+                output_path=str(pipeline_root),
+                parser_code=str(parser_path),
+                save=["data"],
+            )
+        )
+
+        cluster_summary = {
+            "cluster_name": cluster_name,
+            "cluster_size": cluster_size,
+            "html_dir": str(cluster_html_dir),
+            "schema_output": str(pipeline_root / f"{cluster_name}_schema"),
+            "code_output": str(pipeline_root / f"{cluster_name}_code"),
+            "data_output": str(pipeline_root / f"{cluster_name}_extract_data"),
+            "parser_path": str(parser_path),
+            "schema_fields": list(schema_result.final_schema.keys()),
+            "schema_token_usage": usage_delta(before_schema, after_schema),
+            "code_token_usage": usage_delta(before_code, after_code),
+            "parse_success_count": parse_result.success_count,
+            "parse_failed_count": parse_result.failed_count,
+        }
+        cluster_summaries.append(cluster_summary)
+
+    total_usage = LLMClient.get_total_usage()
+    summary = {
+        "source_jsonl": str(source_jsonl_path),
+        "manifest": str(manifest_path),
+        "html_dir": str(html_dir),
+        "pipeline_root": str(pipeline_root),
+        "cluster_count": len(cluster_names),
+        "clusters": cluster_summaries,
+        "total_token_usage": total_usage,
+    }
+
+    summary_path = pipeline_root / "pipeline_summary.json"
+    summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(f"\nsummary: {summary_path}")
+    print(json.dumps(total_usage, ensure_ascii=False, indent=2))
+
+    return PipelineRunResult(
+        source_jsonl=summary["source_jsonl"],
+        manifest=summary["manifest"],
+        html_dir=summary["html_dir"],
+        pipeline_root=summary["pipeline_root"],
+        cluster_count=summary["cluster_count"],
+        clusters=summary["clusters"],
+        total_token_usage=summary["total_token_usage"],
+        summary_path=str(summary_path),
+    )
+
+
+def main() -> None:
+    args = parse_args()
+    run_jsonl_pipeline(
+        source_jsonl=args.source_jsonl,
+        work_id=args.work_id,
+        input_root=args.input_root,
+        output_root=args.output_root,
+        html_key=args.html_key,
+        iteration_rounds=args.iteration_rounds,
+        cluster_limit=args.cluster_limit,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/verify_extract_alignment.py b/scripts/verify_extract_alignment.py
new file mode 100644
index 0000000..1da78a8
--- /dev/null
+++ b/scripts/verify_extract_alignment.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python3
+"""
+Verify consistency across the source jsonl / manifest / html / result json.
+
+Example:
+python scripts/verify_extract_alignment.py \
+  --source-jsonl ToClassify/source.jsonl \
+  --manifest input_html/npi_sample_2000/manifest.jsonl \
+  --html-dir input_html/npi_category_detail_cluster_1 \
+  --result-dir output/npi_category_detail_cluster_1_code/result \
+  --output output/npi_category_detail_cluster_1_code/qa_report.json
+
+Or use a cluster manifest directly:
+python scripts/verify_extract_alignment.py \
+  --source-jsonl ToClassify/source.jsonl \
+  --cluster-manifest output/npi_category_detail_cluster_1_code/cluster_manifest.json \
+  --manifest input_html/npi_sample_2000/manifest.jsonl
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import Any
+from bs4 import BeautifulSoup
+
+
+@dataclass
+class FileReport:
+    filename: str
+    source_line: int | None
+    url: str | None
+    track_id: str | None
+    html_exists: bool
+    result_exists: bool
+    source_match: bool
+    html_len_match: bool
+    field_checks: dict[str, dict[str, Any]]
+    ok: bool
+    errors: list[str]
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Verify alignment across source jsonl, manifest, html files, and result json."
+    )
+    parser.add_argument("--source-jsonl", required=True, help="原始 crawl jsonl 文件路径。")
+    parser.add_argument("--manifest", required=True, help="完整 manifest.jsonl 文件路径。")
+    parser.add_argument(
+        "--cluster-manifest",
+        default="",
+        help="cluster_manifest.json 路径。提供后会从其中自动读取 html-dir / result-dir / 文件子集 / schema-path。",
+    )
+    parser.add_argument("--html-dir", default="", help="HTML 文件目录。")
+    parser.add_argument("--result-dir", default="", help="解析结果 JSON 目录。")
+    parser.add_argument("--schema-json", default="", help="cluster 对应的 schema.json 路径。")
+    parser.add_argument("--output", default="", help="QA 报告输出路径（可选）。")
+    parser.add_argument(
+        "--fields",
+        nargs="*",
+        default=None,
+        help="要校验是否出现在 HTML 中的结果字段。未提供时会优先从 schema 自动推断，否则回退到 title content。",
+    )
+    return parser.parse_args()
+
+
+def load_jsonl(path: Path) -> list[dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    with path.open("r", encoding="utf-8") as fp:
+        for line_no, line in enumerate(fp, start=1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                rows.append(json.loads(line))
+            except json.JSONDecodeError as exc:
+                raise ValueError(f"Invalid JSON in {path} line {line_no}: {exc}") from exc
+    return rows
+
+
+def load_target_files(
+    manifest_rows: list[dict[str, Any]], cluster_manifest_path: Path | None
+) -> tuple[list[dict[str, Any]], str, str, str]:
+    if not cluster_manifest_path:
+        return manifest_rows, "", "", ""
+
+    cluster_manifest = json.loads(cluster_manifest_path.read_text(encoding="utf-8"))
+    wanted = {item["filename"] for item in cluster_manifest.get("files", [])}
+    filtered_rows = [row for row in manifest_rows if row.get("filename") in wanted]
+    html_dir = cluster_manifest.get("input_dir", "")
+    result_dir = cluster_manifest.get("result_dir", "")
+    schema_path = cluster_manifest.get("schema_path", "")
+    return filtered_rows, html_dir, result_dir, schema_path
+
+
+def derive_fields_from_schema(schema_path: Path) -> list[str]:
+    schema = json.loads(schema_path.read_text(encoding="utf-8"))
+    fields: list[str] = []
+    for field_name, field_meta in schema.items():
+        if not isinstance(field_meta, dict):
+            continue
+        if field_meta.get("type") == "string":
+            fields.append(field_name)
+    return fields
+
+
+def normalize_text(value: Any) -> str:
+    """Collapse every whitespace run in *value* to a single space and trim the ends; None yields ""."""
+    if value is None:
+        return ""
+    text = value if isinstance(value, str) else str(value)
+    return " ".join(text.split())
+
+
+def check_field_in_html(field_value: Any, html_text: str) -> dict[str, Any]:
+    """Report whether the normalized field value occurs in the raw HTML or in its BeautifulSoup-extracted text."""
+    normalized_value = normalize_text(field_value)
+    if not normalized_value:  # nothing to search for: skip HTML normalization and parsing entirely
+        return {
+            "value_present": False,
+            "raw_html_match": False,
+            "text_match": False,
+            "substring_match": False,
+            "value_len": 0,
+        }
+    raw_html_match = normalized_value in normalize_text(html_text)
+    extracted_text = BeautifulSoup(html_text, "html.parser").get_text(" ", strip=True)
+    text_match = normalized_value in normalize_text(extracted_text)
+    return {
+        "value_present": True,
+        "raw_html_match": raw_html_match,
+        "text_match": text_match,
+        "substring_match": raw_html_match or text_match,
+        "value_len": len(normalized_value),
+    }
+
+
+def main() -> None:
+    """Cross-check source jsonl, manifest, HTML and result JSON; print (and optionally save) a QA report."""
+    args = parse_args()
+    source_jsonl = Path(args.source_jsonl)
+    manifest_path = Path(args.manifest)
+    cluster_manifest_path = Path(args.cluster_manifest) if args.cluster_manifest else None
+    output_path = Path(args.output) if args.output else None
+    # Manifest "source_line" counts physical lines but load_jsonl drops blank ones, so key rows by line number.
+    with source_jsonl.open("r", encoding="utf-8") as fp:
+        nonblank_line_nos = [line_no for line_no, line in enumerate(fp, start=1) if line.strip()]
+    source_rows = dict(zip(nonblank_line_nos, load_jsonl(source_jsonl)))
+    manifest_rows = load_jsonl(manifest_path)
+    target_rows, cluster_html_dir, cluster_result_dir, cluster_schema_path = load_target_files(
+        manifest_rows, cluster_manifest_path
+    )
+
+    html_dir_str = args.html_dir or cluster_html_dir
+    result_dir_str = args.result_dir or cluster_result_dir
+    schema_json_str = args.schema_json or cluster_schema_path
+    if not html_dir_str or not result_dir_str:
+        raise ValueError("html-dir 和 result-dir 不能为空；可直接传参，或通过 cluster-manifest 提供。")
+    html_dir = Path(html_dir_str)
+    result_dir = Path(result_dir_str)
+    schema_json_path = Path(schema_json_str) if schema_json_str else None
+
+    if args.fields is not None:
+        fields_to_check = args.fields
+    elif schema_json_path and schema_json_path.exists():
+        fields_to_check = derive_fields_from_schema(schema_json_path)
+        if not fields_to_check:
+            fields_to_check = ["title", "content"]
+    else:
+        fields_to_check = ["title", "content"]
+
+    reports: list[FileReport] = []
+    ok_count = 0
+
+    for manifest_row in target_rows:
+        filename = manifest_row["filename"]
+        source_line = manifest_row.get("source_line")
+        url = manifest_row.get("url")
+        track_id = manifest_row.get("track_id")
+
+        html_path = html_dir / filename
+        result_path = result_dir / filename.replace(".html", ".json")
+        errors: list[str] = []
+        html_exists = html_path.exists()
+        result_exists = result_path.exists()
+        source_match = False
+        html_len_match = False
+        field_checks: dict[str, dict[str, Any]] = {}
+
+        html_text = ""
+        if html_exists:
+            html_text = html_path.read_text(encoding="utf-8")
+        else:
+            errors.append(f"missing_html:{html_path}")
+
+        result_data: dict[str, Any] = {}
+        if result_exists:
+            result_data = json.loads(result_path.read_text(encoding="utf-8"))
+        else:
+            errors.append(f"missing_result:{result_path}")
+
+        source_row = source_rows.get(source_line)  # None when the line number is absent, blank or out of range
+        if source_row is not None:
+            source_match = (
+                source_row.get("track_id") == track_id
+                and source_row.get("url") == url
+            )
+            if not source_match:
+                errors.append("source_manifest_mismatch")
+
+            source_html = source_row.get("html") or ""  # an explicit JSON null must not reach len()
+            html_len_match = len(source_html) == manifest_row.get("html_len")
+            if not html_len_match:
+                errors.append("source_manifest_html_len_mismatch")
+
+            if html_exists and len(html_text) != len(source_html):
+                errors.append("source_html_file_len_mismatch")
+            elif html_exists and html_text != source_html:
+                errors.append("source_html_file_content_mismatch")
+        else:
+            errors.append("invalid_source_line")
+
+        if html_exists and result_exists:
+            for field in fields_to_check:
+                field_checks[field] = check_field_in_html(result_data.get(field), html_text)
+                if field_checks[field]["value_present"] and not field_checks[field]["substring_match"]:
+                    errors.append(f"field_not_found_in_html:{field}")
+
+        ok = not errors
+        if ok:
+            ok_count += 1
+
+        reports.append(
+            FileReport(
+                filename=filename,
+                source_line=source_line,
+                url=url,
+                track_id=track_id,
+                html_exists=html_exists,
+                result_exists=result_exists,
+                source_match=source_match,
+                html_len_match=html_len_match,
+                field_checks=field_checks,
+                ok=ok,
+                errors=errors,
+            )
+        )
+
+    summary = {
+        "source_jsonl": str(source_jsonl),
+        "manifest": str(manifest_path),
+        "cluster_manifest": str(cluster_manifest_path) if cluster_manifest_path else "",
+        "html_dir": str(html_dir),
+        "result_dir": str(result_dir),
+        "schema_json": str(schema_json_path) if schema_json_path else "",
+        "fields_checked": fields_to_check,
+        "total_files": len(reports),
+        "ok_files": ok_count,
+        "failed_files": len(reports) - ok_count,
+        "reports": [asdict(report) for report in reports],
+    }
+
+    if output_path:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
+        print(f"qa_report: {output_path}")
+
+    print(f"total_files: {summary['total_files']}")
+    print(f"ok_files: {summary['ok_files']}")
+    print(f"failed_files: {summary['failed_files']}")
+
+    for report in reports:
+        status = "OK" if report.ok else "FAIL"
+        print(f"{status} {report.filename}")
+        if report.errors:
+            print(f"  errors: {', '.join(report.errors)}")
+
+
+if __name__ == "__main__":
+    main()

From 47e41abafde7ef4f6347e2bde66ffa21f1ae0a21 Mon Sep 17 00:00:00 2001
From: ql101 <ql101@duke.edu>
Date: Tue, 14 Apr 2026 14:07:18 +0800
Subject: [PATCH 2/5] Improve pipeline ergonomics and HTML preprocessing

---
 README_JUPYTER.md                 | 195 ++++++++++++++++++++++++++++++
 start.sh                          |  13 +-
 web2json/simple.py                |  28 ++++-
 web2json/tools/html_simplifier.py |  76 ++++++++++++
 web2json/utils/llm_client.py      |   9 +-
 5 files changed, 311 insertions(+), 10 deletions(-)
 create mode 100644 README_JUPYTER.md

diff --git a/README_JUPYTER.md b/README_JUPYTER.md
new file mode 100644
index 0000000..101160a
--- /dev/null
+++ b/README_JUPYTER.md
@@ -0,0 +1,195 @@
+# web2json-agent Jupyter Guide
+
+本文档基于 [scripts/run_jsonl_web2json_pipeline.py](scripts/run_jsonl_web2json_pipeline.py) 编写，目标是在 Jupyter 里直接跑完整的 `jsonl -> html -> classify -> schema -> code -> data` 流水线。
+
+它不覆盖项目原始 [README.md](README.md)。
+
+## 这份文档对应哪条执行链路
+
+这里用的不是最简单的 `extract_data(...)` 单接口方案，而是项目里的完整脚本流水线:
+
+- 入口脚本: [scripts/run_jsonl_web2json_pipeline.py](scripts/run_jsonl_web2json_pipeline.py)
+- Jupyter 包装: [jupyter_helper.py](jupyter_helper.py)
+- Notebook helper 实现: [notebooks/jupyter_helper.py](notebooks/jupyter_helper.py)
+- 示例 notebook: [notebooks/web2json_quickstart.ipynb](notebooks/web2json_quickstart.ipynb)
+
+## 流水线做了什么
+
+脚本会按下面顺序执行:
+
+1. 读取 `jsonl`
+2. 从每条记录里取出 `html` 字段
+3. 拆成一批 `.html` 文件，并生成 `manifest.jsonl`
+4. 对 HTML 做 `classify_html_dir`
+5. 对每个 cluster 执行 `extract_schema`
+6. 执行 `infer_code`
+7. 用生成的 parser 执行 `extract_data_with_code`
+8. 输出 `pipeline_summary.json`
+
+适合这种输入数据:
+
+- 原始数据是 `jsonl`
+- 每行是一条网页记录
+- 每条记录里有 `html` 字段
+- 可能还带 `url`、`track_id`、`status`
+
+## Jupyter 最短路径
+
+### 1. 进入项目目录
+
+```bash
+cd /path/to/web2json-agent
+```
+
+### 2. 安装项目
+
+请显式使用 `python3.11`，不要用系统默认的旧版 `python3`。
+
+```bash
+python3.11 -m pip install .
+```
+
+### 3. 启动 Jupyter
+
+```bash
+python3.11 -m notebook
+```
+
+或者:
+
+```bash
+python3.11 -m jupyter lab
+```
+
+### 4. 打开示例 notebook
+
+打开:
+
+`notebooks/web2json_quickstart.ipynb`
+
+## Notebook 最小示例
+
+### Cell 1: 初始化环境
+
+```python
+from jupyter_helper import prepare_notebook
+
+prepare_notebook(
+    api_key="YOUR_API_KEY",
+    api_base="https://api.openai.com/v1",
+)
+```
+
+### Cell 2: 运行完整 JSONL pipeline
+
+```python
+from jupyter_helper import run_jsonl_pipeline, summarize_pipeline_result
+
+result = run_jsonl_pipeline(
+    source_jsonl="ToClassify/sample.json",
+    work_id="sample_run",
+    input_root="input_html",
+    output_root="output",
+    html_key="html",
+    iteration_rounds=3,
+    cluster_limit=1,
+)
+
+summarize_pipeline_result(result)
+```
+
+### Cell 3: 查看完整结果
+
+```python
+result.to_dict()
+```
+
+## 也可以直接调用原脚本
+
+如果你不想通过 helper，也可以在 notebook 里直接 import 原脚本里的函数:
+
+```python
+from scripts.run_jsonl_web2json_pipeline import run_jsonl_pipeline
+
+result = run_jsonl_pipeline(
+    source_jsonl="ToClassify/sample.json",
+    work_id="sample_run",
+)
+```
+
+这就是 [scripts/run_jsonl_web2json_pipeline.py](scripts/run_jsonl_web2json_pipeline.py) 里新增的 notebook-friendly 入口。
+
+## 参数说明
+
+`run_jsonl_pipeline(...)` 主要参数:
+
+- `source_jsonl`: 源 `jsonl` 路径
+- `work_id`: 这次运行的标识；为空时按文件名自动生成
+- `input_root`: 拆分后 HTML 的输出根目录，默认 `input_html`
+- `output_root`: pipeline 输出根目录，默认 `output`
+- `html_key`: `jsonl` 中 HTML 字段名，默认 `html`
+- `iteration_rounds`: schema 学习轮数上限，默认 `3`
+- `cluster_limit`: 最多处理多少个 cluster，默认 `0`，表示全部
+
+## 结果会落到哪里
+
+如果你设置:
+
+```python
+result = run_jsonl_pipeline(
+    source_jsonl="ToClassify/sample.json",
+    work_id="sample_run",
+)
+```
+
+通常会生成:
+
+- `input_html/sample_run/`
+- `output/sample_run_pipeline/`
+- `output/sample_run_pipeline/pipeline_summary.json`
+
+每个 cluster 下面还会有:
+
+- schema 输出目录
+- code 输出目录
+- data 输出目录
+- 最终 parser 文件
+
+## API Key 配置
+
+你可以二选一:
+
+### 方式 A: 在 notebook 里设置
+
+```python
+from jupyter_helper import prepare_notebook
+
+prepare_notebook(
+    api_key="YOUR_API_KEY",
+    api_base="https://api.openai.com/v1",
+)
+```
+
+### 方式 B: 在项目根目录放 `.env`
+
+```env
+OPENAI_API_KEY=YOUR_API_KEY
+OPENAI_API_BASE=https://api.openai.com/v1
+DEFAULT_MODEL=gpt-4.1
+```
+
+## 已知前提
+
+- Python 要求 `>= 3.10`
+- 部分机器上默认的 `python3` 可能是旧版本（例如 `3.7.3`），不满足上述要求
+- 建议始终显式使用 `python3.11`
+- 这条流水线依赖模型 API，可用前需要配置好 key/base
+
+## 相关文件
+
+- [README.md](README.md)
+- [README_JUPYTER.md](README_JUPYTER.md)
+- [scripts/run_jsonl_web2json_pipeline.py](scripts/run_jsonl_web2json_pipeline.py)
+- [jupyter_helper.py](jupyter_helper.py)
+- [notebooks/jupyter_helper.py](notebooks/jupyter_helper.py)
+- [notebooks/web2json_quickstart.ipynb](notebooks/web2json_quickstart.ipynb)
diff --git a/start.sh b/start.sh
index f2cd712..f7d735c 100755
--- a/start.sh
+++ b/start.sh
@@ -3,6 +3,9 @@
 # Web2JSON Agent - Startup Script
 # Starts both the backend API and the frontend UI simultaneously
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PYTHON_BIN="${PYTHON_BIN:-python3.11}"
+
 echo "🚀 Starting Web2JSON Agent..."
 echo ""
 
@@ -21,14 +24,14 @@ fi
 
 # Start the backend
 echo "📡 Starting backend API (port 8000)..."
-cd /Users/brown/Projects/AILabProject/web2json-agent
+cd "$SCRIPT_DIR"
 
 # Create logs directory if it doesn't exist
 mkdir -p logs
 
 # Production Mode: Disable automatic reloading to avoid restarts triggered by changes in the output directory
 # If you need reload for development, use: --reload --reload-exclude 'output/**' --reload-exclude 'logs/**'
-uvicorn web2json_api.main:app --host 0.0.0.0 --port 8000 \
+"$PYTHON_BIN" -m uvicorn web2json_api.main:app --host 0.0.0.0 --port 8000 \
   --reload-exclude 'output/**' \
   --reload-exclude 'logs/**' \
   --reload-exclude '*.log' \
@@ -49,7 +52,7 @@ fi
 # Start the frontend
 echo ""
 echo "🎨 Starting frontend UI (port 5173)..."
-cd web2json_ui && npm run dev > ../logs/ui.log 2>&1 &
+cd "$SCRIPT_DIR/web2json_ui" && npm run dev > "$SCRIPT_DIR/logs/ui.log" 2>&1 &
 FRONTEND_PID=$!
 echo "   Frontend PID: $FRONTEND_PID"
 
@@ -70,8 +73,8 @@ echo "Or press Ctrl+C and run: pkill -f 'uvicorn|vite'"
 echo ""
 
 # Save PID
-echo $BACKEND_PID > .backend.pid
-echo $FRONTEND_PID > .frontend.pid
+echo $BACKEND_PID > "$SCRIPT_DIR/.backend.pid"
+echo $FRONTEND_PID > "$SCRIPT_DIR/.frontend.pid"
 
 # Wait for user interruption
 wait
diff --git a/web2json/simple.py b/web2json/simple.py
index 235d7c5..95b8ebe 100644
--- a/web2json/simple.py
+++ b/web2json/simple.py
@@ -7,6 +7,7 @@
 from pathlib import Path
 from typing import Optional, Dict, List, Any
 from dataclasses import dataclass, asdict
+import numpy as np
 from loguru import logger
 
 from web2json.agent import ParserAgent
@@ -1050,15 +1051,39 @@ def classify_html_dir(config: Web2JsonConfig) -> ClusterResult:
     # 执行聚类分析
     logger.info("正在进行布局聚类分析...")
     from web2json.tools.cluster import cluster_html_layouts_optimized
+    from web2json.tools.html_layout_cosin import get_feature
+
+    valid_html_files = []
+    valid_html_contents = []
+    invalid_html_files = []
+    for file_path, html_content in zip(html_files, html_contents):
+        try:
+            feature = get_feature(html_content)
+        except Exception as e:
+            logger.warning(f"  跳过布局特征提取失败页面: {file_path} ({e})")
+            invalid_html_files.append(file_path)
+            continue
+        if not feature:
+            logger.warning(f"  跳过无有效布局特征页面: {file_path}")
+            invalid_html_files.append(file_path)
+            continue
+        valid_html_files.append(file_path)
+        valid_html_contents.append(html_content)
+
+    if not valid_html_contents:
+        raise Exception("聚类失败: 没有可用于布局聚类的有效HTML页面")
 
     try:
         labels, sim_mat, clusters = cluster_html_layouts_optimized(
-            html_contents,
+            valid_html_contents,
             use_knn_graph=True
         )
     except Exception as e:
         raise Exception(f"聚类失败: {e}")
 
+    label_map = {file_path: int(label) for file_path, label in zip(valid_html_files, labels)}
+    labels = np.array([label_map.get(file_path, -1) for file_path in html_files], dtype=int)
+
     # 统计聚类结果
     unique_labels = sorted(set(labels))
     noise_count = sum(1 for l in labels if l == -1)
@@ -1160,4 +1185,3 @@ def classify_html_dir(config: Web2JsonConfig) -> ClusterResult:
         noise_files=noise_files,
         cluster_count=cluster_count
     )
-
diff --git a/web2json/tools/html_simplifier.py b/web2json/tools/html_simplifier.py
index 33f34ae..1a2581c 100644
--- a/web2json/tools/html_simplifier.py
+++ b/web2json/tools/html_simplifier.py
@@ -265,6 +265,77 @@ def remove_empty_tags(
     return root
 
 
+def _contains_token_attr(element: html.HtmlElement, attr_name: str, patterns: List[str]) -> bool:
+    """Return True if the lowercased value of ``attr_name`` contains any of ``patterns`` as a substring."""
+    attr_value = element.get(attr_name, '')
+    if not attr_value:  # attribute missing or empty: nothing to match
+        return False
+
+    attr_value = attr_value.lower()  # only the attribute value is lowercased; patterns are used as given
+    return any(pattern in attr_value for pattern in patterns)
+
+
+def is_sharepoint_html(html_str: str) -> bool:
+    """Heuristically detect a SharePoint page by searching the lowercased HTML for known marker strings."""
+    markers = [
+        'microsoft sharepoint',
+        '_sppagecontextinfo',
+        's4-workspace',
+        '_layouts/15',
+    ]
+    html_lower = html_str.lower()  # markers above are lowercase, so compare against lowercased input
+    return any(marker in html_lower for marker in markers)
+
+
+def remove_sharepoint_noise(root: html.HtmlElement) -> html.HtmlElement:
+    """
+    Remove high-noise regions of a SharePoint portal template.
+
+    Mainly strips site-wide navigation, headers/footers, social-media areas, noindex containers and mega menus.
+    """
+    class_patterns = [
+        'noindex',
+        'mega-menu',
+        'mega-sub-menu',
+        'mega-menu-wrap',
+        'mega-menu-toggle',
+        'top-header',
+        'button-close-top-header',
+        'social-media-header',
+        'breadcrumbs',
+        'breadcrumb',
+        'ms-csrlistview-controldiv',
+    ]
+    id_patterns = [
+        'top-header',
+        'top-menu',
+        'main-menu',
+        'mega-menu',
+        'social-media-header',
+        'navigationmenu',
+        'footer',
+        'ctl00_placeholdersitenamen',
+    ]
+    tag_names = {'header', 'footer', 'nav'}
+
+    remove_targets = []
+    for element in root.iter():
+        tag = str(element.tag).lower() if hasattr(element, 'tag') else ''
+        if tag in tag_names:  # structural tags are removed regardless of their attributes
+            remove_targets.append(element)
+            continue
+        if _contains_token_attr(element, 'class', class_patterns):  # substring match on the class attribute
+            remove_targets.append(element)
+            continue
+        if _contains_token_attr(element, 'id', id_patterns):  # substring match on the id attribute
+            remove_targets.append(element)
+
+    # De-duplicate (order-preserving) so the same element is not removed twice
+    unique_targets = list(dict.fromkeys(remove_targets))
+    remove_reversely(unique_targets)  # NOTE(review): presumably detaches targets in reverse order; confirm in helper
+    return root
+
+
 def clean_attributes(
     root: html.HtmlElement,
     keep_attrs: List[str] = None
@@ -440,6 +511,11 @@ def simplify_html(
                 clean_attrs=True,
                 keep_attrs=keep_attrs_list
             )
+            if is_sharepoint_html(html_str):
+                simplified_root = html_to_element(result)
+                simplified_root = remove_sharepoint_noise(simplified_root)
+                simplified_root = remove_empty_tags(simplified_root)
+                result = element_to_html(simplified_root)
         # 根据aggressive参数选择模式
         elif aggressive:
             # 激进模式：删除所有无用内容
diff --git a/web2json/utils/llm_client.py b/web2json/utils/llm_client.py
index 59913a8..46c62e7 100644
--- a/web2json/utils/llm_client.py
+++ b/web2json/utils/llm_client.py
@@ -17,9 +17,9 @@
 env_path = project_root / ".env"
 load_dotenv(env_path)
 
-# 验证
-if not os.getenv("OPENAI_API_KEY"):
-    raise ValueError(f".env 文件路径: {env_path}, API Key未加载")
+# 验证（延迟到实际使用时）
+_api_key_missing = not os.getenv("OPENAI_API_KEY")
+_env_path_for_error = env_path
 
 # 定义场景类型
 ScenarioType = Literal["default", "code_gen", "agent"]
@@ -81,6 +81,9 @@ def __init__(
         if self._initialized:
             return
 
+        if _api_key_missing and not os.getenv("OPENAI_API_KEY"):
+            raise ValueError(f".env 文件路径: {_env_path_for_error}, API Key未加载")
+
         self.api_key = api_key or settings.openai_api_key
         self.api_base = api_base or settings.openai_api_base
         self.model = model or settings.default_model

From 6ad88f9710486cd9d828ff9d1168aaeecf9b302a Mon Sep 17 00:00:00 2001
From: root <root@PJNL231040017.pjlab.org>
Date: Wed, 15 Apr 2026 14:18:32 +0800
Subject: [PATCH 3/5] feat: jsonl pipeline retries, stats, and site-wide token
 aggregation

- Add LLM invoke retry with exponential backoff and llm_retry_stats in pipeline summary
- Extend settings for LLM timeout and retry env vars
- Schema merge failures now record error in phase result; schema_extraction uses shared retry
- run_jsonl_web2json_pipeline: merge-summary from disk when no prior summary, --only-failed, per-cluster and pipeline elapsed time
- Add aggregate_site_pipeline_stats.py to sum token usage and time across jsonl pipeline outputs

Made-with: Cursor
---
 scripts/aggregate_site_pipeline_stats.py | 223 +++++++++++++++
 scripts/run_jsonl_web2json_pipeline.py   | 343 ++++++++++++++++++++---
 web2json/agent/phases/schema_phase.py    |   5 +-
 web2json/config/settings.py              |  22 ++
 web2json/tools/schema_extraction.py      |  22 +-
 web2json/utils/llm_client.py             |  13 +-
 web2json/utils/llm_retry.py              | 135 +++++++++
 7 files changed, 708 insertions(+), 55 deletions(-)
 create mode 100644 scripts/aggregate_site_pipeline_stats.py
 create mode 100644 web2json/utils/llm_retry.py

diff --git a/scripts/aggregate_site_pipeline_stats.py b/scripts/aggregate_site_pipeline_stats.py
new file mode 100644
index 0000000..61a06e5
--- /dev/null
+++ b/scripts/aggregate_site_pipeline_stats.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
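+"""
+Aggregate statistics from the pipeline output directory that corresponds to every jsonl file in a directory:
+- LLM tokens (from total_token_usage in each pipeline_summary.json)
+- Time (sum of each run's pipeline_elapsed_seconds, which is itself the sum of script-timed per-cluster durations)
+- llm_retry_stats totals, when present
+
+The work_id rule matches run_jsonl_web2json_pipeline.slugify(jsonl file name without its suffix).
+"""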
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Any
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
+
+def slugify(value: str) -> str:  # same rule as slugify in run_jsonl_web2json_pipeline.py so work_ids line up
+    value = re.sub(r"[^a-zA-Z0-9]+", "_", value).strip("_").lower()  # collapse non-alphanumerics, trim "_", lowercase
+    return value or "run"  # fall back to "run" when nothing usable is left
+
+
+def pipeline_extract_complete(pipeline_root: Path) -> tuple[bool, str]:
+    """Check that each cluster's HTML file count equals its extract_data/result JSON count; return (ok, reason)."""
+    clusters_dir = pipeline_root / "classify" / "clusters"
+    if not clusters_dir.is_dir():
+        return False, f"缺少目录: {clusters_dir}"
+    for cluster_dir in sorted(clusters_dir.iterdir()):
+        if not cluster_dir.is_dir() or not cluster_dir.name.startswith("cluster_"):
+            continue  # only cluster_* subdirectories are checked
+        cname = cluster_dir.name
+        n_html = len(list(cluster_dir.glob("*.html"))) + len(list(cluster_dir.glob("*.htm")))
+        rd = pipeline_root / f"{cname}_extract_data" / "result"
+        n_json = len(list(rd.glob("*.json"))) if rd.is_dir() else 0  # a missing result directory counts as zero
+        if n_html != n_json:
+            return False, f"{cname}: html={n_html} json={n_json}"  # report the first mismatching cluster only
+    return True, ""
+
+
+def load_summary(path: Path) -> dict[str, Any]:  # parse a UTF-8 JSON file; the root type is not validated here
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def main() -> None:  # CLI entry: sum token/time/retry stats over every *.jsonl's pipeline_summary.json
+    parser = argparse.ArgumentParser(
+        description="汇总某目录下所有 jsonl 对应 pipeline 的 token / 时间 / retry。"
+    )
+    parser.add_argument(
+        "site_dir",
+        nargs="?",
+        default="Prod/ms-web-jwn",
+        help="包含 *.jsonl 的目录（相对项目根或绝对路径），默认 Prod/ms-web-jwn",
+    )
+    parser.add_argument(
+        "--output-root",
+        default="output",
+        help="流水线输出根目录，默认 output",
+    )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="除存在 pipeline_summary 外，还校验各簇 HTML 数与 result JSON 数一致",
+    )
+    parser.add_argument(
+        "--allow-partial",
+        action="store_true",
+        help="有缺失或未通过 strict 时仍打印汇总且退出码为 0（默认识别到问题则退出 1）",
+    )
+    parser.add_argument(
+        "--json",
+        action="store_true",
+        dest="json_out",
+        help="输出完整 JSON（便于脚本解析）",
+    )
+    args = parser.parse_args()
+
+    site_dir = Path(args.site_dir).expanduser()
+    if not site_dir.is_absolute():
+        site_dir = (PROJECT_ROOT / site_dir).resolve()  # relative paths are anchored at the project root
+    if not site_dir.is_dir():
+        raise SystemExit(f"目录不存在: {site_dir}")
+
+    output_root = Path(args.output_root).expanduser()
+    if not output_root.is_absolute():
+        output_root = (PROJECT_ROOT / output_root).resolve()
+
+    jsonl_files = sorted(site_dir.glob("*.jsonl"))
+    if not jsonl_files:
+        raise SystemExit(f"目录下无 *.jsonl: {site_dir}")
+
+    rows: list[dict[str, Any]] = []
+    tot_in = tot_out = tot_tok = 0
+    tot_req = 0
+    tot_elapsed = 0.0
+    tot_retry = 0
+    errors: list[str] = []
+
+    for jp in jsonl_files:
+        work_id = slugify(jp.stem)
+        pr = output_root / f"{work_id}_pipeline"
+        sp = pr / "pipeline_summary.json"
+        row: dict[str, Any] = {
+            "jsonl": jp.name,
+            "work_id": work_id,
+            "pipeline_root": str(pr),
+        }
+        if not sp.is_file():
+            row["error"] = "missing pipeline_summary.json"
+            errors.append(f"{jp.name}: 无 {sp}")
+            rows.append(row)
+            continue
+
+        try:
+            summary = load_summary(sp)
+        except (OSError, ValueError) as e:  # ValueError covers JSONDecodeError and UnicodeDecodeError
+            row["error"] = f"invalid summary: {e}"
+            errors.append(f"{jp.name}: {e}")
+            rows.append(row)
+            continue
+
+        if args.strict:
+            ok, msg = pipeline_extract_complete(pr)
+            if not ok:
+                row["error"] = f"incomplete extract: {msg}"
+                errors.append(f"{jp.name}: {msg}")
+                rows.append(row)
+                continue
+
+        usage = summary.get("total_token_usage")
+        usage = usage if isinstance(usage, dict) else {}  # tolerate a missing or malformed usage section
+        n_in = int(usage.get("total_input_tokens", 0) or 0)  # "or 0" maps null to 0 so table formatting cannot fail
+        n_out = int(usage.get("total_completion_tokens", 0) or 0)
+        n_tok = int(usage.get("total_tokens", 0) or 0)
+        n_req = int(usage.get("request_count", 0) or 0)
+        tot_in, tot_out = tot_in + n_in, tot_out + n_out
+        tot_tok, tot_req = tot_tok + n_tok, tot_req + n_req
+
+        elapsed = summary.get("pipeline_elapsed_seconds")
+        if elapsed is not None:
+            tot_elapsed += float(elapsed)
+
+        retry_stats = summary.get("llm_retry_stats")
+        n_retry = int(retry_stats.get("llm_retry_events", 0) or 0) if isinstance(retry_stats, dict) else 0
+        tot_retry += n_retry
+
+        row["total_input_tokens"] = n_in  # per-row values reuse the normalized ints that feed the totals
+        row["total_completion_tokens"] = n_out
+        row["total_tokens"] = n_tok
+        row["request_count"] = n_req
+        row["pipeline_elapsed_seconds"] = float(elapsed) if elapsed is not None else None
+        row["llm_retry_events"] = n_retry
+        rows.append(row)
+
+    aggregate = {
+        "site_dir": str(site_dir),
+        "output_root": str(output_root),
+        "jsonl_count": len(jsonl_files),
+        "ok_count": sum(1 for r in rows if "error" not in r),
+        "error_count": sum(1 for r in rows if "error" in r),
+        "total_token_usage": {
+            "request_count": tot_req,
+            "total_input_tokens": tot_in,
+            "total_completion_tokens": tot_out,
+            "total_tokens": tot_tok,
+        },
+        "pipeline_elapsed_seconds_sum": round(tot_elapsed, 3),
+        "llm_retry_events_sum": tot_retry,
+        "rows": rows,
+    }
+
+    if args.json_out:
+        print(json.dumps(aggregate, ensure_ascii=False, indent=2))
+    else:
+        print(f"目录: {site_dir}")
+        print(f"输出根: {output_root}")
+        print(f"jsonl 数: {len(jsonl_files)}  成功汇总: {aggregate['ok_count']}  失败/跳过: {aggregate['error_count']}")
+        print()
+        hdr = f"{'jsonl':<56} {'input':>10} {'output':>10} {'total':>10} {'秒':>10} {'retry':>6}"
+        print(hdr)
+        print("-" * len(hdr))
+        for r in rows:
+            if "error" in r:
+                print(f"{r['jsonl']:<56}  ERROR: {r['error']}")
+            else:
+                print(
+                    f"{r['jsonl']:<56} "
+                    f"{r['total_input_tokens']:>10} "
+                    f"{r['total_completion_tokens']:>10} "
+                    f"{r['total_tokens']:>10} "
+                    f"{r['pipeline_elapsed_seconds'] or 0:>10.3f} "
+                    f"{r.get('llm_retry_events', 0):>6}"
+                )
+        print("-" * len(hdr))
+        print(
+            f"{'合计':<56} "
+            f"{tot_in:>10} "
+            f"{tot_out:>10} "
+            f"{tot_tok:>10} "
+            f"{tot_elapsed:>10.3f} "
+            f"{tot_retry:>6}"
+        )
+        print()
+        print(
+            "说明: 「秒」为各次 pipeline 的 pipeline_elapsed_seconds 之和（簇耗时相加）；"
+            "若某次 summary 无该字段则按 0。"
+        )
+        if errors:
+            print("\n问题:")
+            for e in errors:
+                print(f"  - {e}")
+
+    if aggregate["error_count"] and not args.allow_partial:  # non-zero exit unless partial results are allowed
+        raise SystemExit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_jsonl_web2json_pipeline.py b/scripts/run_jsonl_web2json_pipeline.py
index bf7ff33..d91d7b0 100644
--- a/scripts/run_jsonl_web2json_pipeline.py
+++ b/scripts/run_jsonl_web2json_pipeline.py
@@ -15,9 +15,10 @@
 import json
 import re
 import sys
+import time
 from dataclasses import asdict, dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional
 
 PROJECT_ROOT = Path(__file__).resolve().parents[1]
 if str(PROJECT_ROOT) not in sys.path:
@@ -31,6 +32,7 @@
     extract_data_with_code,
 )
 from web2json.utils.llm_client import LLMClient
+from web2json.utils.llm_retry import get_retry_stats, reset_retry_stats
 
 
 @dataclass
@@ -50,7 +52,12 @@ def to_dict(self) -> dict[str, Any]:
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Run web2json pipeline on a crawl jsonl.")
-    parser.add_argument("--source-jsonl", required=True, help="源 jsonl 文件路径。")
+    src = parser.add_mutually_exclusive_group(required=True)
+    src.add_argument("--source-jsonl", help="单个源 jsonl 文件路径。")
+    src.add_argument(
+        "--source-dir",
+        help="目录下所有 *.jsonl 依次全量跑流水线（与 --source-jsonl 二选一）。",
+    )
     parser.add_argument(
         "--work-id",
         default="",
@@ -83,45 +90,108 @@ def parse_args() -> argparse.Namespace:
         default=0,
         help="最多处理多少个 cluster，0 表示全部处理。",
     )
+    parser.add_argument(
+        "--fields",
+        default="",
+        help='预定义要抽取的字段，逗号分隔，类型均为 string，如 "title,content"。为空则走 auto schema。',
+    )
+    parser.add_argument(
+        "--max-jsonl-files",
+        type=int,
+        default=0,
+        help="与 --source-dir 联用：最多处理前 N 个 jsonl（按文件名排序），0 表示不限制。",
+    )
+    parser.add_argument(
+        "--schema-json",
+        default="",
+        help="Predefined schema JSON 文件路径（与 README Predefined Mode 一致），优先级高于 --fields。",
+    )
+    parser.add_argument(
+        "--skip-manifest",
+        action="store_true",
+        help="不重新从 jsonl 拆 HTML/manifest（需已存在 input_root/work_id/）。",
+    )
+    parser.add_argument(
+        "--skip-classify",
+        action="store_true",
+        help="不重新聚类，直接复用 pipeline_root/classify/clusters/（补跑失败簇时用）。",
+    )
+    parser.add_argument(
+        "--only-clusters",
+        default="",
+        help='只处理指定簇，逗号分隔，如 "cluster_4,cluster_5"。为空表示全部簇。',
+    )
+    parser.add_argument(
+        "--merge-summary",
+        action="store_true",
+        help="写回 pipeline_summary.json 时与已有 summary 按 cluster_name 合并（补跑时用）。",
+    )
+    parser.add_argument(
+        "--only-failed",
+        action="store_true",
+        help="仅补跑「HTML 数量 > result 下 JSON 数量」的簇（需已有 classify/extract 目录；可与 --only-clusters 叠加）。",
+    )
     return parser.parse_args()
 
 
+def load_schema_json(path: str) -> Optional[dict[str, Any]]:  # returns None when no path is given
+    if not path or not path.strip():
+        return None
+    p = Path(path).expanduser()
+    if not p.is_absolute():
+        p = PROJECT_ROOT / p  # relative paths are resolved against the project root
+    if not p.is_file():
+        raise SystemExit(f"--schema-json not found: {p}")
+    data = json.loads(p.read_text(encoding="utf-8"))  # malformed JSON propagates as json.JSONDecodeError
+    if not isinstance(data, dict):
+        raise SystemExit("--schema-json root must be a JSON object")
+    return data
+
+
+def fields_to_schema(fields_csv: str) -> Optional[dict[str, str]]:  # "a,b" -> {"a": "string", "b": "string"}
+    if not fields_csv or not fields_csv.strip():
+        return None
+    out: dict[str, str] = {}
+    for part in fields_csv.split(","):
+        name = part.strip()
+        if name:  # skip empty segments such as the one produced by a trailing comma
+            out[name] = "string"
+    return out or None  # None when every segment was empty
+
+
 def slugify(value: str) -> str:
     value = re.sub(r"[^a-zA-Z0-9]+", "_", value).strip("_").lower()
     return value or "run"
 
 
-def load_jsonl(path: Path) -> list[tuple[int, dict[str, Any]]]:
-    rows: list[tuple[int, dict[str, Any]]] = []
-    with path.open("r", encoding="utf-8") as fp:
-        for line_no, line in enumerate(fp, start=1):
+def build_html_manifest(source_jsonl: Path, output_dir: Path, html_key: str) -> Path:
+    """逐行流式读取 jsonl，避免大文件一次性载入内存。"""
+    output_dir.mkdir(parents=True, exist_ok=True)
+    manifest_path = output_dir / "manifest.jsonl"
+    idx = 0
+
+    with source_jsonl.open("r", encoding="utf-8") as src_fp, manifest_path.open(
+        "w", encoding="utf-8"
+    ) as manifest_fp:
+        for line_no, line in enumerate(src_fp, start=1):
             line = line.strip()
             if not line:
                 continue
             try:
-                rows.append((line_no, json.loads(line)))
+                row = json.loads(line)
             except json.JSONDecodeError as exc:
                 print(f"skip invalid json line {line_no}: {exc}")
-    return rows
-
-
-def build_html_manifest(source_jsonl: Path, output_dir: Path, html_key: str) -> Path:
-    rows = load_jsonl(source_jsonl)
-    output_dir.mkdir(parents=True, exist_ok=True)
-    manifest_path = output_dir / "manifest.jsonl"
-
-    with manifest_path.open("w", encoding="utf-8") as manifest_fp:
-        for idx, (source_line, row) in enumerate(rows, start=1):
+                continue
             html = row.get(html_key)
             if not isinstance(html, str) or not html.strip():
                 continue
-
+            idx += 1
             filename = f"{idx:04d}.html"
             (output_dir / filename).write_text(html, encoding="utf-8")
 
             manifest_row = {
                 "sample_no": idx,
-                "source_line": source_line,
+                "source_line": line_no,
                 "filename": filename,
                 "track_id": row.get("track_id"),
                 "url": row.get("url"),
@@ -141,6 +211,60 @@ def usage_delta(before: dict[str, int], after: dict[str, int]) -> dict[str, int]
     }
 
 
+def _synthetic_clusters_from_completed_extract(
+    pipeline_root: Path, current_names: set[str]
+) -> list[dict[str, Any]]:
+    """When pipeline_summary.json is absent, infer completed clusters from existing cluster_*_extract_data/result."""
+    out: list[dict[str, Any]] = []
+    for ed in sorted(pipeline_root.glob("cluster_*_extract_data")):
+        cname = ed.name[: -len("_extract_data")]  # strip the suffix to recover the cluster name
+        if cname in current_names:  # names in current_names are skipped
+            continue
+        rd = ed / "result"
+        if not rd.is_dir():
+            continue
+        njson = len(list(rd.glob("*.json")))
+        if njson == 0:  # an empty result directory is not treated as a completed cluster
+            continue
+        out.append(
+            {
+                "cluster_name": cname,
+                "cluster_size": njson,
+                "data_output": str(ed),
+                "note": "from_disk_merge (补跑合并时从结果目录推断)",
+            }
+        )
+    return out
+
+
+def _discover_clusters_from_disk(clusters_dir: Path) -> list[str]:  # sorted names of cluster_* subdirectories
+    if not clusters_dir.is_dir():
+        return []  # a missing directory is treated as "no clusters"
+    names = sorted(
+        p.name
+        for p in clusters_dir.iterdir()
+        if p.is_dir() and p.name.startswith("cluster_")
+    )
+    return names
+
+
+def _underextracted_cluster_names(pipeline_root: Path, clusters_dir: Path) -> list[str]:
+    """Return clusters whose classify HTML count exceeds the JSON count in the matching extract_data/result."""
+    failed: list[str] = []
+    if not clusters_dir.is_dir():
+        return failed
+    for cluster_dir in sorted(clusters_dir.iterdir()):
+        if not cluster_dir.is_dir() or not cluster_dir.name.startswith("cluster_"):
+            continue  # only cluster_* subdirectories are considered
+        cname = cluster_dir.name
+        n_html = len(list(cluster_dir.glob("*.html"))) + len(list(cluster_dir.glob("*.htm")))
+        rd = pipeline_root / f"{cname}_extract_data" / "result"
+        n_json = len(list(rd.glob("*.json"))) if rd.is_dir() else 0  # a missing result directory counts as zero
+        if n_html > n_json:  # strictly fewer JSON results than HTML inputs
+            failed.append(cname)
+    return failed
+
+
 def run_jsonl_pipeline(
     source_jsonl: str,
     work_id: str = "",
@@ -149,6 +273,12 @@ def run_jsonl_pipeline(
     html_key: str = "html",
     iteration_rounds: int = 3,
     cluster_limit: int = 0,
+    schema: Optional[dict[str, Any]] = None,
+    skip_manifest: bool = False,
+    skip_classify: bool = False,
+    only_clusters: Optional[list[str]] = None,
+    merge_summary: bool = False,
+    only_failed: bool = False,
 ) -> PipelineRunResult:
     source_jsonl_path = Path(source_jsonl).expanduser()
     if not source_jsonl_path.is_absolute():
@@ -172,33 +302,75 @@ def run_jsonl_pipeline(
     print(f"work_id: {work_id}")
     print(f"html_dir: {html_dir}")
     print(f"pipeline_root: {pipeline_root}")
-
-    manifest_path = build_html_manifest(source_jsonl_path, html_dir, html_key)
-    print(f"manifest: {manifest_path}")
-
-    classify_config = Web2JsonConfig(
-        name="classify",
-        html_path=str(html_dir),
-        output_path=str(pipeline_root),
-        save=["report", "files"],
-    )
-    classify_result = classify_html_dir(classify_config)
+    if schema:
+        print(f"predefined schema fields: {list(schema.keys())}")
+
+    manifest_path = html_dir / "manifest.jsonl"
+    if skip_manifest:
+        if not manifest_path.is_file():
+            raise SystemExit(f"--skip-manifest requires existing {manifest_path}")
+        print(f"manifest (reuse): {manifest_path}")
+    else:
+        manifest_path = build_html_manifest(source_jsonl_path, html_dir, html_key)
+        print(f"manifest: {manifest_path}")
 
     clusters_dir = pipeline_root / "classify" / "clusters"
-    cluster_names = sorted(classify_result.clusters.keys())
+    classify_result = None
+
+    if skip_classify:
+        cluster_names = _discover_clusters_from_disk(clusters_dir)
+        if not cluster_names:
+            raise SystemExit(f"--skip-classify requires non-empty {clusters_dir}")
+        print(f"classify (reuse): {len(cluster_names)} clusters under {clusters_dir}")
+    else:
+        classify_config = Web2JsonConfig(
+            name="classify",
+            html_path=str(html_dir),
+            output_path=str(pipeline_root),
+            save=["report", "files"],
+        )
+        classify_result = classify_html_dir(classify_config)
+        cluster_names = sorted(classify_result.clusters.keys())
+
+    if only_clusters:
+        want = {x.strip() for x in only_clusters if x.strip()}
+        cluster_names = [c for c in cluster_names if c in want]
+        missing = want - set(cluster_names)
+        if missing:
+            raise SystemExit(f"--only-clusters not found on disk: {sorted(missing)}")
+        if not cluster_names:
+            raise SystemExit("--only-clusters filtered out all clusters")
+
+    if only_failed:
+        under = _underextracted_cluster_names(pipeline_root, clusters_dir)
+        under_set = set(under)
+        cluster_names = [c for c in cluster_names if c in under_set]
+        print(f"only-failed: 未跑满簇 {under} → 本次处理 {cluster_names}")
+        if not cluster_names:
+            print("当前没有需要补跑的 cluster（各簇 JSON 数量已不少于 HTML）。")
+            raise SystemExit(0)
+
     if cluster_limit:
         cluster_names = cluster_names[:cluster_limit]
 
     LLMClient.reset_usage()
+    reset_retry_stats()
     cluster_summaries: list[dict[str, Any]] = []
 
     for cluster_name in cluster_names:
         cluster_html_dir = clusters_dir / cluster_name
-        cluster_files = classify_result.clusters[cluster_name]
-        cluster_size = len(cluster_files)
+        if classify_result is not None:
+            cluster_files = classify_result.clusters[cluster_name]
+            cluster_size = len(cluster_files)
+        else:
+            cluster_files = sorted(cluster_html_dir.glob("*.html")) + sorted(
+                cluster_html_dir.glob("*.htm")
+            )
+            cluster_size = len(cluster_files)
         rounds = min(iteration_rounds, cluster_size)
 
         print(f"\n=== {cluster_name} ({cluster_size} files) ===")
+        cluster_t0 = time.perf_counter()
 
         before_schema = LLMClient.get_total_usage()
         schema_result = extract_schema(
@@ -208,6 +380,7 @@ def run_jsonl_pipeline(
                 output_path=str(pipeline_root),
                 iteration_rounds=rounds,
                 save=["schema"],
+                schema=schema,
             )
         )
         after_schema = LLMClient.get_total_usage()
@@ -220,6 +393,7 @@ def run_jsonl_pipeline(
                 output_path=str(pipeline_root),
                 schema=schema_result.final_schema,
                 save=["schema", "code"],
+                iteration_rounds=rounds,
             )
         )
         after_code = LLMClient.get_total_usage()
@@ -238,6 +412,7 @@ def run_jsonl_pipeline(
         cluster_summary = {
             "cluster_name": cluster_name,
             "cluster_size": cluster_size,
+            "elapsed_seconds": round(time.perf_counter() - cluster_t0, 3),
             "html_dir": str(cluster_html_dir),
             "schema_output": str(pipeline_root / f"{cluster_name}_schema"),
             "code_output": str(pipeline_root / f"{cluster_name}_code"),
@@ -252,20 +427,53 @@ def run_jsonl_pipeline(
         cluster_summaries.append(cluster_summary)
 
     total_usage = LLMClient.get_total_usage()
-    summary = {
+    summary_path = pipeline_root / "pipeline_summary.json"
+    run_elapsed = sum(c.get("elapsed_seconds", 0) for c in cluster_summaries if isinstance(c, dict))
+    cluster_count_total = len(_discover_clusters_from_disk(clusters_dir)) if clusters_dir.is_dir() else len(cluster_names)
+
+    summary: dict[str, Any] = {
         "source_jsonl": str(source_jsonl_path),
         "manifest": str(manifest_path),
         "html_dir": str(html_dir),
         "pipeline_root": str(pipeline_root),
-        "cluster_count": len(cluster_names),
+        "cluster_count": cluster_count_total,
         "clusters": cluster_summaries,
         "total_token_usage": total_usage,
+        "pipeline_elapsed_seconds": round(run_elapsed, 3),
+        "llm_retry_stats": get_retry_stats(),
     }
 
-    summary_path = pipeline_root / "pipeline_summary.json"
+    if merge_summary:
+        current_names = {c["cluster_name"] for c in cluster_summaries}
+        by_name: dict[str, Any] = {}
+        if summary_path.is_file():
+            try:
+                prev = json.loads(summary_path.read_text(encoding="utf-8"))
+            except json.JSONDecodeError:
+                prev = {}
+            prev_clusters = prev.get("clusters") or []
+            if isinstance(prev_clusters, list):
+                by_name = {
+                    c.get("cluster_name"): c
+                    for c in prev_clusters
+                    if isinstance(c, dict) and c.get("cluster_name")
+                }
+        else:
+            for c in _synthetic_clusters_from_completed_extract(pipeline_root, current_names):
+                by_name[c["cluster_name"]] = c
+        for c in cluster_summaries:
+            by_name[c["cluster_name"]] = c
+        if by_name:
+            summary["clusters"] = [by_name[k] for k in sorted(by_name.keys())]
+            summary["cluster_count"] = len(
+                _discover_clusters_from_disk(clusters_dir)
+            ) or len(summary["clusters"])
+        # total_token_usage 仅为本次运行累计（补跑时不会与历史相加，避免重复计算）
+
     summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
     print(f"\nsummary: {summary_path}")
     print(json.dumps(total_usage, ensure_ascii=False, indent=2))
+    print(json.dumps(get_retry_stats(), ensure_ascii=False, indent=2))
 
     return PipelineRunResult(
         source_jsonl=summary["source_jsonl"],
@@ -281,16 +489,63 @@ def run_jsonl_pipeline(
 
 def main() -> None:
     args = parse_args()
-    run_jsonl_pipeline(
-        source_jsonl=args.source_jsonl,
-        work_id=args.work_id,
-        input_root=args.input_root,
-        output_root=args.output_root,
-        html_key=args.html_key,
-        iteration_rounds=args.iteration_rounds,
-        cluster_limit=args.cluster_limit,
+    schema: Optional[dict[str, Any]] = None
+    if args.schema_json:
+        schema = load_schema_json(args.schema_json)
+    else:
+        schema = fields_to_schema(args.fields)
+
+    only = None
+    if args.only_clusters.strip():
+        only = [x.strip() for x in args.only_clusters.split(",") if x.strip()]
+
+    if args.skip_classify and not args.skip_manifest:
+        print("提示: --skip-classify 通常与 --skip-manifest 一起用，避免重复从大 jsonl 拆 HTML。")
+
+    extra_kw = dict(
+        skip_manifest=args.skip_manifest,
+        skip_classify=args.skip_classify,
+        only_clusters=only,
+        merge_summary=args.merge_summary,
+        only_failed=args.only_failed,
     )
 
+    if args.source_dir:
+        dir_path = Path(args.source_dir).expanduser()
+        if not dir_path.is_absolute():
+            dir_path = (PROJECT_ROOT / dir_path).resolve()
+        jsonl_files = sorted(dir_path.glob("*.jsonl"))
+        if args.max_jsonl_files:
+            jsonl_files = jsonl_files[: args.max_jsonl_files]
+        if not jsonl_files:
+            raise SystemExit(f"no *.jsonl under {dir_path}")
+        print(f"batch mode: {len(jsonl_files)} file(s) under {dir_path}")
+        for i, jp in enumerate(jsonl_files, 1):
+            print(f"\n{'='*60}\n[{i}/{len(jsonl_files)}] {jp.name}\n{'='*60}")
+            run_jsonl_pipeline(
+                source_jsonl=str(jp),
+                work_id="",
+                input_root=args.input_root,
+                output_root=args.output_root,
+                html_key=args.html_key,
+                iteration_rounds=args.iteration_rounds,
+                cluster_limit=args.cluster_limit,
+                schema=schema,
+                **extra_kw,
+            )
+    else:
+        run_jsonl_pipeline(
+            source_jsonl=args.source_jsonl,
+            work_id=args.work_id,
+            input_root=args.input_root,
+            output_root=args.output_root,
+            html_key=args.html_key,
+            iteration_rounds=args.iteration_rounds,
+            cluster_limit=args.cluster_limit,
+            schema=schema,
+            **extra_kw,
+        )
+
 
 if __name__ == "__main__":
     main()
diff --git a/web2json/agent/phases/schema_phase.py b/web2json/agent/phases/schema_phase.py
index 5deb008..7c0923e 100644
--- a/web2json/agent/phases/schema_phase.py
+++ b/web2json/agent/phases/schema_phase.py
@@ -149,6 +149,7 @@ def execute(self, html_files: List[str]) -> Dict[str, Any]:
 
         if not schema_results:
             logger.error("没有成功处理的Schema")
+            result["error"] = "没有成功处理的Schema"
             return result
 
         # ============ 构建轮次结果 ============
@@ -202,8 +203,10 @@ def execute(self, html_files: List[str]) -> Dict[str, Any]:
                     self.progress_callback("schema_merge", "Schema合并完成", 35)
 
             except Exception as e:
-                logger.error(f"合并多个Schema失败: {str(e)}")
+                err_text = str(e)
+                logger.error(f"合并多个Schema失败: {err_text}")
                 import traceback
                 logger.debug(traceback.format_exc())
+                result["error"] = f"Schema合并失败: {err_text}"
 
         return result
diff --git a/web2json/config/settings.py b/web2json/config/settings.py
index 82d22a3..5da41b4 100644
--- a/web2json/config/settings.py
+++ b/web2json/config/settings.py
@@ -19,6 +19,13 @@
         break
 
 
+def _env_optional_float(name: str) -> Optional[float]:  # read env var `name` as a float, or None when absent
+    raw = os.getenv(name)
+    if raw is None or str(raw).strip() == "":
+        return None  # unset or whitespace-only value means "not configured"
+    return float(raw)  # a non-numeric value raises ValueError here
+
+
 class Settings(BaseModel):
     """全局配置"""
 
@@ -28,6 +35,21 @@ class Settings(BaseModel):
     openai_api_key: str = Field(default_factory=lambda: os.getenv("OPENAI_API_KEY", ""))
     openai_api_base: str = Field(default_factory=lambda: os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"))
 
+    # LLM 请求：超时与失败重试（网关 502/503、超时、限流等）
+    llm_request_timeout: Optional[float] = Field(
+        default_factory=lambda: _env_optional_float("LLM_REQUEST_TIMEOUT")
+    )
+    # 最大尝试次数（含首次请求），例如 6 表示首次失败后最多再试 5 次
+    llm_api_retry_max_attempts: int = Field(
+        default_factory=lambda: int(os.getenv("LLM_API_RETRY_MAX_ATTEMPTS", "6"))
+    )
+    llm_api_retry_base_seconds: float = Field(
+        default_factory=lambda: float(os.getenv("LLM_API_RETRY_BASE_SECONDS", "1.0"))
+    )
+    llm_api_retry_max_seconds: float = Field(
+        default_factory=lambda: float(os.getenv("LLM_API_RETRY_MAX_SECONDS", "60.0"))
+    )
+
     # ============================================
     # 模型配置
     # ============================================
diff --git a/web2json/tools/schema_extraction.py b/web2json/tools/schema_extraction.py
index a4ff62e..ee6dafd 100644
--- a/web2json/tools/schema_extraction.py
+++ b/web2json/tools/schema_extraction.py
@@ -12,6 +12,7 @@
 from web2json.config.settings import settings
 from web2json.prompts.schema_extraction import SchemaExtractionPrompts
 from web2json.prompts.schema_merge import SchemaMergePrompts
+from web2json.utils.llm_retry import chat_openai_invoke_kwargs, invoke_with_retry
 
 
 def _parse_llm_response(response: str) -> Dict:
@@ -103,7 +104,8 @@ def extract_schema_from_html(html_content: str) -> Dict:
             model=settings.default_model,
             api_key=os.getenv("OPENAI_API_KEY"),
             base_url=os.getenv("OPENAI_API_BASE"),
-            temperature=0.1
+            temperature=0.1,
+            **chat_openai_invoke_kwargs(),
         )
 
         messages = [
@@ -111,7 +113,9 @@ def extract_schema_from_html(html_content: str) -> Dict:
             {"role": "user", "content": f"{prompt}\n\n## HTML内容\n\n```html\n{html_content[:50000]}\n```"}
         ]
 
-        response = model.invoke(messages)
+        response = invoke_with_retry(
+            "extract_schema_from_html", lambda: model.invoke(messages)
+        )
 
         # 3. 解析响应
         if hasattr(response, 'content'):
@@ -154,7 +158,8 @@ def merge_multiple_schemas(schemas: List[Dict]) -> Dict:
             model=settings.default_model,
             api_key=os.getenv("OPENAI_API_KEY"),
             base_url=os.getenv("OPENAI_API_BASE"),
-            temperature=0.1
+            temperature=0.1,
+            **chat_openai_invoke_kwargs(),
         )
 
         messages = [
@@ -162,7 +167,9 @@ def merge_multiple_schemas(schemas: List[Dict]) -> Dict:
             {"role": "user", "content": prompt}
         ]
 
-        response = model.invoke(messages)
+        response = invoke_with_retry(
+            "merge_multiple_schemas", lambda: model.invoke(messages)
+        )
 
         # 3. 解析响应
         if hasattr(response, 'content'):
@@ -223,7 +230,8 @@ def enrich_schema_with_xpath(schema_template: Dict, html_content: str) -> Dict:
             model=settings.default_model,
             api_key=os.getenv("OPENAI_API_KEY"),
             base_url=os.getenv("OPENAI_API_BASE"),
-            temperature=0.1
+            temperature=0.1,
+            **chat_openai_invoke_kwargs(),
         )
 
         messages = [
@@ -231,7 +239,9 @@ def enrich_schema_with_xpath(schema_template: Dict, html_content: str) -> Dict:
             {"role": "user", "content": user_message}
         ]
 
-        response = model.invoke(messages)
+        response = invoke_with_retry(
+            "enrich_schema_with_xpath", lambda: model.invoke(messages)
+        )
 
         # 4. 解析响应
         if hasattr(response, 'content'):
diff --git a/web2json/utils/llm_client.py b/web2json/utils/llm_client.py
index 46c62e7..4e2a3ed 100644
--- a/web2json/utils/llm_client.py
+++ b/web2json/utils/llm_client.py
@@ -11,6 +11,7 @@
 from langchain_openai import ChatOpenAI
 from loguru import logger
 from web2json.config.settings import settings
+from web2json.utils.llm_retry import chat_openai_invoke_kwargs, invoke_with_retry
 
 # 加载项目根目录的 .env 文件
 project_root = Path(__file__).parent.parent
@@ -96,12 +97,13 @@ def __init__(
             # 如果模型不在 tiktoken 的预设中，使用 cl100k_base 作为默认
             self.tokenizer = tiktoken.get_encoding("cl100k_base")
 
-        # 构建 ChatOpenAI 参数
+        # 构建 ChatOpenAI 参数（关闭 SDK 内置重试，由 chat_completion 统一退避）
         client_kwargs = {
             "model": self.model,
             "api_key": self.api_key,
             "base_url": self.api_base,
-            "temperature": self.temperature
+            "temperature": self.temperature,
+            **chat_openai_invoke_kwargs(),
         }
 
         # 如果启用了禁用思考模式选项，直接传递 extra_body 参数
@@ -241,8 +243,11 @@ def chat_completion(
             模型响应文本
         """
         try:
-            # 使用 LangChain 1.0 的 invoke 方法
-            response = self.client.invoke(messages)
+            # 使用 LangChain 1.0 的 invoke 方法（网关/超时等可重试）
+            response = invoke_with_retry(
+                "chat_completion",
+                lambda: self.client.invoke(messages),
+            )
             
             # 从响应中提取 token 使用情况
             if hasattr(response, 'response_metadata') and 'token_usage' in response.response_metadata:
diff --git a/web2json/utils/llm_retry.py b/web2json/utils/llm_retry.py
new file mode 100644
index 0000000..b17a5a0
--- /dev/null
+++ b/web2json/utils/llm_retry.py
@@ -0,0 +1,135 @@
+"""
+LLM 调用重试：网关故障、超时、限流等瞬时错误时使用指数退避重试。
+"""
+from __future__ import annotations
+
+import random
+import time
+from typing import Any, Callable, TypeVar
+
+from loguru import logger
+
+from web2json.config.settings import settings
+
+T = TypeVar("T")
+
+# 与 LLMClient 类似：进程内累计，便于 pipeline 汇总
+_retry_events: int = 0
+
+
+def reset_retry_stats() -> None:
+    """Zero the module-level retry counter; meant to be called before a new pipeline batch starts."""
+    global _retry_events
+    _retry_events = 0
+
+
+def get_retry_stats() -> dict[str, int]:
+    """Return how many LLM retries this process has performed (counted once before each backoff sleep)."""
+    return {"llm_retry_events": _retry_events}
+
+
+def is_retryable_api_error(exc: BaseException) -> bool:
+    """Return True if `exc` looks like a retryable transient API failure (not a business/auth error)."""
+    try:
+        from openai import (
+            APIConnectionError,
+            APITimeoutError,
+            InternalServerError,
+            RateLimitError,
+        )
+        from openai import APIStatusError
+
+        if isinstance(exc, (APIConnectionError, APITimeoutError, RateLimitError, InternalServerError)):
+            return True
+        if isinstance(exc, APIStatusError):
+            resp = getattr(exc, "response", None)
+            code = getattr(resp, "status_code", None) if resp is not None else None
+            if code is not None and code in (408, 429, 500, 502, 503, 504):
+                return True
+    except ImportError:  # openai is not importable: skip the SDK-specific checks
+        pass
+
+    try:
+        import httpx
+
+        if isinstance(
+            exc,
+            (
+                httpx.ConnectError,
+                httpx.ReadTimeout,
+                httpx.WriteTimeout,
+                httpx.ConnectTimeout,
+                httpx.PoolTimeout,
+            ),
+        ):
+            return True
+        if isinstance(exc, httpx.HTTPStatusError) and exc.response is not None:
+            if exc.response.status_code in (408, 429, 500, 502, 503, 504):
+                return True
+    except ImportError:  # httpx is not importable: skip the transport-level checks
+        pass
+
+    # Fallback: some exceptions are wrapped by LangChain or carry only a string message
+    msg = str(exc).lower()
+    hints = (  # NOTE(review): broad substrings (e.g. "connection", "502") may also match non-transient errors — confirm
+        "502",
+        "503",
+        "504",
+        "timeout",
+        "timed out",
+        "connection",
+        "temporarily unavailable",
+        "bad gateway",
+        "gateway timeout",
+        "rate limit",
+        "overloaded",
+    )
+    if any(h in msg for h in hints):  # plain substring match against the lowercased message
+        return True
+
+    return False
+
+
+def invoke_with_retry(
+    operation_label: str,
+    invoke_fn: Callable[[], T],
+) -> T:
+    """
+    Run a zero-argument call (typically model.invoke), backing off and retrying on retryable errors.
+
+    Args:
+        operation_label: Short label used in log messages.
+        invoke_fn: The actual call, e.g. lambda: model.invoke(messages).
+
+    Returns:
+        The return value of invoke_fn.
+    """
+    global _retry_events
+    max_attempts = max(1, settings.llm_api_retry_max_attempts)  # always try at least once
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            return invoke_fn()
+        except Exception as e:
+            if attempt >= max_attempts or not is_retryable_api_error(e):
+                raise  # last attempt or non-retryable error: propagate the original exception
+            delay = min(  # exponential backoff: base * 2^(attempt-1), capped at the configured maximum
+                settings.llm_api_retry_max_seconds,
+                settings.llm_api_retry_base_seconds * (2 ** (attempt - 1)),
+            )
+            jitter = random.uniform(0, max(delay * 0.1, 0.05))  # up to 10% of the delay, with a 0.05s floor on the range
+            sleep_s = min(delay + jitter, settings.llm_api_retry_max_seconds)  # jitter never pushes past the cap
+            _retry_events += 1  # counted before sleeping, so the stats reflect retries actually scheduled
+            logger.warning(
+                f"[{operation_label}] LLM 调用失败 ({attempt}/{max_attempts}): {e!s} — "
+                f"{sleep_s:.1f}s 后重试 (累计重试 #{_retry_events})"
+            )
+            time.sleep(sleep_s)
+
+
+def chat_openai_invoke_kwargs() -> dict[str, Any]:
+    """Build the shared ChatOpenAI kwargs: disable the SDK's built-in retries so invoke_with_retry handles backoff."""
+    out: dict[str, Any] = {"max_retries": 0}
+    if settings.llm_request_timeout is not None:  # only pass a timeout when one is configured
+        out["timeout"] = settings.llm_request_timeout
+    return out

From cbccf68f0ceb3504bff1ae541a40d16c7cfd5387 Mon Sep 17 00:00:00 2001
From: root <root@PJNL231040017.pjlab.org>
Date: Wed, 15 Apr 2026 14:36:12 +0800
Subject: [PATCH 4/5] docs: add ms-web-jwn full pipeline cleaning report

Made-with: Cursor
---
 ...05\346\264\227\346\212\245\345\221\212.md" | 219 ++++++++++++++++++
 1 file changed, 219 insertions(+)
 create mode 100644 "\351\251\254\346\235\245\350\245\277\344\272\232\350\257\255ms-web-jwn\345\205\250\351\207\217\346\270\205\346\264\227\346\212\245\345\221\212.md"

diff --git "a/\351\251\254\346\235\245\350\245\277\344\272\232\350\257\255ms-web-jwn\345\205\250\351\207\217\346\270\205\346\264\227\346\212\245\345\221\212.md" "b/\351\251\254\346\235\245\350\245\277\344\272\232\350\257\255ms-web-jwn\345\205\250\351\207\217\346\270\205\346\264\227\346\212\245\345\221\212.md"
new file mode 100644
index 0000000..34f4395
--- /dev/null
+++ "b/\351\251\254\346\235\245\350\245\277\344\272\232\350\257\255ms-web-jwn\345\205\250\351\207\217\346\270\205\346\264\227\346\212\245\345\221\212.md"
@@ -0,0 +1,219 @@
+# 马来西亚语 ms-web-jwn 全量清洗报告
+
+## 1. 任务范围
+
+本次处理对象是目录：
+
+- [Prod/ms-web-jwn](/home/luqing/Downloads/web2json-agent/Prod/ms-web-jwn)
+
+其中包含 **9** 份源 `jsonl`（按文件名排序）：
+
+1. `20260310094905_352_d5c97d3855d7e8c777f63fd07dfce918.jsonl`
+2. `20260311203119_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl`
+3. `20260312001751_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl`
+4. `20260312035605_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl`
+5. `20260312073301_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl`
+6. `20260312111019_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl`
+7. `20260312150026_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl`
+8. `20260312183025_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl`
+9. `20260312220228_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl`
+
+处理流程为：
+
+```text
+jsonl -> html + manifest -> classify_html_dir -> extract_schema -> infer_code -> extract_data_with_code
+```
+
+说明：
+
+- 当前 repo 没有原生 URL 分桶能力；本次完全按 **HTML 结构聚类** 与 **cluster 内 parser 生成** 处理。
+- Schema 采用 **Predefined** 模式，字段定义见项目根目录 [my_schema.json](/home/luqing/Downloads/web2json-agent/my_schema.json)：`title`、`content`、`author`、`date`（类型均为 `string`）。
+- 调度命令形态：`scripts/run_jsonl_web2json_pipeline.py --source-jsonl <文件> --schema-json my_schema.json`（部分批次带 `--merge-summary` 等补跑参数）。
+
+## 2. 产物目录
+
+每份 `jsonl` 对应一个流水线根目录（`work_id` 与文件名主名一致）：
+
+| 源 jsonl | 输出目录 |
+|-----------|----------|
+| `20260310094905_352_d5c97d3855d7e8c777f63fd07dfce918.jsonl` | [output/20260310094905_352_d5c97d3855d7e8c777f63fd07dfce918_pipeline](/home/luqing/Downloads/web2json-agent/output/20260310094905_352_d5c97d3855d7e8c777f63fd07dfce918_pipeline) |
+| `20260311203119_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260311203119_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260311203119_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) |
+| `20260312001751_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312001751_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312001751_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) |
+| `20260312035605_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312035605_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312035605_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) |
+| `20260312073301_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312073301_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312073301_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) |
+| `20260312111019_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312111019_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312111019_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) |
+| `20260312150026_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312150026_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312150026_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) |
+| `20260312183025_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312183025_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312183025_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) |
+| `20260312220228_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312220228_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312220228_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) |
+
+每份流水线下的核心汇总：
+
+- `output/<work_id>_pipeline/pipeline_summary.json`：token、各簇耗时、`llm_retry_stats`（若脚本版本已写入）等。
+- `input_html/<work_id>/manifest.jsonl`：与 `0001.html` 等 HTML 一一对应。
+
+说明：本批 **未** 在流水线目录中生成 `qa_summary.json` / `qa_summary_schema_auto.json`（与 ms-web-kln 当时跑质检的路径不同）；若需字段回溯 QA，可复用 kln 报告中的 `verify_extract_alignment.py` 思路单独跑（见 §8.3）。
+
+## 3. 规模与耗时
+
+### 3.1 源数据规模（汇总）
+
+- 源 `jsonl` 总大小：约 **11 GB**（`du -ch Prod/ms-web-jwn/*.jsonl`）
+- `manifest.jsonl` 总行数（有效 HTML 行数口径）：**76,360** 行
+- 各 `cluster_*_extract_data/result/*.json` 合计：**76,356** 个  
+  - 与 manifest 差 **4**：集中在首份小批量 `20260310094905_...`（1210 行 manifest，1206 份结果），多为源行无有效 `html` 字段被流水线跳过，属预期范围。
+
+### 3.2 各份 jsonl 行数与磁盘占用（约）
+
+| 源 jsonl | 行数（wc -l） | 单文件约 |
+|----------|---------------|----------|
+| `20260310094905_352_d5c97d3855d7e8c777f63fd07dfce918.jsonl` | 1,210 | 169 MB |
+| 其余 7 份（各 10k 行） | 10,000 × 7 | 约 1.3–1.4 GB/份 |
+| `20260312220228_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | 5,150 | 686 MB |
+
+### 3.3 聚类簇数（来自各 `pipeline_summary.json` 的 `cluster_count`）
+
+| 源 jsonl | cluster_count |
+|----------|-----------------|
+| `20260310094905_...` | 7 |
+| `20260311203119_...` | 7 |
+| `20260312001751_...` | 5 |
+| `20260312035605_...` | 6 |
+| `20260312073301_...` | 6 |
+| `20260312111019_...` | 6 |
+| `20260312150026_...` | 6 |
+| `20260312183025_...` | 5 |
+| `20260312220228_...` | 5 |
+| **合计** | **53**（九次流水线各自聚类簇数之和，非全局去重簇名） |
+
+### 3.4 流水线脚本侧耗时（`pipeline_elapsed_seconds`）
+
+九份 `pipeline_summary.json` 中，**前四份**较早跑次未写入 `pipeline_elapsed_seconds`（记为 `null`，汇总时按 **0**）；**后五份**有记录，其和为：
+
+- `379.412 + 1050.147 + 1009.324 + 833.803 + 1615.611` = **4,888.297 s**（约 **1 h 21 min**）
+
+与脚本 [aggregate_site_pipeline_stats.py](/home/luqing/Downloads/web2json-agent/scripts/aggregate_site_pipeline_stats.py) 字段 **`pipeline_elapsed_seconds_sum`** 一致。
+
+说明：九份任务多为 **顺序或错峰执行**，墙钟总时间以实际排期为准；上值为 **各次 pipeline 内已记录簇耗时之和**，用于与 token 同口径对比成本；未写字段的跑次不代表实际墙钟为 0。
+
+## 4. Token 消耗
+
+以下为九份 `pipeline_summary.json` 中 `total_token_usage` **相加**（与脚本 `aggregate_site_pipeline_stats.py` 一致）：
+
+| 指标 | 数值 |
+|------|------|
+| 请求次数 `request_count` | 146 |
+| 输入 tokens | 1,816,722 |
+| 输出 tokens | 481,691 |
+| 合计 tokens | 2,298,413 |
+
+各份明细（摘自各 `pipeline_summary.json`）：
+
+| 源 jsonl | input | output | total |
+|----------|-------|--------|-------|
+| `20260310094905_...` | 290,156 | 60,120 | 350,276 |
+| `20260311203119_...` | 329,881 | 67,892 | 397,773 |
+| `20260312001751_...` | 176,182 | 44,655 | 220,837 |
+| `20260312035605_...` | 208,499 | 65,357 | 273,856 |
+| `20260312073301_...` | 70,853 | 21,271 | 92,124 |
+| `20260312111019_...` | 208,477 | 63,485 | 271,962 |
+| `20260312150026_...` | 206,996 | 59,579 | 266,575 |
+| `20260312183025_...` | 160,467 | 47,037 | 207,504 |
+| `20260312220228_...` | 165,211 | 52,295 | 217,506 |
+
+LLM 可重试失败次数（`llm_retry_stats.llm_retry_events` 之和）：**0**（本批汇总为 0）。
+
+## 5. 钱的换算（粗估）
+
+按 **Claude Sonnet 4.5** 公开价粗估（与 ms-web-kln 报告口径一致，仅作量级参考）：
+
+- 输入：`$3 / 1M tokens`
+- 输出：`$15 / 1M tokens`
+
+则：
+
+- 输入成本：`1.816722 × 3 ≈ USD 5.45`
+- 输出成本：`0.481691 × 15 ≈ USD 7.23`
+- **合计：约 USD 12.68**
+
+说明：实际计费若走内部兼容网关，可能与公开价不一致；此处仅供成本量级对比。
+
+## 6. 解析结果
+
+### 6.1 磁盘口径（推荐）
+
+- `manifest` 总行数：**76,360**
+- `cluster_*_extract_data/result/*.json` 总数：**76,356**
+- 差 **4**：见 **§3.1**，视为无有效 HTML 的跳过行。
+
+按「是否产出结果 JSON 文件」：
+
+- 成功率：**76,356 / 76,360 ≈ 99.995%**
+
+### 6.2 `pipeline_summary.json` 中的 `parse_success_count` 之和
+
+部分 summary 在 **补跑合并** 场景下只包含部分簇条目，**`parse_success_count` 加总可能小于磁盘真实结果数**（例如 `20260312073301_...` 的 summary 曾仅合并部分 cluster）。**验收与对账请以 §6.1 磁盘计数为准**。
+
+### 6.3 与 ms-web-kln 的差异
+
+- kln 报告含 **QA 双口径**（宽松 / 严格字段回溯）；本批 **ms-web-jwn 未跑同套 qa_summary**，若要对齐 kln 的可信度分析，需另行执行字段对齐脚本并落盘报告。
+
+## 7. 结论与后续建议
+
+**结论（流程是否跑通）：**
+
+- 9 份 `jsonl` 均在 `output/<work_id>_pipeline/` 下具备完整目录结构及 `pipeline_summary.json`。
+- 磁盘结果 JSON 与 manifest 对齐情况见 **§6.1**，整体可视为 **全量清洗流程已跑通**。
+
+**建议（可选）：**
+
+1. **质检**：参考 [马来西亚语ms-web-kln全量清洗报告.md](/home/luqing/Downloads/web2json-agent/马来西亚语ms-web-kln全量清洗报告.md) 第 7–8 节，对抽样或全量跑 `verify_extract_alignment.py`，区分宽松 / 严格回溯口径。
+2. **summary 合并**：若需单份 `pipeline_summary.json` 完整列出全部簇统计，可在代码已支持 `--merge-summary` 的前提下补跑一次仅写 summary 的流程，或手工合并 JSON。
+3. **Schema**：若某簇 `content` 过长导致 QA 假阴性居多，可考虑拆字段或收紧 predefined schema（与 kln 报告 §10 思路一致）。
+
+## 8. 可复用脚本
+
+### 8.1 总调度脚本
+
+- [run_jsonl_web2json_pipeline.py](/home/luqing/Downloads/web2json-agent/scripts/run_jsonl_web2json_pipeline.py)
+
+作用简述：拆 `jsonl` → `manifest` → 聚类 → `extract_schema` → `infer_code` → `extract_data_with_code`，并写 `pipeline_summary.json`（含 token、耗时、`llm_retry_stats` 等，视脚本版本）。
+
+单文件示例：
+
+```bash
+cd /home/luqing/Downloads/web2json-agent
+.venv/bin/python scripts/run_jsonl_web2json_pipeline.py \
+  --source-jsonl Prod/ms-web-jwn/你的文件.jsonl \
+  --schema-json my_schema.json
+```
+
+目录批量（按文件名排序依次处理目录下所有 `*.jsonl`）：
+
+```bash
+.venv/bin/python scripts/run_jsonl_web2json_pipeline.py --source-dir Prod/ms-web-jwn \
+  --schema-json my_schema.json
+```
+
+补跑未跑满簇、合并 summary 等参数见脚本 `--help`。
+
+### 8.2 站点级 token / 耗时汇总
+
+- [aggregate_site_pipeline_stats.py](/home/luqing/Downloads/web2json-agent/scripts/aggregate_site_pipeline_stats.py)
+
+用法示例：
+
+```bash
+cd /home/luqing/Downloads/web2json-agent
+.venv/bin/python scripts/aggregate_site_pipeline_stats.py Prod/ms-web-jwn
+```
+
+全部 jsonl 均有 summary 且无缺失时退出码为 **0**；需同时校验「每簇 HTML 数 = result JSON 数」时加 **`--strict`**。
+
+### 8.3 其它
+
+- **LLM 重试**：`web2json/utils/llm_retry.py` + 环境变量 `LLM_API_RETRY_*` / `LLM_REQUEST_TIMEOUT`（见 `web2json/config/settings.py`）。
+- **字段回溯 QA**：仍可采用 kln 报告 **§11.3** 中的 `verify_extract_alignment.py` 工作流（若仓库中已存在该脚本）。
+
+---
+
+*文档生成说明：规模、token、耗时、簇数等来自当前工作区 `Prod/ms-web-jwn` 与 `output/*_pipeline/pipeline_summary.json` 及磁盘计数；若你迁移目录或重跑流水线，请重新执行 `aggregate_site_pipeline_stats.py` 更新数字。*

From ccd9f6d65d5db227020bff4b6a76a9e6296bc043 Mon Sep 17 00:00:00 2001
From: root <root@PJNL231040017.pjlab.org>
Date: Fri, 17 Apr 2026 14:25:24 +0800
Subject: [PATCH 5/5] feat(crawl-jsonl): merge-dir layout cluster,
 cluster_list, slices; docs for ms-web-mma

- Add classify_crawl_jsonl_dir, crawl_jsonl helpers (split, manifest-friendly rows)
- Slice rows use layout_cluster_id / crawl_source_name / crawl_line_no (no _w2j)
- Export APIs from web2json.__init__
- Add ms-web-mma flow doc and Jupyter Spark checklist

Made-with: Cursor
---
 ...6\236\346\223\215\347\211\210checklist.md" | 411 +++++++++++++++++
 ...01\347\250\213\350\257\264\346\230\216.md" | 272 ++++++++++++
 web2json/__init__.py                          |   4 +
 web2json/simple.py                            | 420 ++++++++++++++++--
 web2json/tools/crawl_jsonl.py                 | 217 +++++++++
 5 files changed, 1296 insertions(+), 28 deletions(-)
 create mode 100644 "ms-web-mma_Jupyter\345\256\236\346\223\215\347\211\210checklist.md"
 create mode 100644 "ms-web-mma_\350\201\232\347\261\273\344\270\216schema\345\233\236\345\241\253\346\265\201\347\250\213\350\257\264\346\230\216.md"
 create mode 100644 web2json/tools/crawl_jsonl.py

diff --git "a/ms-web-mma_Jupyter\345\256\236\346\223\215\347\211\210checklist.md" "b/ms-web-mma_Jupyter\345\256\236\346\223\215\347\211\210checklist.md"
new file mode 100644
index 0000000..db13061
--- /dev/null
+++ "b/ms-web-mma_Jupyter\345\256\236\346\223\215\347\211\210checklist.md"
@@ -0,0 +1,411 @@
+# ms-web-mma：Jupyter 实操版 Checklist（Spark 清洗作业）
+
+本文把 [ms-web-mma_聚类与schema回填流程说明.md](ms-web-mma_聚类与schema回填流程说明.md) 中的目标流程，展开成一份可在 `http://jupyter.bigdata.shlab.tech/` 上执行的实操 checklist。
+
+适用目标：
+
+- 利用 **web2json** 对 `ms-web-mma` 做统一聚类
+- 按簇抽取 schema / parser / data
+- 再回填成发布向 JSONL
+
+---
+
+## 0. 执行原则
+
+建议先跑 **主文件**：
+
+- `20260310094859_353_79bda33fa180eedac40d37876224609d.jsonl`
+
+跑通主文件后，再放大到整个目录。原因很简单：
+
+- 主文件有 191 行，是主要数据量
+- 其余 5 个文件都只有 1 行，容易因为聚类噪声而影响你对流程状态的判断
+
+---
+
+## 1. 准备环境
+
+### 输入
+
+- Jupyter 环境：`http://jupyter.bigdata.shlab.tech/`
+- 本地代码目录：
+  [web2json-agent](./)
+- 流程说明：
+  [ms-web-mma_聚类与schema回填流程说明.md](ms-web-mma_聚类与schema回填流程说明.md)
+- 数据目录：
+  [Prod/ms-web-mma](Prod/ms-web-mma)
+- 路径约定：
+  `Prod/ms-web-mma/s3Path.txt`
+
+### 要做什么
+
+1. 打开 notebook，确认 Python 环境可用。
+2. 确认能访问 `web2json-agent` 项目目录。
+3. 确认能读取 `Prod/ms-web-mma/*.jsonl`。
+4. 读取 `s3Path.txt`，确认输入与输出前缀。
+5. 明确本次跑的是：
+   - 仅主文件
+   - 还是整个目录
+
+### 产出
+
+- 一份确认过的任务参数清单，例如：
+
+```text
+project_root=/home/luqing/Downloads/v2/web2json-agent
+source_dir=/home/luqing/Downloads/v2/web2json-agent/Prod/ms-web-mma
+source_files=[20260310094859_353_79bda33fa180eedac40d37876224609d.jsonl]
+output_root=s3://xyz2-process-hdd1/.../nlp/ms-web-mma/v0001
+```
+
+### 要验什么
+
+- 代码目录能读
+- 源文件能读
+- S3 路径约定明确
+- 本次范围已经锁定，不临时变化
+
+---
+
+## 2. 启动 Spark Session
+
+### 输入
+
+- notebook 环境
+- 项目根目录
+- 任务名，例如：
+  `ms-web-mma-cluster-v0001`
+
+### 要做什么
+
+1. 在 notebook 中初始化 Spark Session。
+2. 配置应用名、必要的 executor / memory / shuffle 参数。
+3. 将 `web2json-agent` 项目目录加入 Python path。
+4. 用一个极小样本做试读。
+
+### 产出
+
+- 一个可用的 SparkSession
+- 一次试读结果，确认 JSON 行结构正常
+
+### 要验什么
+
+- Spark 能成功启动
+- 能成功 `read jsonl`
+- 抽样记录中至少存在：
+  - `html`
+  - `track_id` 或可替代唯一键
+  - `url`
+
+---
+
+## 3. 生成全量索引表
+
+### 输入
+
+- 目标源文件列表
+- SparkSession
+
+### 要做什么
+
+对所有目标 JSONL 行生成统一索引。建议每一行至少补齐这些字段：
+
+| 字段 | 说明 |
+|---|---|
+| `global_index` | 全局唯一顺序号 |
+| `source_jsonl` | 源文件路径 |
+| `source_name` | 源文件名 |
+| `line_no` | 文件内行号 |
+| `record_id` | 优先用 `track_id`，缺失时用 `line_{line_no}` 兜底 |
+| `html` | 原始 HTML |
+| `url` | 原始 URL |
+
+建议将这一步的结果单独落盘，作为后面所有 join 的主键底表。
+
+### 产出
+
+- 一份“全量索引表”
+- 可选落盘：
+  - `output/ms-web-mma/v001/index/all_rows_with_index.jsonl`
+
+### 要验什么
+
+- 行数是否等于输入总行数
+- `global_index` 是否唯一
+- `(source_jsonl, line_no)` 是否唯一
+- 每行是否有可用 `record_id`
+
+---
+
+## 4. 跑统一布局聚类
+
+### 输入
+
+- 全量索引表
+- 每行的 `html`
+
+### 要做什么
+
+1. 对每行 `html` 调 web2json 的布局特征逻辑。
+2. 对全体样本只做一次全局布局聚类。
+3. 为每条记录生成：
+   - `layout_cluster_id`
+4. 单独标出：
+   - `noise` 或 `-1`
+
+### 产出
+
+- `cluster_list/cluster_list.jsonl`
+- 可选：
+  - `cluster_list/cluster_info.txt`
+
+建议 `cluster_list.jsonl` 包含：
+
+| 字段 | 说明 |
+|---|---|
+| `global_index` | 全局索引 |
+| `layout_cluster_id` | 聚类标签 |
+| `source_jsonl` | 源路径 |
+| `source_name` | 源文件名 |
+| `line_no` | 原始行号 |
+| `record_id` | 唯一标识 |
+
+### 要验什么
+
+- `cluster_list` 总行数是否等于输入总行数
+- 是否存在异常大量 `noise`
+- `layout_cluster_id` 是否有稳定分布
+- `cluster_info.txt` 能否帮助人工快速判断簇是否合理
+
+---
+
+## 5. 按簇切片写 JSONL
+
+### 输入
+
+- 全量索引表
+- `cluster_list.jsonl`
+
+### 要做什么
+
+1. 根据 `layout_cluster_id` 把原始行切到不同簇目录。
+2. 每个簇形成一个或多个切片 JSONL。
+3. `noise` 单独处理。
+
+建议目录：
+
+```text
+output/ms-web-mma/v001/cluster_list/format_clusters/
+  cluster_0/
+  cluster_1/
+  ...
+  noise/
+```
+
+切片行里建议保留：
+
+- 原始 crawl 字段
+- `global_index`
+- `layout_cluster_id`
+- `source_name`
+- `line_no`
+- `record_id`
+
+### 产出
+
+- `cluster_k/*.jsonl.gz`
+- `noise/*.jsonl.gz`
+
+### 要验什么
+
+- 所有簇切片行数总和是否等于输入总行数
+- 同一 `global_index` 是否只出现在一个簇里
+- `noise` 是否单独隔离成功
+
+---
+
+## 6. 对每个簇独立跑抽取
+
+### 输入
+
+- `cluster_k/*.jsonl.gz`
+
+### 要做什么
+
+对每个有效簇依次执行：
+
+1. `extract_schema`
+2. `infer_code`
+3. `extract_data_with_code`
+
+每个簇都要形成独立产物目录。
+
+建议每簇至少保留：
+
+| 文件 | 作用 |
+|---|---|
+| `schema.json` | 最终 schema |
+| `final_parser.py` | 最终 parser |
+| `result/*.json` | 每页抽取结果 |
+| `cluster_k_extract_manifest.jsonl` | 回填主索引 |
+
+### 产出
+
+- 每个簇的抽取产物目录
+- `cluster_k_extract_manifest.jsonl`
+
+### 要验什么
+
+- 每个簇都至少有 schema 和 parser
+- `result/*.json` 数量和 manifest 对齐
+- manifest 中能唯一回指 `global_index` 或 `(source_name, line_no)`
+- 对于失败页，是否有 `parse_ok=false` 或等价状态
+
+---
+
+## 7. 回填原始 JSONL
+
+### 输入
+
+- 原始索引表
+- `cluster_list.jsonl`
+- 每簇 manifest
+- 每簇 `result/*.json`
+
+### 要做什么
+
+基于 join 逻辑把抽取结果写回原始行：
+
+1. 以 `global_index` 或 `(source_jsonl, line_no)` 做主键对齐。
+2. 在原始对象基础上增量合并抽取字段：
+   - `content`
+   - `title`
+   - `author`
+   - `publish_time`
+   - 其他 schema 定义字段
+3. 把 schema / xpath 元信息写到：
+   - `remark.extract_schema`
+4. 生成：
+   - `track_loc`
+   - `doc_loc`
+
+注意：
+
+- 不要新增 `_w2j` 顶层字段
+- 原始 crawl 字段尽量保留
+
+### 产出
+
+- 回填后的“发布向 JSONL”
+
+### 要验什么
+
+- 回填后总行数是否与输入一致
+- 未抽取成功的行是否仍然可追踪
+- `remark.extract_schema` 是否落对
+- `track_loc` / `doc_loc` 是否符合文档约定
+
+---
+
+## 8. 写发布结果
+
+### 输入
+
+- 回填后的结果 DataFrame / JSONL
+- 输出前缀：`xyz2-process-hdd1/.../nlp/ms-web-mma/v0001`
+
+### 要做什么
+
+1. 按最终发布规范写出 `.jsonl.gz`
+2. 目录中保留：
+   - 发布文件
+   - pipeline 产物
+   - 可追溯索引
+
+### 产出
+
+- 发布路径下的最终 `.jsonl.gz`
+
+### 要验什么
+
+- 能按 `doc_loc` 找到对应发布文件
+- gzip 可解压
+- 单行 JSON 格式合法
+- 样本抽查字段完整
+
+---
+
+## 9. 最小验收清单
+
+在整个 notebook 流程结束后，至少要核对下面这些：
+
+| 项 | 验收问题 |
+|---|---|
+| 输入行数 | 是否与原始输入一致 |
+| `cluster_list` 行数 | 是否与输入一致 |
+| 切片总行数 | 是否与输入一致 |
+| manifest 对齐 | 是否能唯一回填到原始行 |
+| 回填后行数 | 是否与输入一致 |
+| `remark.extract_schema` | 是否存在且结构合理 |
+| `track_loc` | 是否能回指原始 source |
+| `doc_loc` | 是否能回指发布文件 |
+| `noise` | 是否单独标记处理 |
+
+---
+
+## 10. 推荐 notebook 拆分
+
+建议不要把所有逻辑塞进一个 notebook，最好拆成 4 本：
+
+### 10.1 `01_build_index_and_classify.ipynb`
+
+负责：
+
+- 读源数据
+- 建立索引
+- 跑统一聚类
+- 写 `cluster_list.jsonl`
+
+### 10.2 `02_split_by_cluster.ipynb`
+
+负责：
+
+- 按簇切片
+- 写 `cluster_k/*.jsonl.gz`
+
+### 10.3 `03_extract_per_cluster.ipynb`
+
+负责：
+
+- 对每个 `cluster_k` 跑
+  - `extract_schema`
+  - `infer_code`
+  - `extract_data_with_code`
+- 写 manifest
+
+### 10.4 `04_merge_backfill_and_publish.ipynb`
+
+负责：
+
+- join 回填
+- 写最终发布 JSONL
+- 做最终验收
+
+---
+
+## 11. 建议执行顺序
+
+最稳妥的顺序是：
+
+1. 先只跑主文件 `20260310094859_...jsonl`
+2. 跑通 `索引 -> 聚类 -> 切片 -> 单簇抽取 -> 回填`
+3. 固化产物格式
+4. 再扩大到整目录
+
+---
+
+## 12. 一句话版本
+
+```text
+先统一聚类，再按簇抽取，最后按索引回填原始行并发布。
+```
diff --git "a/ms-web-mma_\350\201\232\347\261\273\344\270\216schema\345\233\236\345\241\253\346\265\201\347\250\213\350\257\264\346\230\216.md" "b/ms-web-mma_\350\201\232\347\261\273\344\270\216schema\345\233\236\345\241\253\346\265\201\347\250\213\350\257\264\346\230\216.md"
new file mode 100644
index 0000000..6876976
--- /dev/null
+++ "b/ms-web-mma_\350\201\232\347\261\273\344\270\216schema\345\233\236\345\241\253\346\265\201\347\250\213\350\257\264\346\230\216.md"
@@ -0,0 +1,272 @@
+# ms-web-mma：聚类 → 按簇抽取 → 回填原始 JSONL（流程说明稿）
+
+> 本文档描述针对 `web2json-agent/Prod/ms-web-mma/` 下**全部 crawl JSONL** 的**目标流程**与**数据契约**，便于评审；**暂不涉及具体代码改动**。
+
+---
+
+## 1. 背景与目标
+
+**一句话目标**：对 `ms-web-mma` 全目录 crawl 数据**只做一次**布局聚类；再**按簇**分别跑 schema / parser / 抽取；最后用 **`cluster_list.jsonl` + manifest** 把结果**对齐回填**到原始行，生成带 **`remark`、`track_loc`、`doc_loc`** 的发布向 JSONL（不在行内增加 `_w2j` 类字段）。
+
+- **输入**：`Prod/ms-web-mma/` 目录（可递归）下所有 `*.jsonl`，每行一条 JSON，至少包含 **`html`** 字段（及 `track_id` 等溯源字段）。
+- **目标**：
+  1. 对所有行的 `html` **统一做一次**布局聚类（不按单个 jsonl 文件串行跑完整流水线）。
+  2. 将数据**按簇切片**成多份 JSONL，**每个簇**分别执行 `extract_schema` → `infer_code` → `extract_data`（或 `extract_data_with_code`）。
+  3. 将每簇抽取得到的**结构化结果**（如 `content`、`title` 等）以及 **schema / xpath 元信息**（如 `remark.extract_schema`）**回填**到**原始行**对应位置，产出**新的 JSONL**（行数与溯源与原始对齐或可追踪）。
+
+---
+
+## 2. 产物目录建议：`cluster_list/` 文件夹
+
+单文件聚类会写出 `cluster_list.jsonl`；全目录合并聚类建议统一到 **`output/ms-web-mma/v001/`**（版本号可换）下，结构示例：
+
+```text
+output/ms-web-mma/v001/
+├── jsonl/                                    # 可选：发布或中间 gzip 与原始命名对齐
+│   ├── 20260310....jsonl.gz
+│   └── ...
+└── cluster_list/
+    ├── cluster_info.txt                      # 人类可读摘要（可选）
+    ├── cluster_list.jsonl                    # 全局索引：global_index ↔ 源文件/行号/簇 id
+    └── format_clusters/                    # 按簇切分后的输入（名称可改为 slices/）
+        ├── cluster_0/
+        │   └── <stem>_cluster_0.jsonl.gz     # 仅含 cluster_0 的原始行（可选附加调试字段）
+        ├── cluster_1/
+        │   └── ...
+        └── noise/
+            └── <stem>_noise.jsonl.gz
+```
+
+说明：
+
+- **`cluster_list/cluster_list.jsonl`**：回填时的**主索引**（见 §4.2）。
+- **`format_clusters/cluster_k/`**（或扁平命名）：每个簇一份切片，供该簇 `extract_schema` 起全流程使用；**簇编号建议与 `layout_cluster_id` 一致**（`cluster_0` 而非 `cluster_01`，除非团队另有约定）。
+- 文件名中的 **`<stem>`** 可与源 `jsonl` 主名一致，便于对照（多源合并时也可用 `union` 等统一前缀）。
+
+---
+
+## 3. 流程总览（四段）
+
+```text
+[1] 合并聚类（全目录 jsonl）
+        ↓
+[2] 写出 cluster_list/ + 按簇切片 jsonl
+        ↓
+[3] 对每个 cluster_k 独立跑：extract_schema → infer_code → extract_data(_with_code)
+        ↓
+[4] 合并回填：按 cluster_list + manifest 对齐，写「新 jsonl」
+    - 保留原 crawl 字段；新增/覆盖抽取字段；写入 remark、track_loc、doc_loc
+```
+
+### 3.1 回填后单行结构示例（目标形态）
+
+以下 **S3 路径为 ms-web-jwn 历史示例**，仅说明字段关系；**ms-web-mma 落地时请替换为实际桶与前缀**。
+
+```jsonc
+{
+  "track_id": "2201bfce-a0ef-4ca8-90e4-7e34172c9395",
+  "url": "https://www.heritage.gov.my/index.php/tapak-warisan",
+  "content": "# Tapak Warisan\n\npage_title: Tapak Warisan\ntotal_records: Display #",
+  "html": "<!DOCTYPE html>...",
+  "content_bytes": 67,
+  "remark": {
+    "extract_schema": {
+      "title": {
+        "type": "string",
+        "description": "文章标题",
+        "value_sample": "Rindukan Muzium. Berita baik untuk anda!",
+        "xpaths": ["//h1[@class='page-title kl-blog-post-title entry-title']/text()"]
+      },
+      "author": {
+        "type": "string",
+        "description": "作者姓名",
+        "value_sample": "defaultweb",
+        "xpaths": ["//span[@class='itemAuthor ...']/text()"]
+      },
+      "publish_time": {
+        "type": "string",
+        "description": "发布时间",
+        "value_sample": "Rabu, 29 September 2021",
+        "xpaths": ["//span[@class='itemDateCreated ...']/text()"]
+      },
+      "content": {
+        "type": "string",
+        "description": "文章正文内容（完整文本）",
+        "value_sample": "Muzium Sejarah ...",
+        "xpaths": ["//div[@class='itemBody ...']/p//text()"]
+      },
+      "content_paragraphs": {
+        "type": "array",
+        "description": "文章段落列表",
+        "value_sample": ["..."],
+        "xpaths": ["//div[@class='itemBody ...']/p"]
+      },
+      "content_list_items": {
+        "type": "array",
+        "description": "文章中的列表项",
+        "value_sample": ["...", "..."],
+        "xpaths": ["//div[@class='itemBody ...']//ul/li"]
+      }
+    }
+  },
+  "track_loc": [
+    "s3://.../prod/ms-web-mma/jsonl/<源>.jsonl?bytes=<offset>,<length>" //  input path defined in web2json-agent/Prod/ms-web-mma/s3Path.txt
+  ],
+  "doc_loc": "s3://.../nlp/ms-web-mma/v001/ms/<发布文件>.jsonl.gz?bytes=0,0" //  output path defined in web2json-agent/Prod/ms-web-mma/s3Path.txt
+}
+```
+
+---
+
+## 4. 各阶段说明
+
+### 4.1 阶段一：统一 classify（合并聚类）
+
+- **输入**：`Prod/ms-web-mma/**/*.jsonl`（或顶层仅 jsonl 的目录）。
+- **行为**：与现有 `classify_crawl_jsonl_dir` 一致：所有行的 `html` 进入**同一套**布局聚类，得到 `layout_cluster_id ∈ {0,1,…} ∪ {-1}`（噪声）。
+- **不要求**在此阶段跑 `extract_schema`。
+
+### 4.2 阶段二：切片与 `cluster_list`
+
+- **`cluster_list.jsonl`**（每行一条建议字段）：
+
+| 字段 | 含义 |
+|------|------|
+| `global_index` | 全合并后的从 0 开始的序号（或从 1，实现需统一） |
+| `layout_cluster_id` | 聚类标签，`-1` 表示噪声 / 无法聚类 |
+| `source_jsonl` | 原始文件绝对路径或可解析的相对路径 |
+| `source_name` | 文件名 |
+| `line_no` | 在该 jsonl 文件内的行号（从 1 计） |
+| `record_id` | 业务主键，如 `track_id`；缺失时可用 `line_{line_no}` |
+
+- **切片文件**：每个 `cluster_k` 一个 jsonl，每行仍是**原始 crawl 行**（JSON 对象），如需调试可在切片阶段可选附加与 `cluster_list` 一致的元信息副本（例如簇 id、源文件名、行号），**不要求**使用 `_w2j` 前缀字段。
+
+**噪声** `noise` 可单独一个 `*_noise.jsonl`；是否对该文件也跑抽取由产品决定（默认可跳过或只跑兜底 parser）。
+
+### 4.3 阶段三：按簇抽取（现有 web2json API）
+
+对每个 `cluster_k` 的切片：
+
+1. **`extract_schema`**：得到 `final_schema.json`（字段定义、xpath、`value_sample` 等）。
+2. **`infer_code`**：生成 `final_parser.py`。
+3. **`extract_data` / `extract_data_with_code`**：对该簇**全部行**解析，得到多份 `result/*.json` 或与行一一对应的结构化结果。
+
+**注意**：簇内解析结果的「键」必须与回填时能**对应回** `cluster_list` 中的某一行（见 §5.2）。
+
+### 4.4 阶段四：回填组成新 JSONL
+
+**目标**：在**不丢失原始 crawl 字段**的前提下，把本簇抽取结果写回「原始行」的扩展形态。
+
+---
+
+## 5. 回填数据契约（建议）
+
+### 5.1 新 JSONL 每行建议结构（逻辑）
+
+在原始对象基础上**增量合并**（字段名可评审后固定）：
+
+- **抽取正文等**：如 `content`、`title`、`author`、`publish_time` 等——与 `final_schema` / parser 输出一致。
+- **schema / xpath 溯源**：放入 **`remark.extract_schema`**（结构见 §3.1），与顶层抽取字段区分，避免与 crawl 原字段无意义冲突。
+
+说明：
+
+- **`remark.extract_schema`**：存放**该簇最终 merged schema**（每字段含 `type` / `description` / `value_sample` / `xpaths`）；若体量大，可改为只存 **schema 的 S3/本地路径** + **hash**，正文仍保留顶层抽取值。
+- **非空策略**：仅当某字段在 parser 结果中非空（或满足 QA 规则）时写入顶层；否则可省略或显式 `null`（需统一约定）。
+- **`track_loc` / `doc_loc`**：与现有发布规范对齐（见《多语种网站清洗计划》§4.4）；`doc_loc` 指向 gzip 发布产物时需带 `?bytes=`。流水线产物路径如需落盘，可放在 `remark` 或单独 manifest，**不**在行顶层增加 `_w2j`。
+
+### 5.2 对齐键（回填时如何「找到原始行」）
+
+必须能唯一对应：
+
+- **主键**：`(source_jsonl, line_no)` 或 `record_id`（全局唯一时）。
+- **簇内解析顺序**：若 parser 按切片文件顺序输出，需与 `global_index` 或 `(source_name, line_no)` 建立**显式映射**（建议在抽取阶段输出 **`manifest.jsonl`**：一行对应 `global_index` → 解析结果或 `result_xxx.json`）。
+
+推荐在阶段三结束时为每个 cluster 产出一个 **`cluster_k_extract_manifest.jsonl`**：
+
+```json
+{"global_index": 123, "source_name": "part-000.jsonl", "line_no": 45, "record_id": "uuid", "parse_ok": true}
+```
+
+回填脚本依赖：**`cluster_list.jsonl` + manifest + 每行 parse 结果**（三者 join）。
+
+### 5.3 工程与分布式（评审点）
+
+- **Spark / 分区**：聚类前可分区读 jsonl；**回填**建议按 **原始 `source_name` 或分区键** 写回，避免单 task 超大 shuffle。
+- **幂等**：同一 `global_index` 重复跑应可覆盖或带 `schema_version`。
+- **与 Spark 的分工**：四段流程在集群上的拆分见 **§5.4**；其中布局侧须与 **web2json** 现网逻辑一致（`get_feature` → `fuse_features` → `cluster_html_layouts_optimized` 等）。
+
+### 5.4 Spark 阶段划分草图（对齐 web2json 逻辑）
+
+以下与 §3「四段流程」一一对应；**布局聚类与单机 `classify_crawl_jsonl` / `classify_crawl_jsonl_dir`（`web2json.simple`）同源**，Spark 只负责可并行 I/O 与 join，**不另写一套特征或聚类定义**。
+
+**web2json 侧关键符号（须保持一致）**
+
+| 名称 | 位置 | 作用 |
+|------|------|------|
+| `get_feature` | `web2json.tools.html_layout_cosin.get_feature` | 输入 HTML 字符串，输出**布局特征 dict**（`tags` / `attrs` 等层级结构）。 |
+| `fuse_features` | `web2json.tools.html_layout_cosin.fuse_features`（由 `cluster` 内部调用） | 在统一 `layer_n`、`k` 下将多页 dict **融合为稠密向量**，供余弦相似度与 DBSCAN。 |
+| `cluster_html_layouts_optimized` | `web2json.tools.cluster.cluster_html_layouts_optimized` | 与 `simple._execute_crawl_layout_cluster` 相同入口：`threshold`、`k`、`min_samples`、`use_knn_graph`、`n_neighbors` 等须与线上一致，否则簇 id 不可比。 |
+| `classify_crawl_jsonl` / `classify_crawl_jsonl_dir` | `web2json.simple` | 单机全链路：读 JSONL → 上述特征与聚类 → 写 `cluster_list` / 按簇切片。 |
+| `extract_schema` / `infer_code` / `extract_data` / `extract_data_with_code` | `web2json.simple` | §4.3 按簇流水线；回填依赖 manifest 与 `cluster_list` join（§5.2）。 |
+
+**阶段映射（文档 §3 ↔ Spark / 混部）**
+
+| 文档阶段 | Spark 上适合做的事 | 须沿用 web2json 或外置作业的部分 |
+|----------|-------------------|----------------------------------|
+| **[1] 合并聚类** | 并行读 JSONL、解析行；在 executor 上对每行 HTML 调用 **`get_feature`**（与 `web2json.tools.cluster._compute_features` 相同）。 | **全局聚类**：在单机路径中为 **`cluster_html_layouts_optimized`**（内部：`fuse_features` → `cosine_similarity` + DBSCAN，或 `use_knn_graph` 近似）。分布式上需**二次开发或外置**：例如先落盘全量 feature dict，再在**大内存单机 / Ray** 调同一套 `fuse_features`+聚类，产出 `global_index → layout_cluster_id` 表再回灌 Spark；**禁止**换用与 `get_feature` 无关的自研特征。 |
+| **[2] cluster_list + 切片** | 对标签表生成 `cluster_list.jsonl`；按 `layout_cluster_id` **partitionBy** 写各簇 JSONL（及 `noise`）。可选行内附加 `layout_cluster_id`、`crawl_source_name`、`crawl_line_no`（与 `annotate_slice_rows` 约定一致）。 | 字段语义同 §4.2。 |
+| **[3] 按簇抽取** | 若仅用生成好的 `final_parser.py` 做 CPU 解析，可用 **`mapPartitions`** 批量跑解析（等价于 `extract_data_with_code` 的解析段）。 | **`extract_schema` / `infer_code`**（及含 LLM 的 schema 生成）多为 **每簇独立作业**（编排起 Pod/单机），与现网 API 一致；产出每簇 **`cluster_k_extract_manifest.jsonl`**（§5.2）。 |
+| **[4] 回填** | **`join`**：`cluster_list` + manifest + 解析结果；合并 `remark`、`track_loc`、`doc_loc`；**按 `source_name` 或分区键写出**，控制 shuffle。 | 逻辑同 §5.1，不新增 `_w2j` 顶层字段。 |
+
+**流程简图（执行形态）**
+
+```mermaid
+flowchart TB
+  subgraph P1["[1] 合并聚类"]
+    S1[Spark: 读 JSONL]
+    S2["executor: get_feature(html)"]
+    S3["外置/大内存: fuse_features + cluster_html_layouts_optimized 同参"]
+    S4[标签表 global_index / layout_cluster_id]
+    S1 --> S2 --> S3 --> S4
+  end
+  subgraph P2["[2] 切片"]
+    T1[Spark: cluster_list + partitionBy 簇]
+  end
+  subgraph P3["[3] 抽取"]
+    U1["每簇: extract_schema → infer_code → extract_data(_with_code)"]
+    U2[manifest]
+    U1 --> U2
+  end
+  subgraph P4["[4] 回填"]
+    V1[Spark: join + 分区写出]
+  end
+  S4 --> T1 --> U1
+  U2 --> V1
+```
+
+**小结**：Spark 擅长 **[2][4]** 与 **[1] 中的读数 + `get_feature`**；**与 `cluster_html_layouts_optimized` 等价的聚类**和 **含 LLM 的 [3] 前段**宜混部或编排，且 **特征与聚类参数必须来自 web2json**，以保证与 `Prod` 本地/单机试跑结果可对齐。
+
+---
+
+## 6. 待确认清单
+
+1. **`format_clusters` 命名**是否改为 `slices/` 或 `by_cluster/`？
+2. **回填产物**：按**源文件一对一**（`xxx.with_extract.jsonl`）还是**单文件 merged**？
+3. **噪声簇**是否跑抽取，还是仅打标原样输出？
+4. **ms-web-mma** 正式 **S3 前缀**与 **doc_loc** 桶是否已定稿（文档中示例路径需替换）？
+
+---
+
+## 7. `Prod/ms-web-mma` 本地目录快照（便于对齐 S3 与聚类输入）
+
+约定见同目录 **`s3Path.txt`**（Input：`hcorpus-develop-hw60p/.../prod/ms-web-mma/jsonl/`；Output：`xyz2-process-hdd1/.../nlp/ms-web-mma/v0001`）。
+
+当前仓库内该目录含 **6 个** `*.jsonl` + `s3Path.txt`：
+
+| 文件 | 行数（约） | 说明 |
+|------|------------|------|
+| `20260310094859_353_79bda33fa180eedac40d37876224609d.jsonl` | 191 | 主数据量（约 61MB，单行 HTML 较大） |
+| `20260312172301_353_ac06ada8c9d8f53d11ff4ce459ff470e.jsonl` | 1 | 小样本 |
+| `20260320141631_353_2f2ece9c1819287308a80962f0f108f5.jsonl` 等 4 个 | 各 1 | 同日多批次试探样（各约百 KB 级） |
+
+**清洗建议**：聚类/回填以 **191 行主文件**为基准即可；其余 5 个单行文件若仅为调试，可迁到子目录（如 `samples/`）或归档，避免与全量 `discover_jsonl_files` 合并时重复混入（若暂不移除，合并跑目录时需知悉总行数 ≈ **196** 且含重复 URL 风险）。是否搬迁由工程侧决定。
diff --git a/web2json/__init__.py b/web2json/__init__.py
index eeeeac5..fa75dee 100644
--- a/web2json/__init__.py
+++ b/web2json/__init__.py
@@ -18,6 +18,8 @@
     infer_code,
     extract_data_with_code,
     classify_html_dir,
+    classify_crawl_jsonl,
+    classify_crawl_jsonl_dir,
     # 返回数据类
     ExtractDataResult,
     ExtractSchemaResult,
@@ -38,6 +40,8 @@
     "infer_code",
     "extract_data_with_code",
     "classify_html_dir",
+    "classify_crawl_jsonl",
+    "classify_crawl_jsonl_dir",
     # 返回数据类
     "ExtractDataResult",
     "ExtractSchemaResult",
diff --git a/web2json/simple.py b/web2json/simple.py
index 95b8ebe..6629564 100644
--- a/web2json/simple.py
+++ b/web2json/simple.py
@@ -5,7 +5,7 @@
 import sys
 import json
 from pathlib import Path
-from typing import Optional, Dict, List, Any
+from typing import Optional, Dict, List, Any, Callable
 from dataclasses import dataclass, asdict
 import numpy as np
 from loguru import logger
@@ -99,7 +99,7 @@ class Web2JsonConfig:
 
     Args:
         name: 运行名称（在output_path下创建此名称的子目录）
-        html_path: HTML文件目录
+        html_path: HTML 目录、单个 ``.html``/``.htm`` 文件，或 crawl 源 ``.jsonl``（见 ``crawl_html_field``）
         output_path: 输出主目录（默认为"output"）
         iteration_rounds: 迭代轮数（用于Schema学习的样本数量，默认3）
         schema: Schema模板（可选，为None时使用auto模式，有值时使用predefined模式）
@@ -129,6 +129,9 @@ class Web2JsonConfig:
     parser_code: Optional[str] = None
     save: Optional[List[str]] = None
     remove_null_fields: bool = True
+    # crawl JSONL：html_path 为 .jsonl 时物化 HTML 所用字段与主键（见 _resolve_pipeline_html_files）
+    crawl_html_field: str = "html"
+    crawl_jsonl_id_field: Optional[str] = "track_id"
 
     def __post_init__(self):
         """验证配置"""
@@ -207,6 +210,359 @@ def _read_html_files(directory_path: str) -> List[str]:
     raise ValueError(f"路径既不是文件也不是目录: {directory_path}")
 
 
+def _resolve_pipeline_html_files(
+    config: Web2JsonConfig,
+) -> tuple[List[str], Optional[Callable[[], None]]]:
+    """
+    将 ``html_path`` 解析为 HTML 文件路径列表。
+
+    - 目录 / 单 ``.html``：沿用 ``_read_html_files``。
+    - ``.jsonl``：把每行 ``html`` 字段物化到**临时目录**中的 ``.html``，避免在项目里落大量切片文件；
+      返回第二个值 ``cleanup``，调用方须在 ``finally`` 中执行以删除临时目录。
+    """
+    import shutil
+    import tempfile
+    from web2json.tools.crawl_jsonl import materialize_jsonl_to_html_dir
+
+    p = Path(config.html_path)
+    if not p.exists():
+        raise FileNotFoundError(f"路径不存在: {config.html_path}")
+
+    if p.is_file() and p.suffix.lower() == ".jsonl":
+        tmp = Path(tempfile.mkdtemp(prefix="w2j_crawl_jsonl_"))
+        try:
+            hf = getattr(config, "crawl_html_field", "html")
+            idf = getattr(config, "crawl_jsonl_id_field", "track_id")
+            files = materialize_jsonl_to_html_dir(
+                p, tmp, html_field=hf, id_field=idf
+            )
+        except Exception:
+            shutil.rmtree(tmp, ignore_errors=True)
+            raise
+        if not files:
+            shutil.rmtree(tmp, ignore_errors=True)
+            raise ValueError(
+                f"JSONL 中无可用 html 字段（字段名: {getattr(config, 'crawl_html_field', 'html')}）: {config.html_path}"
+            )
+
+        def cleanup() -> None:
+            shutil.rmtree(tmp, ignore_errors=True)
+
+        logger.info(
+            f"  [crawl jsonl] 已将 {len(files)} 条 HTML 物化到临时目录（流水线结束后自动删除）"
+        )
+        return files, cleanup
+
+    if p.is_file():
+        if p.suffix.lower() in (".html", ".htm"):
+            return [str(p.resolve())], None
+        raise ValueError(
+            f"不支持的文件: {p}（请使用 HTML 目录、单个 .html 或 crawl .jsonl）"
+        )
+    if p.is_dir():
+        return _read_html_files(str(p)), None
+    raise ValueError(f"路径既不是文件也不是目录: {config.html_path}")
+
+
+def _execute_crawl_layout_cluster(
+    line_metas: List[Dict[str, Any]],
+    config: Web2JsonConfig,
+    *,
+    output_stem: str,
+    hf: str,
+    rid_f: Optional[str],
+    report_extra: Optional[Dict[str, Any]] = None,
+    annotate_slice_rows: bool = False,
+) -> ClusterResult:
+    """对已加载的 ``line_metas``（见 ``load_crawl_line_metas_for_file``）执行布局聚类并可选落盘。"""
+    from web2json.tools.cluster import cluster_html_layouts_optimized
+    from web2json.tools.crawl_jsonl import split_jsonl_by_cluster_labels, write_jsonl_lines
+    from web2json.tools.html_layout_cosin import get_feature
+
+    if not line_metas:
+        raise ValueError("无有效行")
+
+    surrogate_keys: List[str] = [
+        f"{m['source_name']}:{m['line_no']}" for m in line_metas
+    ]
+    valid_keys: List[str] = []
+    valid_contents: List[str] = []
+    for m, sk in zip(line_metas, surrogate_keys):
+        if not m.get("html"):
+            continue
+        try:
+            feature = get_feature(m["html"])
+        except Exception as e:
+            logger.warning(f"  跳过布局特征提取失败: {sk} ({e})")
+            continue
+        if not feature:
+            logger.warning(f"  跳过无有效布局特征: {sk}")
+            continue
+        valid_keys.append(sk)
+        valid_contents.append(m["html"])
+
+    if not valid_contents:
+        raise Exception("聚类失败: 没有可用于布局聚类的有效 HTML 页面")
+
+    try:
+        labels_partial, _sim, _clusters = cluster_html_layouts_optimized(
+            valid_contents,
+            use_knn_graph=True,
+        )
+    except Exception as e:
+        raise Exception(f"聚类失败: {e}") from e
+
+    label_map = {k: int(lbl) for k, lbl in zip(valid_keys, labels_partial)}
+    labels = np.array([label_map.get(sk, -1) for sk in surrogate_keys], dtype=int)
+
+    unique_labels = sorted(set(labels.tolist()))
+    noise_count = sum(1 for l in labels if l == -1)
+    cluster_count = len([l for l in unique_labels if l != -1])
+
+    logger.info("✓ 聚类分析完成")
+    logger.info(f"  总行数（非空行）: {len(line_metas)}")
+    logger.info(f"  识别出的布局簇数: {cluster_count}")
+    logger.info(f"  噪声点（未归类）: {noise_count}")
+
+    clusters_dict: Dict[str, List[str]] = {}
+    noise_files: List[str] = []
+    for lbl in unique_labels:
+        ids = [line_metas[i]["rid"] for i in range(len(line_metas)) if labels[i] == lbl]
+        if not ids:
+            continue
+        if lbl == -1:
+            noise_files = ids
+            clusters_dict["noise"] = ids
+        else:
+            clusters_dict[f"cluster_{lbl}"] = ids
+        logger.info(f"  {'噪声点' if lbl == -1 else f'簇 {lbl}'}: {len(ids)} 条")
+
+    if config.should_save():
+        out_root = Path(config.get_full_output_path())
+        out_root.mkdir(parents=True, exist_ok=True)
+        stem = output_stem
+
+        if config.should_save_item("report"):
+            report_payload: Dict[str, Any] = {
+                "source": "crawl_jsonl",
+                "html_field": hf,
+                "record_id_field": rid_f,
+                "clusters": {k: v for k, v in clusters_dict.items() if k != "noise"},
+                "noise_record_ids": noise_files,
+                "labels": labels.tolist(),
+                "record_ids": [m["rid"] for m in line_metas],
+                "record_keys": surrogate_keys,
+                "source_jsonl": [m["source_jsonl"] for m in line_metas],
+                "cluster_count": cluster_count,
+                "total_records": len(line_metas),
+            }
+            if report_extra:
+                report_payload.update(report_extra)
+
+            report_json = out_root / "cluster_report.json"
+            with open(report_json, "w", encoding="utf-8") as f:
+                json.dump(report_payload, f, ensure_ascii=False, indent=2)
+            logger.info(f"  ✓ 报告已保存: {report_json}")
+
+            cluster_list_path = out_root / "cluster_list.jsonl"
+            list_rows: List[Dict[str, Any]] = []
+            for i, m in enumerate(line_metas):
+                lab = int(labels[i])
+                list_rows.append(
+                    {
+                        "global_index": i,
+                        "layout_cluster_id": lab,
+                        "source_jsonl": m["source_jsonl"],
+                        "source_name": m["source_name"],
+                        "line_no": m["line_no"],
+                        "record_id": m["rid"],
+                    }
+                )
+            write_jsonl_lines(cluster_list_path, list_rows)
+            logger.info(f"  ✓ 聚类清单已保存: {cluster_list_path}")
+
+            info_txt = out_root / "cluster_info.txt"
+            with open(info_txt, "w", encoding="utf-8") as f:
+                f.write("Crawl JSONL 布局聚类结果\n")
+                f.write("=" * 70 + "\n\n")
+                if report_extra and report_extra.get("jsonl_files"):
+                    f.write(f"源 JSONL 文件数: {len(report_extra['jsonl_files'])}\n")
+                f.write(f"总行数: {len(line_metas)}\n")
+                f.write(f"簇数: {cluster_count}\n")
+                f.write(f"噪声: {noise_count}\n\n")
+                for lbl in unique_labels:
+                    ids = [line_metas[i]["rid"] for i in range(len(line_metas)) if labels[i] == lbl]
+                    if lbl == -1:
+                        f.write(f"噪声点: {len(ids)} 条\n")
+                    else:
+                        f.write(f"簇 {lbl}: {len(ids)} 条\n")
+                    for rid in ids[:5]:
+                        f.write(f"  - {rid}\n")
+                    if len(ids) > 5:
+                        f.write(f"  ... 还有 {len(ids) - 5} 条\n")
+                    f.write("\n")
+            logger.info(f"  ✓ 摘要已保存: {info_txt}")
+
+        if config.should_save_item("jsonl"):
+            recs = []
+            for i, m in enumerate(line_metas):
+                row = dict(m["obj"])
+                if annotate_slice_rows:
+                    row["layout_cluster_id"] = int(labels[i])
+                    row["crawl_source_name"] = m["source_name"]
+                    row["crawl_line_no"] = m["line_no"]
+                recs.append(row)
+            split_jsonl_by_cluster_labels(
+                recs,
+                labels.tolist(),
+                out_dir=out_root,
+                stem=stem,
+            )
+            logger.info(f"  ✓ 已按簇写出 JSONL 切片到: {out_root}")
+
+        logger.info(f"✓ 结果已保存到: {out_root}")
+
+    clusters_only = {k: v for k, v in clusters_dict.items() if k.startswith("cluster_")}
+    return ClusterResult(
+        clusters=clusters_only,
+        labels=labels,
+        noise_files=noise_files,
+        cluster_count=cluster_count,
+    )
+
+
+def classify_crawl_jsonl(
+    config: Web2JsonConfig,
+    jsonl_path: Optional[str] = None,
+    *,
+    html_field: Optional[str] = None,
+    record_id_field: Optional[str] = None,
+    annotate_slice_rows: bool = False,
+) -> ClusterResult:
+    """API：对 crawl JSONL 按布局聚类，并按簇写出切片 JSONL（不物化 HTML 到磁盘）。
+
+    从每行 JSON 的 ``html``（或 ``html_field``）读入 HTML，布局特征与
+    ``classify_html_dir`` 相同。输出文件名形如
+    ``{jsonl 主名}_cluster_0.jsonl``、``{主名}_noise.jsonl``。
+
+    - ``config.html_path`` 或参数 ``jsonl_path``：指向 ``.jsonl`` 文件。
+    - ``config.save`` 含 ``report`` 时写 ``cluster_report.json`` / ``cluster_info.txt`` /
+      ``cluster_list.jsonl``；含 ``jsonl`` 时写各簇切片（推荐 ``['report', 'jsonl']``）。
+
+    ``ClusterResult.clusters`` 的值为 **record_id** 列表（``track_id`` 或 ``line_{n}``），
+    不是文件路径。
+    """
+    from web2json.tools.crawl_jsonl import load_crawl_line_metas_for_file
+
+    _setup_logger()
+    path = Path(jsonl_path or config.html_path)
+    if not path.is_file() or path.suffix.lower() != ".jsonl":
+        raise ValueError("classify_crawl_jsonl 需要指向一个 .jsonl 文件")
+
+    hf = html_field if html_field is not None else getattr(config, "crawl_html_field", "html")
+    rid_f = (
+        record_id_field
+        if record_id_field is not None
+        else getattr(config, "crawl_jsonl_id_field", "track_id")
+    )
+
+    logger.info(f"[API] classify_crawl_jsonl - 从 JSONL 读 html 并布局聚类")
+    logger.info(f"  JSONL: {path}")
+    logger.info(f"  html 字段: {hf}, id 字段: {rid_f}")
+    if config.should_save():
+        logger.info(f"  保存内容: {', '.join(config.save)}")
+        logger.info(f"  输出路径: {config.get_full_output_path()}")
+
+    line_metas = load_crawl_line_metas_for_file(path, html_field=hf, record_id_field=rid_f)
+    report_extra = {
+        "jsonl_path": str(path.resolve()),
+        "mode": "single_file",
+    }
+    return _execute_crawl_layout_cluster(
+        line_metas,
+        config,
+        output_stem=path.stem,
+        hf=hf,
+        rid_f=rid_f,
+        report_extra=report_extra,
+        annotate_slice_rows=annotate_slice_rows,
+    )
+
+
+def classify_crawl_jsonl_dir(
+    config: Web2JsonConfig,
+    jsonl_dir: Optional[str] = None,
+    *,
+    recursive: bool = True,
+    html_field: Optional[str] = None,
+    record_id_field: Optional[str] = None,
+    output_stem: str = "ms_web_jwn_union",
+    annotate_slice_rows: bool = True,
+) -> ClusterResult:
+    """API：将目录下**所有** ``*.jsonl`` 合并为一次布局聚类（不按文件顺序串行跑 schema 流水线）。
+
+    适用于 ``Prod/ms-web-jwn`` 等多文件场景：先统一得到 ``cluster_list.jsonl``，
+    再按簇分别对切片调用 ``extract_schema`` / ``infer_code`` / ``extract_data_with_code``。
+
+    - ``jsonl_dir`` 或 ``config.html_path``：目录路径。
+    - ``recursive``：是否递归子目录查找 ``*.jsonl``。
+    - ``output_stem``：输出切片文件名前缀，如 ``{stem}_cluster_0.jsonl``。
+    - ``annotate_slice_rows``：为 ``jsonl`` 切片每行附加 ``layout_cluster_id``、``crawl_source_name``、``crawl_line_no``（与 ``cluster_list`` 对齐，无 ``_w2j`` 前缀）。
+
+    需在 ``config.save`` 中包含 ``report``（含 ``cluster_list.jsonl``）与可选 ``jsonl``。
+    """
+    from web2json.tools.crawl_jsonl import discover_jsonl_files, load_crawl_line_metas_for_file
+
+    _setup_logger()
+    root = Path(jsonl_dir or config.html_path)
+    if not root.is_dir():
+        raise NotADirectoryError(f"classify_crawl_jsonl_dir 需要目录: {root}")
+
+    hf = html_field if html_field is not None else getattr(config, "crawl_html_field", "html")
+    rid_f = (
+        record_id_field
+        if record_id_field is not None
+        else getattr(config, "crawl_jsonl_id_field", "track_id")
+    )
+
+    files = discover_jsonl_files(root, recursive=recursive)
+    if not files:
+        raise FileNotFoundError(f"目录下未找到 .jsonl: {root}")
+
+    logger.info(f"[API] classify_crawl_jsonl_dir - 合并 {len(files)} 个 JSONL 做一次布局聚类")
+    for fp in files[:20]:
+        logger.info(f"  - {fp}")
+    if len(files) > 20:
+        logger.info(f"  ... 共 {len(files)} 个文件")
+
+    line_metas: List[Dict[str, Any]] = []
+    for fp in files:
+        line_metas.extend(load_crawl_line_metas_for_file(fp, html_field=hf, record_id_field=rid_f))
+
+    if not line_metas:
+        raise ValueError("合并后无任何有效行")
+
+    report_extra = {
+        "mode": "multi_jsonl_union",
+        "jsonl_dir": str(root.resolve()),
+        "jsonl_files": [str(p.resolve()) for p in files],
+        "jsonl_file_count": len(files),
+    }
+    if config.should_save():
+        logger.info(f"  保存内容: {', '.join(config.save)}")
+        logger.info(f"  输出路径: {config.get_full_output_path()}")
+
+    return _execute_crawl_layout_cluster(
+        line_metas,
+        config,
+        output_stem=output_stem,
+        hf=hf,
+        rid_f=rid_f,
+        report_extra=report_extra,
+        annotate_slice_rows=annotate_slice_rows,
+    )
+
+
 def _cleanup_unwanted_files(output_path: Path, save_items: List[str], api_type: str = "extract_data"):
     """
     清理不需要保存的文件，只保留save列表中指定的内容
@@ -336,9 +692,9 @@ def extract_data(config: Web2JsonConfig) -> ExtractDataResult:
         logger.info(f"  保存内容: {', '.join(config.save)}")
         logger.info(f"  输出路径: {config.get_full_output_path()}")
 
-    # 读取HTML文件
-    html_files = _read_html_files(config.html_path)
-    logger.info(f"找到 {len(html_files)} 个HTML文件")
+    jsonl_cleanup: Optional[Callable[[], None]] = None
+    html_files, jsonl_cleanup = _resolve_pipeline_html_files(config)
+    logger.info(f"找到 {len(html_files)} 个 HTML 文件")
 
     # 根据是否需要保存决定使用临时目录还是持久目录
     import tempfile
@@ -483,6 +839,8 @@ def extract_data(config: Web2JsonConfig) -> ExtractDataResult:
         )
 
     finally:
+        if jsonl_cleanup:
+            jsonl_cleanup()
         # 根据配置决定清理策略
         if use_temp_dir:
             # 临时目录：完全清理
@@ -539,9 +897,9 @@ def extract_schema(config: Web2JsonConfig) -> ExtractSchemaResult:
         logger.info(f"  保存内容: {', '.join(config.save)}")
         logger.info(f"  输出路径: {config.get_full_output_path()}")
 
-    # 读取HTML文件
-    html_files = _read_html_files(config.html_path)
-    logger.info(f"找到 {len(html_files)} 个HTML文件")
+    jsonl_cleanup: Optional[Callable[[], None]] = None
+    html_files, jsonl_cleanup = _resolve_pipeline_html_files(config)
+    logger.info(f"找到 {len(html_files)} 个 HTML 文件")
 
     # 根据是否需要保存决定使用临时目录还是持久目录
     import tempfile
@@ -643,6 +1001,8 @@ def extract_schema(config: Web2JsonConfig) -> ExtractSchemaResult:
         )
 
     finally:
+        if jsonl_cleanup:
+            jsonl_cleanup()
         # 根据配置决定清理策略
         if use_temp_dir:
             # 临时目录：完全清理
@@ -714,16 +1074,9 @@ def infer_code(config: Web2JsonConfig) -> InferCodeResult:
         logger.info(f"  保存内容: {', '.join(config.save)}")
         logger.info(f"  输出路径: {config.get_full_output_path()}")
 
-    # 处理HTML路径（可能是目录或单个文件）
-    html_file_path = Path(config.html_path)
-    if html_file_path.is_dir():
-        html_files = _read_html_files(config.html_path)
-    elif html_file_path.is_file():
-        html_files = [str(html_file_path.absolute())]
-    else:
-        raise FileNotFoundError(f"HTML路径不存在: {config.html_path}")
-
-    logger.info(f"找到 {len(html_files)} 个HTML文件")
+    jsonl_cleanup: Optional[Callable[[], None]] = None
+    html_files, jsonl_cleanup = _resolve_pipeline_html_files(config)
+    logger.info(f"找到 {len(html_files)} 个 HTML 文件")
 
     # 根据是否需要保存决定使用临时目录还是持久目录
     import tempfile
@@ -824,6 +1177,8 @@ def infer_code(config: Web2JsonConfig) -> InferCodeResult:
         )
 
     finally:
+        if jsonl_cleanup:
+            jsonl_cleanup()
         # 根据配置决定清理策略
         if use_temp_dir:
             # 临时目录：完全清理
@@ -913,16 +1268,9 @@ def extract_data_with_code(config: Web2JsonConfig) -> ParseResult:
         logger.info(f"  保存内容: {', '.join(config.save)}")
         logger.info(f"  输出路径: {config.get_full_output_path()}")
 
-    # 处理HTML路径（可能是目录或单个文件）
-    html_file_path = Path(config.html_path)
-    if html_file_path.is_dir():
-        html_files = _read_html_files(config.html_path)
-    elif html_file_path.is_file():
-        html_files = [str(html_file_path.absolute())]
-    else:
-        raise FileNotFoundError(f"HTML路径不存在: {config.html_path}")
-
-    logger.info(f"找到 {len(html_files)} 个HTML文件")
+    jsonl_cleanup: Optional[Callable[[], None]] = None
+    html_files, jsonl_cleanup = _resolve_pipeline_html_files(config)
+    logger.info(f"找到 {len(html_files)} 个 HTML 文件")
 
     # 确定是否需要保存到磁盘
     should_save = config.should_save()
@@ -978,6 +1326,8 @@ def extract_data_with_code(config: Web2JsonConfig) -> ParseResult:
         )
 
     finally:
+        if jsonl_cleanup:
+            jsonl_cleanup()
         # 清理临时parser文件
         import os
         if os.path.exists(temp_parser_path):
@@ -1034,6 +1384,20 @@ def classify_html_dir(config: Web2JsonConfig) -> ClusterResult:
         logger.info(f"  保存内容: {', '.join(config.save)}")
         logger.info(f"  输出路径: {config.get_full_output_path()}")
 
+    p_html = Path(config.html_path)
+    if p_html.is_file() and p_html.suffix.lower() == ".jsonl":
+        logger.info("  检测到 crawl .jsonl：改用 classify_crawl_jsonl（按 html 字段聚类并写出切片）")
+        return classify_crawl_jsonl(config)
+
+    if p_html.is_dir():
+        has_html = bool(list(p_html.glob("*.html")) + list(p_html.glob("*.htm")))
+        top_jsonl = list(p_html.glob("*.jsonl"))
+        if top_jsonl and not has_html:
+            logger.info(
+                "  检测到目录顶层仅有 .jsonl（无 .html）：改用 classify_crawl_jsonl_dir 合并聚类"
+            )
+            return classify_crawl_jsonl_dir(config, jsonl_dir=str(p_html), recursive=False)
+
     # 读取HTML文件
     html_files = _read_html_files(config.html_path)
     logger.info(f"找到 {len(html_files)} 个HTML文件")
diff --git a/web2json/tools/crawl_jsonl.py b/web2json/tools/crawl_jsonl.py
new file mode 100644
index 0000000..c9d8182
--- /dev/null
+++ b/web2json/tools/crawl_jsonl.py
@@ -0,0 +1,217 @@
+"""
+Crawl JSONL helpers: cluster split & optional materialize to .html for legacy pipeline.
+
+JSONL rows are expected to be JSON objects with at least a string field for HTML
+(default key ``html``). Record identity defaults to ``track_id`` or ``line_{n}``.
+"""
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple
+
+_SAFE_NAME = re.compile(r"[^a-zA-Z0-9._-]+")  # runs of chars other than ASCII letters, digits, ".", "_", "-"
+
+
+def _safe_filename_part(s: str, max_len: int = 80) -> str:
+    """Replace runs of unsafe characters with "_", cap at ``max_len``; "id" if empty."""
+    return _SAFE_NAME.sub("_", s.strip())[:max_len] or "id"
+
+
+def iter_crawl_jsonl_records(
+    jsonl_path: Path,
+    *,
+    html_field: str = "html",
+    id_field: Optional[str] = "track_id",
+) -> Iterator[Tuple[int, str, Dict[str, Any], str]]:
+    """
+    Yield ``(line_no, record_id, obj, html)`` for each JSONL row with usable HTML.
+
+    ``line_no`` is the 1-based physical line number; ``record_id`` is
+    ``str(obj[id_field])`` when that value is present and not None, otherwise
+    ``line_{line_no}``. Blank lines, invalid JSON, non-object rows and rows whose
+    ``html_field`` is not a non-blank string are skipped. A UTF-8 BOM is tolerated.
+    """
+    # "utf-8-sig" drops a leading BOM; with plain "utf-8" the U+FEFF stays on line 1,
+    # json.loads rejects it, and the first record would be skipped silently.
+    with jsonl_path.open("r", encoding="utf-8-sig") as f:
+        for line_no, raw in enumerate(f, 1):
+            raw = raw.strip()
+            if not raw:
+                continue
+            try:
+                obj = json.loads(raw)
+            except json.JSONDecodeError:
+                continue
+            html = obj.get(html_field) if isinstance(obj, dict) else None
+            if not isinstance(html, str) or not html.strip():
+                continue
+            rid = obj.get(id_field) if id_field else None
+            yield line_no, (f"line_{line_no}" if rid is None else str(rid)), obj, html
+
+
+def materialize_jsonl_to_html_dir(
+    jsonl_path: Path,
+    dest_dir: Path,
+    *,
+    html_field: str = "html",
+    id_field: Optional[str] = "track_id",
+) -> List[str]:
+    """
+    Dump every usable row of ``jsonl_path`` into ``dest_dir`` as its own ``.html`` file.
+
+    Rows are those yielded by ``iter_crawl_jsonl_records``. Each file is named
+    ``{line_no:06d}_{safe_id}.html``; ``dest_dir`` is created if missing.
+
+    Returns the resolved paths of the written files as a sorted list of strings.
+    """
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    records = iter_crawl_jsonl_records(
+        jsonl_path, html_field=html_field, id_field=id_field
+    )
+    written: List[str] = []
+    for line_no, rid, _obj, html in records:
+        # The zero-padded line number keeps names unique even when ids repeat.
+        target = dest_dir / f"{line_no:06d}_{_safe_filename_part(rid)}.html"
+        target.write_text(html, encoding="utf-8")
+        written.append(str(target.resolve()))
+    return sorted(written)
+
+
+def discover_jsonl_files(directory: Path, *, recursive: bool = True) -> List[Path]:
+    """List the ``*.jsonl`` files in ``directory`` (recursively if asked), sorted.
+
+    Raises ``NotADirectoryError`` when ``directory`` is not an existing directory.
+    """
+    if not directory.is_dir():
+        raise NotADirectoryError(f"不是目录: {directory}")
+    walk = directory.rglob if recursive else directory.glob
+    return [path for path in sorted(walk("*.jsonl")) if path.is_file()]
+
+
+def load_crawl_line_metas_for_file(
+    jsonl_path: Path,
+    *,
+    html_field: str = "html",
+    record_id_field: Optional[str] = "track_id",
+) -> List[Dict[str, Any]]:
+    """
+    Parse one crawl JSONL file into one meta dict per non-blank line.
+
+    Unlike ``iter_crawl_jsonl_records``, unusable lines are kept (with
+    ``html`` set to None) instead of dropped, so every non-blank physical line
+    of the file is represented in the result, in file order.
+
+    NOTE(review): the per-line rules are meant to mirror ``classify_crawl_jsonl``
+    (defined elsewhere) - confirm that it also tolerates a leading BOM.
+
+    Args:
+        jsonl_path: File to read; decoded as UTF-8, a leading BOM is tolerated.
+        html_field: Key of the HTML string inside each JSON object.
+        record_id_field: Key of the record id; None/empty disables the lookup.
+
+    Returns:
+        A list of dicts with the keys ``line_no`` (1-based), ``obj``, ``html``,
+        ``rid``, ``source_jsonl`` (resolved path as str) and ``source_name``
+        (file name). Per kind of line:
+
+        - invalid JSON: ``obj`` is a placeholder dict whose
+          ``_classify_json_error`` is ``"json_decode_error"``, ``html`` is None
+          and ``rid`` is ``"{source_name}:{line_no}"``;
+        - valid JSON that is not an object: same placeholder, with
+          ``"not_a_json_object"``;
+        - JSON object: ``obj`` is the parsed dict, ``rid`` is
+          ``str(obj[record_id_field])`` when that value is present and not
+          None, else ``"line_{line_no}"``, and ``html`` is ``obj[html_field]``
+          when it is a non-blank string, else None.
+    """
+    source_jsonl = str(jsonl_path.resolve())
+    source_name = jsonl_path.name
+
+    def build_meta(
+        line_no: int, obj: Dict[str, Any], html: Optional[str], rid: str
+    ) -> Dict[str, Any]:
+        # Single place that fixes the key set shared by every kind of row.
+        return {
+            "line_no": line_no,
+            "obj": obj,
+            "html": html,
+            "rid": rid,
+            "source_jsonl": source_jsonl,
+            "source_name": source_name,
+        }
+
+    def build_error_meta(line_no: int, reason: str) -> Dict[str, Any]:
+        # Placeholder row for a line that did not parse to a JSON object; the
+        # rid is derived from file name and line number because no id exists.
+        placeholder = {
+            "_classify_json_error": reason,
+            "line": line_no,
+            "source_jsonl": source_jsonl,
+        }
+        return build_meta(line_no, placeholder, None, f"{source_name}:{line_no}")
+
+    metas: List[Dict[str, Any]] = []
+    # "utf-8-sig" drops a leading BOM; with plain "utf-8" the U+FEFF stays on
+    # line 1, json.loads rejects it, and a valid first record would be
+    # reported as "json_decode_error".
+    with jsonl_path.open("r", encoding="utf-8-sig") as f:
+        for line_no, raw in enumerate(f, 1):
+            raw = raw.strip()
+            if not raw:
+                continue
+            try:
+                obj = json.loads(raw)
+            except json.JSONDecodeError:
+                metas.append(build_error_meta(line_no, "json_decode_error"))
+                continue
+            if not isinstance(obj, dict):
+                metas.append(build_error_meta(line_no, "not_a_json_object"))
+                continue
+            rid_value = obj.get(record_id_field) if record_id_field else None
+            rid = f"line_{line_no}" if rid_value is None else str(rid_value)
+            html = obj.get(html_field)
+            if not isinstance(html, str) or not html.strip():
+                # Keep the row but mark it as having no usable HTML.
+                html = None
+            metas.append(build_meta(line_no, obj, html, rid))
+    return metas
+
+
+def write_jsonl_lines(path: Path, rows: List[Dict[str, Any]]) -> None:
+    """Write ``rows`` to ``path`` as UTF-8 JSONL (non-ASCII unescaped), creating parent dirs."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as sink:
+        sink.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
+
+
+def split_jsonl_by_cluster_labels(
+    records: List[Dict[str, Any]],
+    labels: List[int],
+    *,
+    out_dir: Path,
+    stem: str,
+) -> Dict[str, Path]:
+    """
+    Group ``records`` by cluster label and write each group to its own JSONL file.
+
+    Label ``-1`` goes to ``{stem}_noise.jsonl``; any other label ``k`` goes to
+    ``{stem}_cluster_{k}.jsonl``. Files are written under ``out_dir`` via
+    ``write_jsonl_lines``; records keep their input order within a file.
+
+    Returns a dict mapping ``"noise"`` / ``"cluster_{k}"`` to the written path.
+    Raises ``ValueError`` when ``records`` and ``labels`` differ in length.
+    """
+    if len(labels) != len(records):
+        raise ValueError("records and labels length mismatch")
+
+    grouped: Dict[str, List[Dict[str, Any]]] = {}
+    for label, record in zip(labels, records):
+        bucket = f"cluster_{label}" if label != -1 else "noise"
+        grouped.setdefault(bucket, []).append(record)
+    # The bucket key doubles as the file-name suffix ("noise" or "cluster_{k}").
+    targets = {bucket: out_dir / f"{stem}_{bucket}.jsonl" for bucket in grouped}
+    for bucket, target in targets.items():
+        write_jsonl_lines(target, grouped[bucket])
+    return targets