From f3dd52e25b3e43899e4a9d136e009189c442a1fb Mon Sep 17 00:00:00 2001 From: ql101 Date: Tue, 14 Apr 2026 13:36:36 +0800 Subject: [PATCH 1/5] Run in jupter --- jupyter_helper.py | 3 + notebooks/jupyter_helper.py | 136 ++++++++++++ notebooks/web2json_quickstart.ipynb | 100 +++++++++ scripts/build_html_manifest.py | 141 ++++++++++++ scripts/run_jsonl_web2json_pipeline.py | 296 +++++++++++++++++++++++++ scripts/verify_extract_alignment.py | 282 +++++++++++++++++++++++ 6 files changed, 958 insertions(+) create mode 100644 jupyter_helper.py create mode 100644 notebooks/jupyter_helper.py create mode 100644 notebooks/web2json_quickstart.ipynb create mode 100644 scripts/build_html_manifest.py create mode 100644 scripts/run_jsonl_web2json_pipeline.py create mode 100644 scripts/verify_extract_alignment.py diff --git a/jupyter_helper.py b/jupyter_helper.py new file mode 100644 index 0000000..d82c824 --- /dev/null +++ b/jupyter_helper.py @@ -0,0 +1,3 @@ +"""Compatibility wrapper so notebooks can import jupyter_helper from multiple locations.""" + +from notebooks.jupyter_helper import * # noqa: F401,F403 diff --git a/notebooks/jupyter_helper.py b/notebooks/jupyter_helper.py new file mode 100644 index 0000000..801d70e --- /dev/null +++ b/notebooks/jupyter_helper.py @@ -0,0 +1,136 @@ +"""Utilities for running web2json-agent inside Jupyter notebooks.""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path +from typing import Any, Optional, Sequence + +PROJECT_ROOT = Path(__file__).resolve().parents[1] + + +def prepare_notebook( + api_key: Optional[str] = None, + api_base: Optional[str] = None, + project_root: Optional[str] = None, +) -> Path: + """Prepare the notebook process for local package imports and env loading.""" + root = Path(project_root).expanduser().resolve() if project_root else PROJECT_ROOT + + if str(root) not in sys.path: + sys.path.insert(0, str(root)) + + os.chdir(root) + + if api_key: + 
os.environ["OPENAI_API_KEY"] = api_key + + if api_base: + os.environ["OPENAI_API_BASE"] = api_base + + return root + + +def make_extract_config( + name: str, + html_path: str, + output_path: str = "output", + save: Optional[Sequence[str]] = ("schema", "code", "data"), + schema: Optional[dict[str, Any]] = None, + iteration_rounds: int = 3, + enable_schema_edit: bool = False, + remove_null_fields: bool = True, + parser_code: Optional[str] = None, +): + """Build a Web2JsonConfig with notebook-friendly path resolution.""" + prepare_notebook() + + from web2json import Web2JsonConfig + + html_target = _resolve_project_path(html_path) + output_target = _resolve_project_path(output_path) + + return Web2JsonConfig( + name=name, + html_path=str(html_target), + output_path=str(output_target), + iteration_rounds=iteration_rounds, + schema=schema, + enable_schema_edit=enable_schema_edit, + parser_code=parser_code, + save=list(save) if save is not None else None, + remove_null_fields=remove_null_fields, + ) + + +def preview_records(records: Sequence[dict[str, Any]], limit: int = 3) -> list[dict[str, Any]]: + """Return the first few parsed records so a notebook cell renders them directly.""" + return list(records[:limit]) + + +def print_schema(schema: dict[str, Any]) -> None: + """Pretty print schema content inside notebooks.""" + print(json.dumps(schema, ensure_ascii=False, indent=2)) + + +def summarize_cluster_result(cluster_result: Any) -> dict[str, Any]: + """Convert a cluster result into a compact notebook-friendly summary.""" + return { + "cluster_count": cluster_result.cluster_count, + "clusters": {name: len(files) for name, files in cluster_result.clusters.items()}, + "noise_files": len(cluster_result.noise_files), + } + + +def run_jsonl_pipeline( + source_jsonl: str, + work_id: str = "", + input_root: str = "input_html", + output_root: str = "output", + html_key: str = "html", + iteration_rounds: int = 3, + cluster_limit: int = 0, +): + """Run the full JSONL pipeline 
from a notebook and return the structured summary.""" + prepare_notebook() + + from scripts.run_jsonl_web2json_pipeline import run_jsonl_pipeline as _run_jsonl_pipeline + + return _run_jsonl_pipeline( + source_jsonl=str(_resolve_project_path(source_jsonl)), + work_id=work_id, + input_root=str(_resolve_project_path(input_root)), + output_root=str(_resolve_project_path(output_root)), + html_key=html_key, + iteration_rounds=iteration_rounds, + cluster_limit=cluster_limit, + ) + + +def summarize_pipeline_result(result: Any) -> dict[str, Any]: + """Build a compact summary view for notebook display.""" + return { + "source_jsonl": result.source_jsonl, + "pipeline_root": result.pipeline_root, + "cluster_count": result.cluster_count, + "clusters": [ + { + "cluster_name": cluster["cluster_name"], + "cluster_size": cluster["cluster_size"], + "parse_success_count": cluster["parse_success_count"], + "parse_failed_count": cluster["parse_failed_count"], + } + for cluster in result.clusters + ], + "total_token_usage": result.total_token_usage, + "summary_path": result.summary_path, + } + + +def _resolve_project_path(path_str: str) -> Path: + path = Path(path_str).expanduser() + if path.is_absolute(): + return path + return (PROJECT_ROOT / path).resolve() diff --git a/notebooks/web2json_quickstart.ipynb b/notebooks/web2json_quickstart.ipynb new file mode 100644 index 0000000..da203aa --- /dev/null +++ b/notebooks/web2json_quickstart.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# web2json-agent JSONL Pipeline Quickstart\n", + "\n", + "这个 notebook 基于 `scripts/run_jsonl_web2json_pipeline.py`,按顺序运行下面几个单元即可。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import sys\n", + "\n", + "PROJECT_ROOT = Path.cwd()\n", + "if PROJECT_ROOT.name == \"notebooks\":\n", + " PROJECT_ROOT = PROJECT_ROOT.parent\n", + "if str(PROJECT_ROOT) not in 
sys.path:\n", + " sys.path.insert(0, str(PROJECT_ROOT))\n", + "\n", + "from jupyter_helper import prepare_notebook\n", + "\n", + "PROJECT_ROOT = prepare_notebook(\n", + " api_key=\"YOUR_API_KEY\",\n", + " api_base=\"https://api.openai.com/v1\", # 如果你使用兼容网关,请替换这里\n", + ")\n", + "\n", + "PROJECT_ROOT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from jupyter_helper import run_jsonl_pipeline, summarize_pipeline_result\n", + "\n", + "result = run_jsonl_pipeline(\n", + " source_jsonl=\"ToClassify/sample.json\",\n", + " work_id=\"sample_run\",\n", + " input_root=\"input_html\",\n", + " output_root=\"output\",\n", + " html_key=\"html\",\n", + " iteration_rounds=3,\n", + " cluster_limit=1,\n", + ")\n", + "\n", + "summarize_pipeline_result(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result.to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result.summary_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_dir = PROJECT_ROOT / \"output\" / \"sample_run_pipeline\"\n", + "output_dir" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/build_html_manifest.py b/scripts/build_html_manifest.py new file mode 100644 index 0000000..8d1508f --- /dev/null +++ b/scripts/build_html_manifest.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +""" +从 crawl jsonl 拆分出 HTML 文件,并生成 manifest.jsonl 索引。 + +示例: +python scripts/build_html_manifest.py \ + --source ToClassify/example.jsonl \ + --output-dir input_html/example_set +""" + +from __future__ import annotations + +import argparse +import json +from 
pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Split a crawl jsonl into numbered HTML files plus manifest.jsonl." + ) + parser.add_argument( + "--source", + required=True, + help="源 jsonl 文件路径,每行应至少包含 html 字段。", + ) + parser.add_argument( + "--output-dir", + required=True, + help="输出目录,会写入 0001.html... 和 manifest.jsonl。", + ) + parser.add_argument( + "--html-key", + default="html", + help="HTML 内容字段名,默认 html。", + ) + parser.add_argument( + "--start-index", + type=int, + default=1, + help="输出编号起始值,默认 1。", + ) + parser.add_argument( + "--width", + type=int, + default=4, + help="输出文件编号宽度,默认 4,例如 0001.html。", + ) + parser.add_argument( + "--limit", + type=int, + default=0, + help="最多处理多少条记录,0 表示不限制。", + ) + parser.add_argument( + "--skip-empty-html", + action="store_true", + help="遇到缺失或空 html 时跳过该记录;默认直接报错。", + ) + return parser.parse_args() + + +def ensure_text(value: Any) -> str: + if value is None: + return "" + if isinstance(value, str): + return value + return str(value) + + +def main() -> None: + args = parse_args() + + source = Path(args.source) + output_dir = Path(args.output_dir) + + if not source.exists(): + raise FileNotFoundError(f"Source jsonl not found: {source}") + + output_dir.mkdir(parents=True, exist_ok=True) + manifest_path = output_dir / "manifest.jsonl" + + processed_count = 0 + skipped_count = 0 + current_index = args.start_index + + with source.open("r", encoding="utf-8") as src, manifest_path.open( + "w", encoding="utf-8" + ) as manifest_fp: + for source_line, line in enumerate(src, start=1): + if args.limit and processed_count >= args.limit: + break + + line = line.strip() + if not line: + skipped_count += 1 + continue + + try: + record = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON at line {source_line}: {exc}") from exc + + html = ensure_text(record.get(args.html_key)) + if not html.strip(): + if 
args.skip_empty_html: + skipped_count += 1 + continue + raise ValueError( + f"Missing or empty '{args.html_key}' at line {source_line}" + ) + + filename = f"{current_index:0{args.width}d}.html" + html_path = output_dir / filename + html_path.write_text(html, encoding="utf-8") + + manifest_record = { + "sample_no": current_index, + "source_line": source_line, + "filename": filename, + "track_id": record.get("track_id"), + "url": record.get("url"), + "status": record.get("status"), + "html_len": len(html), + } + + manifest_fp.write(json.dumps(manifest_record, ensure_ascii=False) + "\n") + + processed_count += 1 + current_index += 1 + + print(f"source: {source}") + print(f"output_dir: {output_dir}") + print(f"manifest: {manifest_path}") + print(f"processed: {processed_count}") + print(f"skipped: {skipped_count}") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_jsonl_web2json_pipeline.py b/scripts/run_jsonl_web2json_pipeline.py new file mode 100644 index 0000000..bf7ff33 --- /dev/null +++ b/scripts/run_jsonl_web2json_pipeline.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +""" +对 jsonl 执行完整 web2json 流水线: +1. 拆分 html + manifest +2. classify_html_dir +3. 对每个 cluster 执行 extract_schema +4. infer_code +5. extract_data_with_code +6. 
汇总 token 使用 +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from web2json import ( + Web2JsonConfig, + classify_html_dir, + extract_schema, + infer_code, + extract_data_with_code, +) +from web2json.utils.llm_client import LLMClient + + +@dataclass +class PipelineRunResult: + source_jsonl: str + manifest: str + html_dir: str + pipeline_root: str + cluster_count: int + clusters: list[dict[str, Any]] + total_token_usage: dict[str, int] + summary_path: str + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run web2json pipeline on a crawl jsonl.") + parser.add_argument("--source-jsonl", required=True, help="源 jsonl 文件路径。") + parser.add_argument( + "--work-id", + default="", + help="输出目录标识。默认根据 jsonl 文件名自动生成。", + ) + parser.add_argument( + "--input-root", + default="input_html", + help="HTML 输出根目录,默认 input_html。", + ) + parser.add_argument( + "--output-root", + default="output", + help="结果输出根目录,默认 output。", + ) + parser.add_argument( + "--html-key", + default="html", + help="jsonl 中 HTML 字段名,默认 html。", + ) + parser.add_argument( + "--iteration-rounds", + type=int, + default=3, + help="schema 学习轮数上限,默认 3。", + ) + parser.add_argument( + "--cluster-limit", + type=int, + default=0, + help="最多处理多少个 cluster,0 表示全部处理。", + ) + return parser.parse_args() + + +def slugify(value: str) -> str: + value = re.sub(r"[^a-zA-Z0-9]+", "_", value).strip("_").lower() + return value or "run" + + +def load_jsonl(path: Path) -> list[tuple[int, dict[str, Any]]]: + rows: list[tuple[int, dict[str, Any]]] = [] + with path.open("r", encoding="utf-8") as fp: + for line_no, line in enumerate(fp, start=1): + line = 
line.strip() + if not line: + continue + try: + rows.append((line_no, json.loads(line))) + except json.JSONDecodeError as exc: + print(f"skip invalid json line {line_no}: {exc}") + return rows + + +def build_html_manifest(source_jsonl: Path, output_dir: Path, html_key: str) -> Path: + rows = load_jsonl(source_jsonl) + output_dir.mkdir(parents=True, exist_ok=True) + manifest_path = output_dir / "manifest.jsonl" + + with manifest_path.open("w", encoding="utf-8") as manifest_fp: + for idx, (source_line, row) in enumerate(rows, start=1): + html = row.get(html_key) + if not isinstance(html, str) or not html.strip(): + continue + + filename = f"{idx:04d}.html" + (output_dir / filename).write_text(html, encoding="utf-8") + + manifest_row = { + "sample_no": idx, + "source_line": source_line, + "filename": filename, + "track_id": row.get("track_id"), + "url": row.get("url"), + "status": row.get("status"), + "html_len": len(html), + } + manifest_fp.write(json.dumps(manifest_row, ensure_ascii=False) + "\n") + + return manifest_path + + +def usage_delta(before: dict[str, int], after: dict[str, int]) -> dict[str, int]: + return { + "total_input_tokens": after["total_input_tokens"] - before["total_input_tokens"], + "total_completion_tokens": after["total_completion_tokens"] - before["total_completion_tokens"], + "total_tokens": after["total_tokens"] - before["total_tokens"], + } + + +def run_jsonl_pipeline( + source_jsonl: str, + work_id: str = "", + input_root: str = "input_html", + output_root: str = "output", + html_key: str = "html", + iteration_rounds: int = 3, + cluster_limit: int = 0, +) -> PipelineRunResult: + source_jsonl_path = Path(source_jsonl).expanduser() + if not source_jsonl_path.is_absolute(): + source_jsonl_path = (PROJECT_ROOT / source_jsonl_path).resolve() + + work_id = work_id or slugify(source_jsonl_path.stem) + + input_root_path = Path(input_root).expanduser() + if not input_root_path.is_absolute(): + input_root_path = (PROJECT_ROOT / 
input_root_path).resolve() + + output_root_path = Path(output_root).expanduser() + if not output_root_path.is_absolute(): + output_root_path = (PROJECT_ROOT / output_root_path).resolve() + + html_dir = input_root_path / work_id + pipeline_root = output_root_path / f"{work_id}_pipeline" + pipeline_root.mkdir(parents=True, exist_ok=True) + + print(f"source_jsonl: {source_jsonl_path}") + print(f"work_id: {work_id}") + print(f"html_dir: {html_dir}") + print(f"pipeline_root: {pipeline_root}") + + manifest_path = build_html_manifest(source_jsonl_path, html_dir, html_key) + print(f"manifest: {manifest_path}") + + classify_config = Web2JsonConfig( + name="classify", + html_path=str(html_dir), + output_path=str(pipeline_root), + save=["report", "files"], + ) + classify_result = classify_html_dir(classify_config) + + clusters_dir = pipeline_root / "classify" / "clusters" + cluster_names = sorted(classify_result.clusters.keys()) + if cluster_limit: + cluster_names = cluster_names[:cluster_limit] + + LLMClient.reset_usage() + cluster_summaries: list[dict[str, Any]] = [] + + for cluster_name in cluster_names: + cluster_html_dir = clusters_dir / cluster_name + cluster_files = classify_result.clusters[cluster_name] + cluster_size = len(cluster_files) + rounds = min(iteration_rounds, cluster_size) + + print(f"\n=== {cluster_name} ({cluster_size} files) ===") + + before_schema = LLMClient.get_total_usage() + schema_result = extract_schema( + Web2JsonConfig( + name=f"{cluster_name}_schema", + html_path=str(cluster_html_dir), + output_path=str(pipeline_root), + iteration_rounds=rounds, + save=["schema"], + ) + ) + after_schema = LLMClient.get_total_usage() + + before_code = LLMClient.get_total_usage() + code_result = infer_code( + Web2JsonConfig( + name=f"{cluster_name}_code", + html_path=str(cluster_html_dir), + output_path=str(pipeline_root), + schema=schema_result.final_schema, + save=["schema", "code"], + ) + ) + after_code = LLMClient.get_total_usage() + + parser_path = 
pipeline_root / f"{cluster_name}_code" / "final_parser.py" + parse_result = extract_data_with_code( + Web2JsonConfig( + name=f"{cluster_name}_extract_data", + html_path=str(cluster_html_dir), + output_path=str(pipeline_root), + parser_code=str(parser_path), + save=["data"], + ) + ) + + cluster_summary = { + "cluster_name": cluster_name, + "cluster_size": cluster_size, + "html_dir": str(cluster_html_dir), + "schema_output": str(pipeline_root / f"{cluster_name}_schema"), + "code_output": str(pipeline_root / f"{cluster_name}_code"), + "data_output": str(pipeline_root / f"{cluster_name}_extract_data"), + "parser_path": str(parser_path), + "schema_fields": list(schema_result.final_schema.keys()), + "schema_token_usage": usage_delta(before_schema, after_schema), + "code_token_usage": usage_delta(before_code, after_code), + "parse_success_count": parse_result.success_count, + "parse_failed_count": parse_result.failed_count, + } + cluster_summaries.append(cluster_summary) + + total_usage = LLMClient.get_total_usage() + summary = { + "source_jsonl": str(source_jsonl_path), + "manifest": str(manifest_path), + "html_dir": str(html_dir), + "pipeline_root": str(pipeline_root), + "cluster_count": len(cluster_names), + "clusters": cluster_summaries, + "total_token_usage": total_usage, + } + + summary_path = pipeline_root / "pipeline_summary.json" + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"\nsummary: {summary_path}") + print(json.dumps(total_usage, ensure_ascii=False, indent=2)) + + return PipelineRunResult( + source_jsonl=summary["source_jsonl"], + manifest=summary["manifest"], + html_dir=summary["html_dir"], + pipeline_root=summary["pipeline_root"], + cluster_count=summary["cluster_count"], + clusters=summary["clusters"], + total_token_usage=summary["total_token_usage"], + summary_path=str(summary_path), + ) + + +def main() -> None: + args = parse_args() + run_jsonl_pipeline( + source_jsonl=args.source_jsonl, + 
work_id=args.work_id, + input_root=args.input_root, + output_root=args.output_root, + html_key=args.html_key, + iteration_rounds=args.iteration_rounds, + cluster_limit=args.cluster_limit, + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/verify_extract_alignment.py b/scripts/verify_extract_alignment.py new file mode 100644 index 0000000..1da78a8 --- /dev/null +++ b/scripts/verify_extract_alignment.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +""" +校验 source jsonl / manifest / html / result json 之间的一致性。 + +示例: +python scripts/verify_extract_alignment.py \ + --source-jsonl ToClassify/source.jsonl \ + --manifest input_html/npi_sample_2000/manifest.jsonl \ + --html-dir input_html/npi_category_detail_cluster_1 \ + --result-dir output/npi_category_detail_cluster_1_code/result \ + --output output/npi_category_detail_cluster_1_code/qa_report.json + +或者直接使用 cluster manifest: +python scripts/verify_extract_alignment.py \ + --source-jsonl ToClassify/source.jsonl \ + --cluster-manifest output/npi_category_detail_cluster_1_code/cluster_manifest.json \ + --manifest input_html/npi_sample_2000/manifest.jsonl +""" + +from __future__ import annotations + +import argparse +import json +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Any +from bs4 import BeautifulSoup + + +@dataclass +class FileReport: + filename: str + source_line: int | None + url: str | None + track_id: str | None + html_exists: bool + result_exists: bool + source_match: bool + html_len_match: bool + field_checks: dict[str, dict[str, Any]] + ok: bool + errors: list[str] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Verify alignment across source jsonl, manifest, html files, and result json." 
+ ) + parser.add_argument("--source-jsonl", required=True, help="原始 crawl jsonl 文件路径。") + parser.add_argument("--manifest", required=True, help="完整 manifest.jsonl 文件路径。") + parser.add_argument( + "--cluster-manifest", + default="", + help="cluster_manifest.json 路径。提供后会从其中自动读取 html-dir / result-dir / 文件子集 / schema-path。", + ) + parser.add_argument("--html-dir", default="", help="HTML 文件目录。") + parser.add_argument("--result-dir", default="", help="解析结果 JSON 目录。") + parser.add_argument("--schema-json", default="", help="cluster 对应的 schema.json 路径。") + parser.add_argument("--output", default="", help="QA 报告输出路径(可选)。") + parser.add_argument( + "--fields", + nargs="*", + default=None, + help="要校验是否出现在 HTML 中的结果字段。未提供时会优先从 schema 自动推断,否则回退到 title content。", + ) + return parser.parse_args() + + +def load_jsonl(path: Path) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as fp: + for line_no, line in enumerate(fp, start=1): + line = line.strip() + if not line: + continue + try: + rows.append(json.loads(line)) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON in {path} line {line_no}: {exc}") from exc + return rows + + +def load_target_files( + manifest_rows: list[dict[str, Any]], cluster_manifest_path: Path | None +) -> tuple[list[dict[str, Any]], str, str, str]: + if not cluster_manifest_path: + return manifest_rows, "", "", "" + + cluster_manifest = json.loads(cluster_manifest_path.read_text(encoding="utf-8")) + wanted = {item["filename"] for item in cluster_manifest.get("files", [])} + filtered_rows = [row for row in manifest_rows if row.get("filename") in wanted] + html_dir = cluster_manifest.get("input_dir", "") + result_dir = cluster_manifest.get("result_dir", "") + schema_path = cluster_manifest.get("schema_path", "") + return filtered_rows, html_dir, result_dir, schema_path + + +def derive_fields_from_schema(schema_path: Path) -> list[str]: + schema = 
json.loads(schema_path.read_text(encoding="utf-8")) + fields: list[str] = [] + for field_name, field_meta in schema.items(): + if not isinstance(field_meta, dict): + continue + if field_meta.get("type") == "string": + fields.append(field_name) + return fields + + +def normalize_text(value: Any) -> str: + if value is None: + return "" + if not isinstance(value, str): + value = str(value) + return " ".join(value.split()) + + +def check_field_in_html(field_value: Any, html_text: str) -> dict[str, Any]: + normalized_value = normalize_text(field_value) + normalized_html = normalize_text(html_text) + normalized_text = normalize_text(BeautifulSoup(html_text, "html.parser").get_text(" ", strip=True)) + + if not normalized_value: + return { + "value_present": False, + "raw_html_match": False, + "text_match": False, + "substring_match": False, + "value_len": 0, + } + + return { + "value_present": True, + "raw_html_match": normalized_value in normalized_html, + "text_match": normalized_value in normalized_text, + "substring_match": normalized_value in normalized_html or normalized_value in normalized_text, + "value_len": len(normalized_value), + } + + +def main() -> None: + args = parse_args() + + source_jsonl = Path(args.source_jsonl) + manifest_path = Path(args.manifest) + cluster_manifest_path = Path(args.cluster_manifest) if args.cluster_manifest else None + output_path = Path(args.output) if args.output else None + + source_rows = load_jsonl(source_jsonl) + manifest_rows = load_jsonl(manifest_path) + target_rows, cluster_html_dir, cluster_result_dir, cluster_schema_path = load_target_files( + manifest_rows, cluster_manifest_path + ) + + html_dir_str = args.html_dir or cluster_html_dir + result_dir_str = args.result_dir or cluster_result_dir + schema_json_str = args.schema_json or cluster_schema_path + if not html_dir_str or not result_dir_str: + raise ValueError("html-dir 和 result-dir 不能为空;可直接传参,或通过 cluster-manifest 提供。") + + html_dir = Path(html_dir_str) + result_dir = 
Path(result_dir_str) + schema_json_path = Path(schema_json_str) if schema_json_str else None + + if args.fields is not None: + fields_to_check = args.fields + elif schema_json_path and schema_json_path.exists(): + fields_to_check = derive_fields_from_schema(schema_json_path) + if not fields_to_check: + fields_to_check = ["title", "content"] + else: + fields_to_check = ["title", "content"] + + reports: list[FileReport] = [] + ok_count = 0 + + for manifest_row in target_rows: + filename = manifest_row["filename"] + source_line = manifest_row.get("source_line") + url = manifest_row.get("url") + track_id = manifest_row.get("track_id") + + html_path = html_dir / filename + result_path = result_dir / filename.replace(".html", ".json") + + errors: list[str] = [] + html_exists = html_path.exists() + result_exists = result_path.exists() + source_match = False + html_len_match = False + field_checks: dict[str, dict[str, Any]] = {} + + html_text = "" + if html_exists: + html_text = html_path.read_text(encoding="utf-8") + else: + errors.append(f"missing_html:{html_path}") + + result_data: dict[str, Any] = {} + if result_exists: + result_data = json.loads(result_path.read_text(encoding="utf-8")) + else: + errors.append(f"missing_result:{result_path}") + + if source_line is not None and 1 <= source_line <= len(source_rows): + source_row = source_rows[source_line - 1] + source_match = ( + source_row.get("track_id") == track_id + and source_row.get("url") == url + ) + if not source_match: + errors.append("source_manifest_mismatch") + + source_html = source_row.get("html", "") + html_len_match = len(source_html) == manifest_row.get("html_len") + if not html_len_match: + errors.append("source_manifest_html_len_mismatch") + + if html_exists and len(html_text) != len(source_html): + errors.append("source_html_file_len_mismatch") + elif html_exists and html_text != source_html: + errors.append("source_html_file_content_mismatch") + else: + errors.append("invalid_source_line") + + if 
html_exists and result_exists: + for field in fields_to_check: + field_checks[field] = check_field_in_html(result_data.get(field), html_text) + if field_checks[field]["value_present"] and not field_checks[field]["substring_match"]: + errors.append(f"field_not_found_in_html:{field}") + + ok = not errors + if ok: + ok_count += 1 + + reports.append( + FileReport( + filename=filename, + source_line=source_line, + url=url, + track_id=track_id, + html_exists=html_exists, + result_exists=result_exists, + source_match=source_match, + html_len_match=html_len_match, + field_checks=field_checks, + ok=ok, + errors=errors, + ) + ) + + summary = { + "source_jsonl": str(source_jsonl), + "manifest": str(manifest_path), + "cluster_manifest": str(cluster_manifest_path) if cluster_manifest_path else "", + "html_dir": str(html_dir), + "result_dir": str(result_dir), + "schema_json": str(schema_json_path) if schema_json_path else "", + "fields_checked": fields_to_check, + "total_files": len(reports), + "ok_files": ok_count, + "failed_files": len(reports) - ok_count, + "reports": [asdict(report) for report in reports], + } + + if output_path: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"qa_report: {output_path}") + + print(f"total_files: {summary['total_files']}") + print(f"ok_files: {summary['ok_files']}") + print(f"failed_files: {summary['failed_files']}") + + for report in reports: + status = "OK" if report.ok else "FAIL" + print(f"{status} {report.filename}") + if report.errors: + print(f" errors: {', '.join(report.errors)}") + + +if __name__ == "__main__": + main() From 47e41abafde7ef4f6347e2bde66ffa21f1ae0a21 Mon Sep 17 00:00:00 2001 From: ql101 Date: Tue, 14 Apr 2026 14:07:18 +0800 Subject: [PATCH 2/5] Improve pipeline ergonomics and HTML preprocessing --- README_JUPYTER.md | 195 ++++++++++++++++++++++++++++++ start.sh | 13 +- web2json/simple.py | 28 ++++- 
web2json/tools/html_simplifier.py | 76 ++++++++++++ web2json/utils/llm_client.py | 9 +- 5 files changed, 311 insertions(+), 10 deletions(-) create mode 100644 README_JUPYTER.md diff --git a/README_JUPYTER.md b/README_JUPYTER.md new file mode 100644 index 0000000..101160a --- /dev/null +++ b/README_JUPYTER.md @@ -0,0 +1,195 @@ +# web2json-agent Jupyter Guide + +这个文档专门基于 [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) 来写,目标是在 Jupyter 里直接跑完整 `jsonl -> html -> classify -> schema -> code -> data` 流水线。 + +它不覆盖项目原始 [README.md](/Users/luqing/Downloads/multiModal/web2json-agent/README.md)。 + +## 这份文档对应哪条执行链路 + +这里用的不是最简单的 `extract_data(...)` 单接口方案,而是项目里的完整脚本流水线: + +- 入口脚本: [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) +- Jupyter 包装: [jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/jupyter_helper.py) +- Notebook helper 实现: [notebooks/jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/jupyter_helper.py) +- 示例 notebook: [notebooks/web2json_quickstart.ipynb](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/web2json_quickstart.ipynb) + +## 流水线做了什么 + +脚本会按下面顺序执行: + +1. 读取 `jsonl` +2. 从每条记录里取出 `html` 字段 +3. 拆成一批 `.html` 文件,并生成 `manifest.jsonl` +4. 对 HTML 做 `classify_html_dir` +5. 对每个 cluster 执行 `extract_schema` +6. 执行 `infer_code` +7. 用生成的 parser 执行 `extract_data_with_code` +8. 输出 `pipeline_summary.json` + +适合这种输入数据: + +- 原始数据是 `jsonl` +- 每行是一条网页记录 +- 每条记录里有 `html` 字段 +- 可能还带 `url`、`track_id`、`status` + +## Jupyter 最短路径 + +### 1. 进入项目目录 + +```bash +cd /Users/luqing/Downloads/multiModal/web2json-agent +``` + +### 2. 安装项目 + +请显式使用 `python3.11`,不要用系统默认的旧版 `python3`。 + +```bash +python3.11 -m pip install . +``` + +### 3. 启动 Jupyter + +```bash +python3.11 -m notebook +``` + +或者: + +```bash +python3.11 -m jupyter lab +``` + +### 4. 
打开示例 notebook + +打开: + +`notebooks/web2json_quickstart.ipynb` + +## Notebook 最小示例 + +### Cell 1: 初始化环境 + +```python +from jupyter_helper import prepare_notebook + +prepare_notebook( + api_key="YOUR_API_KEY", + api_base="https://api.openai.com/v1", +) +``` + +### Cell 2: 运行完整 JSONL pipeline + +```python +from jupyter_helper import run_jsonl_pipeline, summarize_pipeline_result + +result = run_jsonl_pipeline( + source_jsonl="ToClassify/sample.json", + work_id="sample_run", + input_root="input_html", + output_root="output", + html_key="html", + iteration_rounds=3, + cluster_limit=1, +) + +summarize_pipeline_result(result) +``` + +### Cell 3: 查看完整结果 + +```python +result.to_dict() +``` + +## 也可以直接调用原脚本 + +如果你不想通过 helper,也可以在 notebook 里直接 import 原脚本里的函数: + +```python +from scripts.run_jsonl_web2json_pipeline import run_jsonl_pipeline + +result = run_jsonl_pipeline( + source_jsonl="ToClassify/sample.json", + work_id="sample_run", +) +``` + +这就是 [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) 里新增的 notebook-friendly 入口。 + +## 参数说明 + +`run_jsonl_pipeline(...)` 主要参数: + +- `source_jsonl`: 源 `jsonl` 路径 +- `work_id`: 这次运行的标识;为空时按文件名自动生成 +- `input_root`: 拆分后 HTML 的输出根目录,默认 `input_html` +- `output_root`: pipeline 输出根目录,默认 `output` +- `html_key`: `jsonl` 中 HTML 字段名,默认 `html` +- `iteration_rounds`: schema 学习轮数上限,默认 `3` +- `cluster_limit`: 最多处理多少个 cluster,默认 `0`,表示全部 + +## 结果会落到哪里 + +如果你设置: + +```python +result = run_jsonl_pipeline( + source_jsonl="ToClassify/sample.json", + work_id="sample_run", +) +``` + +通常会生成: + +- `input_html/sample_run/` +- `output/sample_run_pipeline/` +- `output/sample_run_pipeline/pipeline_summary.json` + +每个 cluster 下面还会有: + +- schema 输出目录 +- code 输出目录 +- data 输出目录 +- 最终 parser 文件 + +## API Key 配置 + +你可以二选一: + +### 方式 A: 在 notebook 里设置 + +```python +from jupyter_helper import prepare_notebook + +prepare_notebook( + api_key="YOUR_API_KEY", + 
api_base="https://api.openai.com/v1", +) +``` + +### 方式 B: 在项目根目录放 `.env` + +```env +OPENAI_API_KEY=YOUR_API_KEY +OPENAI_API_BASE=https://api.openai.com/v1 +DEFAULT_MODEL=gpt-4.1 +``` + +## 已知前提 + +- Python 要求 `>= 3.10` +- 当前这台机器上默认 `python3` 是旧的 `3.7.3` +- 建议始终显式使用 `python3.11` +- 这条流水线依赖模型 API,可用前需要配置好 key/base + +## 相关文件 + +- [README.md](/Users/luqing/Downloads/multiModal/web2json-agent/README.md) +- [README_JUPYTER.md](/Users/luqing/Downloads/multiModal/web2json-agent/README_JUPYTER.md) +- [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) +- [jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/jupyter_helper.py) +- [notebooks/jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/jupyter_helper.py) +- [notebooks/web2json_quickstart.ipynb](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/web2json_quickstart.ipynb) diff --git a/start.sh b/start.sh index f2cd712..f7d735c 100755 --- a/start.sh +++ b/start.sh @@ -3,6 +3,9 @@ # Web2JSON Agent - Startup Script # Starts both the backend API and the frontend UI simultaneously +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PYTHON_BIN="${PYTHON_BIN:-python3.11}" + echo "🚀 Starting Web2JSON Agent..." echo "" @@ -21,14 +24,14 @@ fi # Start the backend echo "📡 Starting backend API (port 8000)..." 
-cd /Users/brown/Projects/AILabProject/web2json-agent +cd "$SCRIPT_DIR" # Create logs directory if it doesn't exist mkdir -p logs # Production Mode: Disable automatic reloading to avoid restarts triggered by changes in the output directory # If you need reload for development, use: --reload --reload-exclude 'output/**' --reload-exclude 'logs/**' -uvicorn web2json_api.main:app --host 0.0.0.0 --port 8000 \ +"$PYTHON_BIN" -m uvicorn web2json_api.main:app --host 0.0.0.0 --port 8000 \ --reload-exclude 'output/**' \ --reload-exclude 'logs/**' \ --reload-exclude '*.log' \ @@ -49,7 +52,7 @@ fi # Start the frontend echo "" echo "🎨 Starting frontend UI (port 5173)..." -cd web2json_ui && npm run dev > ../logs/ui.log 2>&1 & +cd "$SCRIPT_DIR/web2json_ui" && npm run dev > "$SCRIPT_DIR/logs/ui.log" 2>&1 & FRONTEND_PID=$! echo " Frontend PID: $FRONTEND_PID" @@ -70,8 +73,8 @@ echo "Or press Ctrl+C and run: pkill -f 'uvicorn|vite'" echo "" # Save PID -echo $BACKEND_PID > .backend.pid -echo $FRONTEND_PID > .frontend.pid +echo $BACKEND_PID > "$SCRIPT_DIR/.backend.pid" +echo $FRONTEND_PID > "$SCRIPT_DIR/.frontend.pid" # Wait for user interruption wait diff --git a/web2json/simple.py b/web2json/simple.py index 235d7c5..95b8ebe 100644 --- a/web2json/simple.py +++ b/web2json/simple.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Optional, Dict, List, Any from dataclasses import dataclass, asdict +import numpy as np from loguru import logger from web2json.agent import ParserAgent @@ -1050,15 +1051,39 @@ def classify_html_dir(config: Web2JsonConfig) -> ClusterResult: # 执行聚类分析 logger.info("正在进行布局聚类分析...") from web2json.tools.cluster import cluster_html_layouts_optimized + from web2json.tools.html_layout_cosin import get_feature + + valid_html_files = [] + valid_html_contents = [] + invalid_html_files = [] + for file_path, html_content in zip(html_files, html_contents): + try: + feature = get_feature(html_content) + except Exception as e: + logger.warning(f" 跳过布局特征提取失败页面: 
{file_path} ({e})") + invalid_html_files.append(file_path) + continue + if not feature: + logger.warning(f" 跳过无有效布局特征页面: {file_path}") + invalid_html_files.append(file_path) + continue + valid_html_files.append(file_path) + valid_html_contents.append(html_content) + + if not valid_html_contents: + raise Exception("聚类失败: 没有可用于布局聚类的有效HTML页面") try: labels, sim_mat, clusters = cluster_html_layouts_optimized( - html_contents, + valid_html_contents, use_knn_graph=True ) except Exception as e: raise Exception(f"聚类失败: {e}") + label_map = {file_path: int(label) for file_path, label in zip(valid_html_files, labels)} + labels = np.array([label_map.get(file_path, -1) for file_path in html_files], dtype=int) + # 统计聚类结果 unique_labels = sorted(set(labels)) noise_count = sum(1 for l in labels if l == -1) @@ -1160,4 +1185,3 @@ def classify_html_dir(config: Web2JsonConfig) -> ClusterResult: noise_files=noise_files, cluster_count=cluster_count ) - diff --git a/web2json/tools/html_simplifier.py b/web2json/tools/html_simplifier.py index 33f34ae..1a2581c 100644 --- a/web2json/tools/html_simplifier.py +++ b/web2json/tools/html_simplifier.py @@ -265,6 +265,77 @@ def remove_empty_tags( return root +def _contains_token_attr(element: html.HtmlElement, attr_name: str, patterns: List[str]) -> bool: + """检查元素指定属性中是否包含目标模式。""" + attr_value = element.get(attr_name, '') + if not attr_value: + return False + + attr_value = attr_value.lower() + return any(pattern in attr_value for pattern in patterns) + + +def is_sharepoint_html(html_str: str) -> bool: + """粗略识别 SharePoint 页面。""" + markers = [ + 'microsoft sharepoint', + '_sppagecontextinfo', + 's4-workspace', + '_layouts/15', + ] + html_lower = html_str.lower() + return any(marker in html_lower for marker in markers) + + +def remove_sharepoint_noise(root: html.HtmlElement) -> html.HtmlElement: + """ + 删除 SharePoint 门户模板中的高噪音区域。 + + 主要清理全站导航、页眉页脚、社媒区、noindex 容器和 mega menu。 + """ + class_patterns = [ + 'noindex', + 'mega-menu', + 'mega-sub-menu', + 
'mega-menu-wrap', + 'mega-menu-toggle', + 'top-header', + 'button-close-top-header', + 'social-media-header', + 'breadcrumbs', + 'breadcrumb', + 'ms-csrlistview-controldiv', + ] + id_patterns = [ + 'top-header', + 'top-menu', + 'main-menu', + 'mega-menu', + 'social-media-header', + 'navigationmenu', + 'footer', + 'ctl00_placeholdersitenamen', + ] + tag_names = {'header', 'footer', 'nav'} + + remove_targets = [] + for element in root.iter(): + tag = str(element.tag).lower() if hasattr(element, 'tag') else '' + if tag in tag_names: + remove_targets.append(element) + continue + if _contains_token_attr(element, 'class', class_patterns): + remove_targets.append(element) + continue + if _contains_token_attr(element, 'id', id_patterns): + remove_targets.append(element) + + # 去重,避免重复删除同一元素 + unique_targets = list(dict.fromkeys(remove_targets)) + remove_reversely(unique_targets) + return root + + def clean_attributes( root: html.HtmlElement, keep_attrs: List[str] = None @@ -440,6 +511,11 @@ def simplify_html( clean_attrs=True, keep_attrs=keep_attrs_list ) + if is_sharepoint_html(html_str): + simplified_root = html_to_element(result) + simplified_root = remove_sharepoint_noise(simplified_root) + simplified_root = remove_empty_tags(simplified_root) + result = element_to_html(simplified_root) # 根据aggressive参数选择模式 elif aggressive: # 激进模式:删除所有无用内容 diff --git a/web2json/utils/llm_client.py b/web2json/utils/llm_client.py index 59913a8..46c62e7 100644 --- a/web2json/utils/llm_client.py +++ b/web2json/utils/llm_client.py @@ -17,9 +17,9 @@ env_path = project_root / ".env" load_dotenv(env_path) -# 验证 -if not os.getenv("OPENAI_API_KEY"): - raise ValueError(f".env 文件路径: {env_path}, API Key未加载") +# 验证(延迟到实际使用时) +_api_key_missing = not os.getenv("OPENAI_API_KEY") +_env_path_for_error = env_path # 定义场景类型 ScenarioType = Literal["default", "code_gen", "agent"] @@ -81,6 +81,9 @@ def __init__( if self._initialized: return + if _api_key_missing and not os.getenv("OPENAI_API_KEY"): + raise 
ValueError(f".env 文件路径: {_env_path_for_error}, API Key未加载") + self.api_key = api_key or settings.openai_api_key self.api_base = api_base or settings.openai_api_base self.model = model or settings.default_model From 6ad88f9710486cd9d828ff9d1168aaeecf9b302a Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Apr 2026 14:18:32 +0800 Subject: [PATCH 3/5] feat: jsonl pipeline retries, stats, and site-wide token aggregation - Add LLM invoke retry with exponential backoff and llm_retry_stats in pipeline summary - Extend settings for LLM timeout and retry env vars - Schema merge failures now record error in phase result; schema_extraction uses shared retry - run_jsonl_web2json_pipeline: merge-summary from disk when no prior summary, --only-failed, per-cluster and pipeline elapsed time - Add aggregate_site_pipeline_stats.py to sum token usage and time across jsonl pipeline outputs Made-with: Cursor --- scripts/aggregate_site_pipeline_stats.py | 223 +++++++++++++++ scripts/run_jsonl_web2json_pipeline.py | 343 ++++++++++++++++++++--- web2json/agent/phases/schema_phase.py | 5 +- web2json/config/settings.py | 22 ++ web2json/tools/schema_extraction.py | 22 +- web2json/utils/llm_client.py | 13 +- web2json/utils/llm_retry.py | 135 +++++++++ 7 files changed, 708 insertions(+), 55 deletions(-) create mode 100644 scripts/aggregate_site_pipeline_stats.py create mode 100644 web2json/utils/llm_retry.py diff --git a/scripts/aggregate_site_pipeline_stats.py b/scripts/aggregate_site_pipeline_stats.py new file mode 100644 index 0000000..61a06e5 --- /dev/null +++ b/scripts/aggregate_site_pipeline_stats.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 +""" +汇总「某目录下全部 jsonl」各自对应流水线目录中的统计: +- LLM token(来自各 pipeline_summary.json 的 total_token_usage) +- 时间(各次 pipeline_elapsed_seconds 之和;为脚本侧计时的簇耗时之和) +- 可选:llm_retry_stats 累计 + +work_id 规则与 run_jsonl_web2json_pipeline.slugify(jsonl 文件名不含后缀) 一致。 +""" +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib 
import Path +from typing import Any + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + + +def slugify(value: str) -> str: + value = re.sub(r"[^a-zA-Z0-9]+", "_", value).strip("_").lower() + return value or "run" + + +def pipeline_extract_complete(pipeline_root: Path) -> tuple[bool, str]: + """各 cluster 下 HTML 数是否与 extract_data/result 中 JSON 数一致。""" + clusters_dir = pipeline_root / "classify" / "clusters" + if not clusters_dir.is_dir(): + return False, f"缺少目录: {clusters_dir}" + for cluster_dir in sorted(clusters_dir.iterdir()): + if not cluster_dir.is_dir() or not cluster_dir.name.startswith("cluster_"): + continue + cname = cluster_dir.name + n_html = len(list(cluster_dir.glob("*.html"))) + len(list(cluster_dir.glob("*.htm"))) + rd = pipeline_root / f"{cname}_extract_data" / "result" + n_json = len(list(rd.glob("*.json"))) if rd.is_dir() else 0 + if n_html != n_json: + return False, f"{cname}: html={n_html} json={n_json}" + return True, "" + + +def load_summary(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="汇总某目录下所有 jsonl 对应 pipeline 的 token / 时间 / retry。" + ) + parser.add_argument( + "site_dir", + nargs="?", + default="Prod/ms-web-jwn", + help="包含 *.jsonl 的目录(相对项目根或绝对路径),默认 Prod/ms-web-jwn", + ) + parser.add_argument( + "--output-root", + default="output", + help="流水线输出根目录,默认 output", + ) + parser.add_argument( + "--strict", + action="store_true", + help="除存在 pipeline_summary 外,还校验各簇 HTML 数与 result JSON 数一致", + ) + parser.add_argument( + "--allow-partial", + action="store_true", + help="有缺失或未通过 strict 时仍打印汇总且退出码为 0(默认识别到问题则退出 1)", + ) + parser.add_argument( + "--json", + action="store_true", + dest="json_out", + help="输出完整 JSON(便于脚本解析)", + ) + args = parser.parse_args() + + site_dir = Path(args.site_dir).expanduser() + if not site_dir.is_absolute(): + site_dir = 
(PROJECT_ROOT / site_dir).resolve() + if not site_dir.is_dir(): + raise SystemExit(f"目录不存在: {site_dir}") + + output_root = Path(args.output_root).expanduser() + if not output_root.is_absolute(): + output_root = (PROJECT_ROOT / output_root).resolve() + + jsonl_files = sorted(site_dir.glob("*.jsonl")) + if not jsonl_files: + raise SystemExit(f"目录下无 *.jsonl: {site_dir}") + + rows: list[dict[str, Any]] = [] + tot_in = tot_out = tot_tok = 0 + tot_req = 0 + tot_elapsed = 0.0 + tot_retry = 0 + errors: list[str] = [] + + for jp in jsonl_files: + work_id = slugify(jp.stem) + pr = output_root / f"{work_id}_pipeline" + sp = pr / "pipeline_summary.json" + row: dict[str, Any] = { + "jsonl": jp.name, + "work_id": work_id, + "pipeline_root": str(pr), + } + if not sp.is_file(): + row["error"] = "missing pipeline_summary.json" + errors.append(f"{jp.name}: 无 {sp}") + rows.append(row) + continue + + try: + summary = load_summary(sp) + except json.JSONDecodeError as e: + row["error"] = f"invalid json: {e}" + errors.append(f"{jp.name}: {e}") + rows.append(row) + continue + + if args.strict: + ok, msg = pipeline_extract_complete(pr) + if not ok: + row["error"] = f"incomplete extract: {msg}" + errors.append(f"{jp.name}: {msg}") + rows.append(row) + continue + + usage = summary.get("total_token_usage") or {} + if isinstance(usage, dict): + tot_in += int(usage.get("total_input_tokens", 0) or 0) + tot_out += int(usage.get("total_completion_tokens", 0) or 0) + tot_tok += int(usage.get("total_tokens", 0) or 0) + tot_req += int(usage.get("request_count", 0) or 0) + + elapsed = summary.get("pipeline_elapsed_seconds") + if elapsed is not None: + tot_elapsed += float(elapsed) + + retry_stats = summary.get("llm_retry_stats") or {} + if isinstance(retry_stats, dict): + tot_retry += int(retry_stats.get("llm_retry_events", 0) or 0) + + row["total_input_tokens"] = usage.get("total_input_tokens", 0) if isinstance(usage, dict) else 0 + row["total_completion_tokens"] = ( + 
usage.get("total_completion_tokens", 0) if isinstance(usage, dict) else 0 + ) + row["total_tokens"] = usage.get("total_tokens", 0) if isinstance(usage, dict) else 0 + row["request_count"] = usage.get("request_count", 0) if isinstance(usage, dict) else 0 + row["pipeline_elapsed_seconds"] = float(elapsed) if elapsed is not None else None + row["llm_retry_events"] = retry_stats.get("llm_retry_events", 0) if isinstance(retry_stats, dict) else 0 + rows.append(row) + + aggregate = { + "site_dir": str(site_dir), + "output_root": str(output_root), + "jsonl_count": len(jsonl_files), + "ok_count": sum(1 for r in rows if "error" not in r), + "error_count": sum(1 for r in rows if "error" in r), + "total_token_usage": { + "request_count": tot_req, + "total_input_tokens": tot_in, + "total_completion_tokens": tot_out, + "total_tokens": tot_tok, + }, + "pipeline_elapsed_seconds_sum": round(tot_elapsed, 3), + "llm_retry_events_sum": tot_retry, + "rows": rows, + } + + if args.json_out: + print(json.dumps(aggregate, ensure_ascii=False, indent=2)) + else: + print(f"目录: {site_dir}") + print(f"输出根: {output_root}") + print(f"jsonl 数: {len(jsonl_files)} 成功汇总: {aggregate['ok_count']} 失败/跳过: {aggregate['error_count']}") + print() + hdr = f"{'jsonl':<56} {'input':>10} {'output':>10} {'total':>10} {'秒':>10} {'retry':>6}" + print(hdr) + print("-" * len(hdr)) + for r in rows: + if "error" in r: + print(f"{r['jsonl']:<56} ERROR: {r['error']}") + else: + print( + f"{r['jsonl']:<56} " + f"{r['total_input_tokens']:>10} " + f"{r['total_completion_tokens']:>10} " + f"{r['total_tokens']:>10} " + f"{r['pipeline_elapsed_seconds'] or 0:>10.3f} " + f"{r.get('llm_retry_events', 0):>6}" + ) + print("-" * len(hdr)) + print( + f"{'合计':<56} " + f"{tot_in:>10} " + f"{tot_out:>10} " + f"{tot_tok:>10} " + f"{tot_elapsed:>10.3f} " + f"{tot_retry:>6}" + ) + print() + print( + "说明: 「秒」为各次 pipeline 的 pipeline_elapsed_seconds 之和(簇耗时相加);" + "若某次 summary 无该字段则按 0。" + ) + if errors: + print("\n问题:") + for e in errors: + 
print(f" - {e}") + + if aggregate["error_count"] and not args.allow_partial: + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/run_jsonl_web2json_pipeline.py b/scripts/run_jsonl_web2json_pipeline.py index bf7ff33..d91d7b0 100644 --- a/scripts/run_jsonl_web2json_pipeline.py +++ b/scripts/run_jsonl_web2json_pipeline.py @@ -15,9 +15,10 @@ import json import re import sys +import time from dataclasses import asdict, dataclass from pathlib import Path -from typing import Any +from typing import Any, Optional PROJECT_ROOT = Path(__file__).resolve().parents[1] if str(PROJECT_ROOT) not in sys.path: @@ -31,6 +32,7 @@ extract_data_with_code, ) from web2json.utils.llm_client import LLMClient +from web2json.utils.llm_retry import get_retry_stats, reset_retry_stats @dataclass @@ -50,7 +52,12 @@ def to_dict(self) -> dict[str, Any]: def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run web2json pipeline on a crawl jsonl.") - parser.add_argument("--source-jsonl", required=True, help="源 jsonl 文件路径。") + src = parser.add_mutually_exclusive_group(required=True) + src.add_argument("--source-jsonl", help="单个源 jsonl 文件路径。") + src.add_argument( + "--source-dir", + help="目录下所有 *.jsonl 依次全量跑流水线(与 --source-jsonl 二选一)。", + ) parser.add_argument( "--work-id", default="", @@ -83,45 +90,108 @@ def parse_args() -> argparse.Namespace: default=0, help="最多处理多少个 cluster,0 表示全部处理。", ) + parser.add_argument( + "--fields", + default="", + help='预定义要抽取的字段,逗号分隔,类型均为 string,如 "title,content"。为空则走 auto schema。', + ) + parser.add_argument( + "--max-jsonl-files", + type=int, + default=0, + help="与 --source-dir 联用:最多处理前 N 个 jsonl(按文件名排序),0 表示不限制。", + ) + parser.add_argument( + "--schema-json", + default="", + help="Predefined schema JSON 文件路径(与 README Predefined Mode 一致),优先级高于 --fields。", + ) + parser.add_argument( + "--skip-manifest", + action="store_true", + help="不重新从 jsonl 拆 HTML/manifest(需已存在 input_root/work_id/)。", + ) + 
parser.add_argument( + "--skip-classify", + action="store_true", + help="不重新聚类,直接复用 pipeline_root/classify/clusters/(补跑失败簇时用)。", + ) + parser.add_argument( + "--only-clusters", + default="", + help='只处理指定簇,逗号分隔,如 "cluster_4,cluster_5"。为空表示全部簇。', + ) + parser.add_argument( + "--merge-summary", + action="store_true", + help="写回 pipeline_summary.json 时与已有 summary 按 cluster_name 合并(补跑时用)。", + ) + parser.add_argument( + "--only-failed", + action="store_true", + help="仅补跑「HTML 数量 > result 下 JSON 数量」的簇(需已有 classify/extract 目录;可与 --only-clusters 叠加)。", + ) return parser.parse_args() +def load_schema_json(path: str) -> Optional[dict[str, Any]]: + if not path or not path.strip(): + return None + p = Path(path).expanduser() + if not p.is_absolute(): + p = PROJECT_ROOT / p + if not p.is_file(): + raise SystemExit(f"--schema-json not found: {p}") + data = json.loads(p.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise SystemExit("--schema-json root must be a JSON object") + return data + + +def fields_to_schema(fields_csv: str) -> Optional[dict[str, str]]: + if not fields_csv or not fields_csv.strip(): + return None + out: dict[str, str] = {} + for part in fields_csv.split(","): + name = part.strip() + if name: + out[name] = "string" + return out or None + + def slugify(value: str) -> str: value = re.sub(r"[^a-zA-Z0-9]+", "_", value).strip("_").lower() return value or "run" -def load_jsonl(path: Path) -> list[tuple[int, dict[str, Any]]]: - rows: list[tuple[int, dict[str, Any]]] = [] - with path.open("r", encoding="utf-8") as fp: - for line_no, line in enumerate(fp, start=1): +def build_html_manifest(source_jsonl: Path, output_dir: Path, html_key: str) -> Path: + """逐行流式读取 jsonl,避免大文件一次性载入内存。""" + output_dir.mkdir(parents=True, exist_ok=True) + manifest_path = output_dir / "manifest.jsonl" + idx = 0 + + with source_jsonl.open("r", encoding="utf-8") as src_fp, manifest_path.open( + "w", encoding="utf-8" + ) as manifest_fp: + for line_no, line in 
enumerate(src_fp, start=1): line = line.strip() if not line: continue try: - rows.append((line_no, json.loads(line))) + row = json.loads(line) except json.JSONDecodeError as exc: print(f"skip invalid json line {line_no}: {exc}") - return rows - - -def build_html_manifest(source_jsonl: Path, output_dir: Path, html_key: str) -> Path: - rows = load_jsonl(source_jsonl) - output_dir.mkdir(parents=True, exist_ok=True) - manifest_path = output_dir / "manifest.jsonl" - - with manifest_path.open("w", encoding="utf-8") as manifest_fp: - for idx, (source_line, row) in enumerate(rows, start=1): + continue html = row.get(html_key) if not isinstance(html, str) or not html.strip(): continue - + idx += 1 filename = f"{idx:04d}.html" (output_dir / filename).write_text(html, encoding="utf-8") manifest_row = { "sample_no": idx, - "source_line": source_line, + "source_line": line_no, "filename": filename, "track_id": row.get("track_id"), "url": row.get("url"), @@ -141,6 +211,60 @@ def usage_delta(before: dict[str, int], after: dict[str, int]) -> dict[str, int] } +def _synthetic_clusters_from_completed_extract( + pipeline_root: Path, current_names: set[str] +) -> list[dict[str, Any]]: + """无 pipeline_summary.json 时,从已有 cluster_*_extract_data/result 推断已完成的簇。""" + out: list[dict[str, Any]] = [] + for ed in sorted(pipeline_root.glob("cluster_*_extract_data")): + cname = ed.name[: -len("_extract_data")] + if cname in current_names: + continue + rd = ed / "result" + if not rd.is_dir(): + continue + njson = len(list(rd.glob("*.json"))) + if njson == 0: + continue + out.append( + { + "cluster_name": cname, + "cluster_size": njson, + "data_output": str(ed), + "note": "from_disk_merge (补跑合并时从结果目录推断)", + } + ) + return out + + +def _discover_clusters_from_disk(clusters_dir: Path) -> list[str]: + if not clusters_dir.is_dir(): + return [] + names = sorted( + p.name + for p in clusters_dir.iterdir() + if p.is_dir() and p.name.startswith("cluster_") + ) + return names + + +def 
_underextracted_cluster_names(pipeline_root: Path, clusters_dir: Path) -> list[str]: + """classify 中 HTML 数量大于对应 extract_data/result 中 JSON 数量的簇。""" + failed: list[str] = [] + if not clusters_dir.is_dir(): + return failed + for cluster_dir in sorted(clusters_dir.iterdir()): + if not cluster_dir.is_dir() or not cluster_dir.name.startswith("cluster_"): + continue + cname = cluster_dir.name + n_html = len(list(cluster_dir.glob("*.html"))) + len(list(cluster_dir.glob("*.htm"))) + rd = pipeline_root / f"{cname}_extract_data" / "result" + n_json = len(list(rd.glob("*.json"))) if rd.is_dir() else 0 + if n_html > n_json: + failed.append(cname) + return failed + + def run_jsonl_pipeline( source_jsonl: str, work_id: str = "", @@ -149,6 +273,12 @@ def run_jsonl_pipeline( html_key: str = "html", iteration_rounds: int = 3, cluster_limit: int = 0, + schema: Optional[dict[str, Any]] = None, + skip_manifest: bool = False, + skip_classify: bool = False, + only_clusters: Optional[list[str]] = None, + merge_summary: bool = False, + only_failed: bool = False, ) -> PipelineRunResult: source_jsonl_path = Path(source_jsonl).expanduser() if not source_jsonl_path.is_absolute(): @@ -172,33 +302,75 @@ def run_jsonl_pipeline( print(f"work_id: {work_id}") print(f"html_dir: {html_dir}") print(f"pipeline_root: {pipeline_root}") - - manifest_path = build_html_manifest(source_jsonl_path, html_dir, html_key) - print(f"manifest: {manifest_path}") - - classify_config = Web2JsonConfig( - name="classify", - html_path=str(html_dir), - output_path=str(pipeline_root), - save=["report", "files"], - ) - classify_result = classify_html_dir(classify_config) + if schema: + print(f"predefined schema fields: {list(schema.keys())}") + + manifest_path = html_dir / "manifest.jsonl" + if skip_manifest: + if not manifest_path.is_file(): + raise SystemExit(f"--skip-manifest requires existing {manifest_path}") + print(f"manifest (reuse): {manifest_path}") + else: + manifest_path = build_html_manifest(source_jsonl_path, 
html_dir, html_key) + print(f"manifest: {manifest_path}") clusters_dir = pipeline_root / "classify" / "clusters" - cluster_names = sorted(classify_result.clusters.keys()) + classify_result = None + + if skip_classify: + cluster_names = _discover_clusters_from_disk(clusters_dir) + if not cluster_names: + raise SystemExit(f"--skip-classify requires non-empty {clusters_dir}") + print(f"classify (reuse): {len(cluster_names)} clusters under {clusters_dir}") + else: + classify_config = Web2JsonConfig( + name="classify", + html_path=str(html_dir), + output_path=str(pipeline_root), + save=["report", "files"], + ) + classify_result = classify_html_dir(classify_config) + cluster_names = sorted(classify_result.clusters.keys()) + + if only_clusters: + want = {x.strip() for x in only_clusters if x.strip()} + cluster_names = [c for c in cluster_names if c in want] + missing = want - set(cluster_names) + if missing: + raise SystemExit(f"--only-clusters not found on disk: {sorted(missing)}") + if not cluster_names: + raise SystemExit("--only-clusters filtered out all clusters") + + if only_failed: + under = _underextracted_cluster_names(pipeline_root, clusters_dir) + under_set = set(under) + cluster_names = [c for c in cluster_names if c in under_set] + print(f"only-failed: 未跑满簇 {under} → 本次处理 {cluster_names}") + if not cluster_names: + print("当前没有需要补跑的 cluster(各簇 JSON 数量已不少于 HTML)。") + raise SystemExit(0) + if cluster_limit: cluster_names = cluster_names[:cluster_limit] LLMClient.reset_usage() + reset_retry_stats() cluster_summaries: list[dict[str, Any]] = [] for cluster_name in cluster_names: cluster_html_dir = clusters_dir / cluster_name - cluster_files = classify_result.clusters[cluster_name] - cluster_size = len(cluster_files) + if classify_result is not None: + cluster_files = classify_result.clusters[cluster_name] + cluster_size = len(cluster_files) + else: + cluster_files = sorted(cluster_html_dir.glob("*.html")) + sorted( + cluster_html_dir.glob("*.htm") + ) + 
cluster_size = len(cluster_files) rounds = min(iteration_rounds, cluster_size) print(f"\n=== {cluster_name} ({cluster_size} files) ===") + cluster_t0 = time.perf_counter() before_schema = LLMClient.get_total_usage() schema_result = extract_schema( @@ -208,6 +380,7 @@ def run_jsonl_pipeline( output_path=str(pipeline_root), iteration_rounds=rounds, save=["schema"], + schema=schema, ) ) after_schema = LLMClient.get_total_usage() @@ -220,6 +393,7 @@ def run_jsonl_pipeline( output_path=str(pipeline_root), schema=schema_result.final_schema, save=["schema", "code"], + iteration_rounds=rounds, ) ) after_code = LLMClient.get_total_usage() @@ -238,6 +412,7 @@ def run_jsonl_pipeline( cluster_summary = { "cluster_name": cluster_name, "cluster_size": cluster_size, + "elapsed_seconds": round(time.perf_counter() - cluster_t0, 3), "html_dir": str(cluster_html_dir), "schema_output": str(pipeline_root / f"{cluster_name}_schema"), "code_output": str(pipeline_root / f"{cluster_name}_code"), @@ -252,20 +427,53 @@ def run_jsonl_pipeline( cluster_summaries.append(cluster_summary) total_usage = LLMClient.get_total_usage() - summary = { + summary_path = pipeline_root / "pipeline_summary.json" + run_elapsed = sum(c.get("elapsed_seconds", 0) for c in cluster_summaries if isinstance(c, dict)) + cluster_count_total = len(_discover_clusters_from_disk(clusters_dir)) if clusters_dir.is_dir() else len(cluster_names) + + summary: dict[str, Any] = { "source_jsonl": str(source_jsonl_path), "manifest": str(manifest_path), "html_dir": str(html_dir), "pipeline_root": str(pipeline_root), - "cluster_count": len(cluster_names), + "cluster_count": cluster_count_total, "clusters": cluster_summaries, "total_token_usage": total_usage, + "pipeline_elapsed_seconds": round(run_elapsed, 3), + "llm_retry_stats": get_retry_stats(), } - summary_path = pipeline_root / "pipeline_summary.json" + if merge_summary: + current_names = {c["cluster_name"] for c in cluster_summaries} + by_name: dict[str, Any] = {} + if 
summary_path.is_file(): + try: + prev = json.loads(summary_path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + prev = {} + prev_clusters = prev.get("clusters") or [] + if isinstance(prev_clusters, list): + by_name = { + c.get("cluster_name"): c + for c in prev_clusters + if isinstance(c, dict) and c.get("cluster_name") + } + else: + for c in _synthetic_clusters_from_completed_extract(pipeline_root, current_names): + by_name[c["cluster_name"]] = c + for c in cluster_summaries: + by_name[c["cluster_name"]] = c + if by_name: + summary["clusters"] = [by_name[k] for k in sorted(by_name.keys())] + summary["cluster_count"] = len( + _discover_clusters_from_disk(clusters_dir) + ) or len(summary["clusters"]) + # total_token_usage 仅为本次运行累计(补跑时不会与历史相加,避免重复计算) + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") print(f"\nsummary: {summary_path}") print(json.dumps(total_usage, ensure_ascii=False, indent=2)) + print(json.dumps(get_retry_stats(), ensure_ascii=False, indent=2)) return PipelineRunResult( source_jsonl=summary["source_jsonl"], @@ -281,16 +489,63 @@ def run_jsonl_pipeline( def main() -> None: args = parse_args() - run_jsonl_pipeline( - source_jsonl=args.source_jsonl, - work_id=args.work_id, - input_root=args.input_root, - output_root=args.output_root, - html_key=args.html_key, - iteration_rounds=args.iteration_rounds, - cluster_limit=args.cluster_limit, + schema: Optional[dict[str, Any]] = None + if args.schema_json: + schema = load_schema_json(args.schema_json) + else: + schema = fields_to_schema(args.fields) + + only = None + if args.only_clusters.strip(): + only = [x.strip() for x in args.only_clusters.split(",") if x.strip()] + + if args.skip_classify and not args.skip_manifest: + print("提示: --skip-classify 通常与 --skip-manifest 一起用,避免重复从大 jsonl 拆 HTML。") + + extra_kw = dict( + skip_manifest=args.skip_manifest, + skip_classify=args.skip_classify, + only_clusters=only, + merge_summary=args.merge_summary, + 
only_failed=args.only_failed, ) + if args.source_dir: + dir_path = Path(args.source_dir).expanduser() + if not dir_path.is_absolute(): + dir_path = (PROJECT_ROOT / dir_path).resolve() + jsonl_files = sorted(dir_path.glob("*.jsonl")) + if args.max_jsonl_files: + jsonl_files = jsonl_files[: args.max_jsonl_files] + if not jsonl_files: + raise SystemExit(f"no *.jsonl under {dir_path}") + print(f"batch mode: {len(jsonl_files)} file(s) under {dir_path}") + for i, jp in enumerate(jsonl_files, 1): + print(f"\n{'='*60}\n[{i}/{len(jsonl_files)}] {jp.name}\n{'='*60}") + run_jsonl_pipeline( + source_jsonl=str(jp), + work_id="", + input_root=args.input_root, + output_root=args.output_root, + html_key=args.html_key, + iteration_rounds=args.iteration_rounds, + cluster_limit=args.cluster_limit, + schema=schema, + **extra_kw, + ) + else: + run_jsonl_pipeline( + source_jsonl=args.source_jsonl, + work_id=args.work_id, + input_root=args.input_root, + output_root=args.output_root, + html_key=args.html_key, + iteration_rounds=args.iteration_rounds, + cluster_limit=args.cluster_limit, + schema=schema, + **extra_kw, + ) + if __name__ == "__main__": main() diff --git a/web2json/agent/phases/schema_phase.py b/web2json/agent/phases/schema_phase.py index 5deb008..7c0923e 100644 --- a/web2json/agent/phases/schema_phase.py +++ b/web2json/agent/phases/schema_phase.py @@ -149,6 +149,7 @@ def execute(self, html_files: List[str]) -> Dict[str, Any]: if not schema_results: logger.error("没有成功处理的Schema") + result["error"] = "没有成功处理的Schema" return result # ============ 构建轮次结果 ============ @@ -202,8 +203,10 @@ def execute(self, html_files: List[str]) -> Dict[str, Any]: self.progress_callback("schema_merge", "Schema合并完成", 35) except Exception as e: - logger.error(f"合并多个Schema失败: {str(e)}") + err_text = str(e) + logger.error(f"合并多个Schema失败: {err_text}") import traceback logger.debug(traceback.format_exc()) + result["error"] = f"Schema合并失败: {err_text}" return result diff --git a/web2json/config/settings.py 
b/web2json/config/settings.py index 82d22a3..5da41b4 100644 --- a/web2json/config/settings.py +++ b/web2json/config/settings.py @@ -19,6 +19,13 @@ break +def _env_optional_float(name: str) -> Optional[float]: + raw = os.getenv(name) + if raw is None or str(raw).strip() == "": + return None + return float(raw) + + class Settings(BaseModel): """全局配置""" @@ -28,6 +35,21 @@ class Settings(BaseModel): openai_api_key: str = Field(default_factory=lambda: os.getenv("OPENAI_API_KEY", "")) openai_api_base: str = Field(default_factory=lambda: os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1")) + # LLM 请求:超时与失败重试(网关 502/503、超时、限流等) + llm_request_timeout: Optional[float] = Field( + default_factory=lambda: _env_optional_float("LLM_REQUEST_TIMEOUT") + ) + # 最大尝试次数(含首次请求),例如 6 表示首次失败后最多再试 5 次 + llm_api_retry_max_attempts: int = Field( + default_factory=lambda: int(os.getenv("LLM_API_RETRY_MAX_ATTEMPTS", "6")) + ) + llm_api_retry_base_seconds: float = Field( + default_factory=lambda: float(os.getenv("LLM_API_RETRY_BASE_SECONDS", "1.0")) + ) + llm_api_retry_max_seconds: float = Field( + default_factory=lambda: float(os.getenv("LLM_API_RETRY_MAX_SECONDS", "60.0")) + ) + # ============================================ # 模型配置 # ============================================ diff --git a/web2json/tools/schema_extraction.py b/web2json/tools/schema_extraction.py index a4ff62e..ee6dafd 100644 --- a/web2json/tools/schema_extraction.py +++ b/web2json/tools/schema_extraction.py @@ -12,6 +12,7 @@ from web2json.config.settings import settings from web2json.prompts.schema_extraction import SchemaExtractionPrompts from web2json.prompts.schema_merge import SchemaMergePrompts +from web2json.utils.llm_retry import chat_openai_invoke_kwargs, invoke_with_retry def _parse_llm_response(response: str) -> Dict: @@ -103,7 +104,8 @@ def extract_schema_from_html(html_content: str) -> Dict: model=settings.default_model, api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_API_BASE"), - 
temperature=0.1 + temperature=0.1, + **chat_openai_invoke_kwargs(), ) messages = [ @@ -111,7 +113,9 @@ def extract_schema_from_html(html_content: str) -> Dict: {"role": "user", "content": f"{prompt}\n\n## HTML内容\n\n```html\n{html_content[:50000]}\n```"} ] - response = model.invoke(messages) + response = invoke_with_retry( + "extract_schema_from_html", lambda: model.invoke(messages) + ) # 3. 解析响应 if hasattr(response, 'content'): @@ -154,7 +158,8 @@ def merge_multiple_schemas(schemas: List[Dict]) -> Dict: model=settings.default_model, api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_API_BASE"), - temperature=0.1 + temperature=0.1, + **chat_openai_invoke_kwargs(), ) messages = [ @@ -162,7 +167,9 @@ def merge_multiple_schemas(schemas: List[Dict]) -> Dict: {"role": "user", "content": prompt} ] - response = model.invoke(messages) + response = invoke_with_retry( + "merge_multiple_schemas", lambda: model.invoke(messages) + ) # 3. 解析响应 if hasattr(response, 'content'): @@ -223,7 +230,8 @@ def enrich_schema_with_xpath(schema_template: Dict, html_content: str) -> Dict: model=settings.default_model, api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_API_BASE"), - temperature=0.1 + temperature=0.1, + **chat_openai_invoke_kwargs(), ) messages = [ @@ -231,7 +239,9 @@ def enrich_schema_with_xpath(schema_template: Dict, html_content: str) -> Dict: {"role": "user", "content": user_message} ] - response = model.invoke(messages) + response = invoke_with_retry( + "enrich_schema_with_xpath", lambda: model.invoke(messages) + ) # 4. 
解析响应 if hasattr(response, 'content'): diff --git a/web2json/utils/llm_client.py b/web2json/utils/llm_client.py index 46c62e7..4e2a3ed 100644 --- a/web2json/utils/llm_client.py +++ b/web2json/utils/llm_client.py @@ -11,6 +11,7 @@ from langchain_openai import ChatOpenAI from loguru import logger from web2json.config.settings import settings +from web2json.utils.llm_retry import chat_openai_invoke_kwargs, invoke_with_retry # 加载项目根目录的 .env 文件 project_root = Path(__file__).parent.parent @@ -96,12 +97,13 @@ def __init__( # 如果模型不在 tiktoken 的预设中,使用 cl100k_base 作为默认 self.tokenizer = tiktoken.get_encoding("cl100k_base") - # 构建 ChatOpenAI 参数 + # 构建 ChatOpenAI 参数(关闭 SDK 内置重试,由 chat_completion 统一退避) client_kwargs = { "model": self.model, "api_key": self.api_key, "base_url": self.api_base, - "temperature": self.temperature + "temperature": self.temperature, + **chat_openai_invoke_kwargs(), } # 如果启用了禁用思考模式选项,直接传递 extra_body 参数 @@ -241,8 +243,11 @@ def chat_completion( 模型响应文本 """ try: - # 使用 LangChain 1.0 的 invoke 方法 - response = self.client.invoke(messages) + # 使用 LangChain 1.0 的 invoke 方法(网关/超时等可重试) + response = invoke_with_retry( + "chat_completion", + lambda: self.client.invoke(messages), + ) # 从响应中提取 token 使用情况 if hasattr(response, 'response_metadata') and 'token_usage' in response.response_metadata: diff --git a/web2json/utils/llm_retry.py b/web2json/utils/llm_retry.py new file mode 100644 index 0000000..b17a5a0 --- /dev/null +++ b/web2json/utils/llm_retry.py @@ -0,0 +1,135 @@ +""" +LLM 调用重试:网关故障、超时、限流等瞬时错误时使用指数退避重试。 +""" +from __future__ import annotations + +import random +import time +from typing import Any, Callable, TypeVar + +from loguru import logger + +from web2json.config.settings import settings + +T = TypeVar("T") + +# 与 LLMClient 类似:进程内累计,便于 pipeline 汇总 +_retry_events: int = 0 + + +def reset_retry_stats() -> None: + """新一批 pipeline 运行前清零。""" + global _retry_events + _retry_events = 0 + + +def get_retry_stats() -> dict[str, int]: + """本次进程内 LLM 
可重试失败后实际执行重试的次数(每次退避前计 1)。""" + return {"llm_retry_events": _retry_events} + + +def is_retryable_api_error(exc: BaseException) -> bool: + """是否为可重试的瞬时 API 故障(非业务/鉴权错误)。""" + try: + from openai import ( + APIConnectionError, + APITimeoutError, + InternalServerError, + RateLimitError, + ) + from openai import APIStatusError + + if isinstance(exc, (APIConnectionError, APITimeoutError, RateLimitError, InternalServerError)): + return True + if isinstance(exc, APIStatusError): + resp = getattr(exc, "response", None) + code = getattr(resp, "status_code", None) if resp is not None else None + if code is not None and code in (408, 429, 500, 502, 503, 504): + return True + except ImportError: + pass + + try: + import httpx + + if isinstance( + exc, + ( + httpx.ConnectError, + httpx.ReadTimeout, + httpx.WriteTimeout, + httpx.ConnectTimeout, + httpx.PoolTimeout, + ), + ): + return True + if isinstance(exc, httpx.HTTPStatusError) and exc.response is not None: + if exc.response.status_code in (408, 429, 500, 502, 503, 504): + return True + except ImportError: + pass + + # 兜底:部分异常被 LangChain 包装或仅有字符串信息 + msg = str(exc).lower() + hints = ( + "502", + "503", + "504", + "timeout", + "timed out", + "connection", + "temporarily unavailable", + "bad gateway", + "gateway timeout", + "rate limit", + "overloaded", + ) + if any(h in msg for h in hints): + return True + + return False + + +def invoke_with_retry( + operation_label: str, + invoke_fn: Callable[[], T], +) -> T: + """ + 执行无参调用(通常为 model.invoke),在可重试错误时退避重试。 + + Args: + operation_label: 日志用简短说明 + invoke_fn: 实际调用,如 lambda: model.invoke(messages) + + Returns: + invoke_fn 的返回值 + """ + global _retry_events + max_attempts = max(1, settings.llm_api_retry_max_attempts) + + for attempt in range(1, max_attempts + 1): + try: + return invoke_fn() + except Exception as e: + if attempt >= max_attempts or not is_retryable_api_error(e): + raise + delay = min( + settings.llm_api_retry_max_seconds, + settings.llm_api_retry_base_seconds * (2 ** 
(attempt - 1)), + ) + jitter = random.uniform(0, max(delay * 0.1, 0.05)) + sleep_s = min(delay + jitter, settings.llm_api_retry_max_seconds) + _retry_events += 1 + logger.warning( + f"[{operation_label}] LLM 调用失败 ({attempt}/{max_attempts}): {e!s} — " + f"{sleep_s:.1f}s 后重试 (累计重试 #{_retry_events})" + ) + time.sleep(sleep_s) + + +def chat_openai_invoke_kwargs() -> dict[str, Any]: + """构造 ChatOpenAI 的公共参数:关闭 SDK 内置重试,由 invoke_with_retry 统一退避。""" + out: dict[str, Any] = {"max_retries": 0} + if settings.llm_request_timeout is not None: + out["timeout"] = settings.llm_request_timeout + return out From cbccf68f0ceb3504bff1ae541a40d16c7cfd5387 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 15 Apr 2026 14:36:12 +0800 Subject: [PATCH 4/5] docs: add ms-web-jwn full pipeline cleaning report Made-with: Cursor --- ...05\346\264\227\346\212\245\345\221\212.md" | 219 ++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 "\351\251\254\346\235\245\350\245\277\344\272\232\350\257\255ms-web-jwn\345\205\250\351\207\217\346\270\205\346\264\227\346\212\245\345\221\212.md" diff --git "a/\351\251\254\346\235\245\350\245\277\344\272\232\350\257\255ms-web-jwn\345\205\250\351\207\217\346\270\205\346\264\227\346\212\245\345\221\212.md" "b/\351\251\254\346\235\245\350\245\277\344\272\232\350\257\255ms-web-jwn\345\205\250\351\207\217\346\270\205\346\264\227\346\212\245\345\221\212.md" new file mode 100644 index 0000000..34f4395 --- /dev/null +++ "b/\351\251\254\346\235\245\350\245\277\344\272\232\350\257\255ms-web-jwn\345\205\250\351\207\217\346\270\205\346\264\227\346\212\245\345\221\212.md" @@ -0,0 +1,219 @@ +# 马来西亚语 ms-web-jwn 全量清洗报告 + +## 1. 任务范围 + +本次处理对象是目录: + +- [Prod/ms-web-jwn](/home/luqing/Downloads/web2json-agent/Prod/ms-web-jwn) + +其中包含 **9** 份源 `jsonl`(按文件名排序): + +1. `20260310094905_352_d5c97d3855d7e8c777f63fd07dfce918.jsonl` +2. `20260311203119_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` +3. `20260312001751_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` +4. 
`20260312035605_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` +5. `20260312073301_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` +6. `20260312111019_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` +7. `20260312150026_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` +8. `20260312183025_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` +9. `20260312220228_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` + +处理流程为: + +```text +jsonl -> html + manifest -> classify_html_dir -> extract_schema -> infer_code -> extract_data_with_code +``` + +说明: + +- 当前 repo 没有原生 URL 分桶能力;本次完全按 **HTML 结构聚类** 与 **cluster 内 parser 生成** 处理。 +- Schema 采用 **Predefined** 模式,字段定义见项目根目录 [my_schema.json](/home/luqing/Downloads/web2json-agent/my_schema.json):`title`、`content`、`author`、`date`(类型均为 `string`)。 +- 调度命令形态:`scripts/run_jsonl_web2json_pipeline.py --source-jsonl <文件> --schema-json my_schema.json`(部分批次带 `--merge-summary` 等补跑参数)。 + +## 2. 产物目录 + +每份 `jsonl` 对应一个流水线根目录(`work_id` 与文件名主名一致): + +| 源 jsonl | 输出目录 | +|-----------|----------| +| `20260310094905_352_d5c97d3855d7e8c777f63fd07dfce918.jsonl` | [output/20260310094905_352_d5c97d3855d7e8c777f63fd07dfce918_pipeline](/home/luqing/Downloads/web2json-agent/output/20260310094905_352_d5c97d3855d7e8c777f63fd07dfce918_pipeline) | +| `20260311203119_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260311203119_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260311203119_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) | +| `20260312001751_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312001751_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312001751_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) | +| `20260312035605_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312035605_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312035605_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) | +| 
`20260312073301_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312073301_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312073301_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) | +| `20260312111019_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312111019_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312111019_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) | +| `20260312150026_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312150026_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312150026_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) | +| `20260312183025_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312183025_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312183025_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) | +| `20260312220228_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | [output/20260312220228_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline](/home/luqing/Downloads/web2json-agent/output/20260312220228_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee_pipeline) | + +每份流水线下的核心汇总: + +- `output/<work_id>_pipeline/pipeline_summary.json`:token、各簇耗时、`llm_retry_stats`(若脚本版本已写入)等。 +- `input_html/<work_id>/manifest.jsonl`:与 `0001.html` 等 HTML 一一对应。 + +说明:本批 **未** 在流水线目录中生成 `qa_summary.json` / `qa_summary_schema_auto.json`(与 ms-web-kln 当时跑质检的路径不同);若需字段回溯 QA,可复用 kln 报告中的 `verify_extract_alignment.py` 思路单独跑(见本文 §8.3)。 + +## 3. 
规模与耗时 + +### 3.1 源数据规模(汇总) + +- 源 `jsonl` 总大小:约 **11 GB**(`du -ch Prod/ms-web-jwn/*.jsonl`) +- `manifest.jsonl` 总行数(有效 HTML 行数口径):**76,360** 行 +- 各 `cluster_*_extract_data/result/*.json` 合计:**76,356** 个 + - 与 manifest 差 **4**:集中在首份小批量 `20260310094905_...`(1210 行 manifest,1206 份结果),多为源行无有效 `html` 字段被流水线跳过,属预期范围。 + +### 3.2 各份 jsonl 行数与磁盘占用(约) + +| 源 jsonl | 行数(wc -l) | 单文件约 | +|----------|---------------|----------| +| `20260310094905_352_d5c97d3855d7e8c777f63fd07dfce918.jsonl` | 1,210 | 169 MB | +| 其余 7 份(各 10k 行) | 10,000 × 7 | 约 1.3–1.4 GB/份 | +| `20260312220228_352_f4b4afed7fe66b4dd5bd1356f9d0e9ee.jsonl` | 5,150 | 686 MB | + +### 3.3 聚类簇数(来自各 `pipeline_summary.json` 的 `cluster_count`) + +| 源 jsonl | cluster_count | +|----------|-----------------| +| `20260310094905_...` | 7 | +| `20260311203119_...` | 7 | +| `20260312001751_...` | 5 | +| `20260312035605_...` | 6 | +| `20260312073301_...` | 6 | +| `20260312111019_...` | 6 | +| `20260312150026_...` | 6 | +| `20260312183025_...` | 5 | +| `20260312220228_...` | 5 | +| **合计** | **53**(九次流水线各自聚类簇数之和,非全局去重簇名) | + +### 3.4 流水线脚本侧耗时(`pipeline_elapsed_seconds`) + +九份 `pipeline_summary.json` 中,**前四份**较早跑次未写入 `pipeline_elapsed_seconds`(记为 `null`,汇总时按 **0**);**后五份**有记录,其和为: + +- `379.412 + 1050.147 + 1009.324 + 833.803 + 1615.611` = **4,888.297 s**(约 **1 h 21 min**) + +与脚本 [aggregate_site_pipeline_stats.py](/home/luqing/Downloads/web2json-agent/scripts/aggregate_site_pipeline_stats.py) 字段 **`pipeline_elapsed_seconds_sum`** 一致。 + +说明:九份任务多为 **顺序或错峰执行**,墙钟总时间以实际排期为准;上值为 **各次 pipeline 内已记录簇耗时之和**,用于与 token 同口径对比成本;未写字段的跑次不代表实际墙钟为 0。 + +## 4. 
Token 消耗 + +以下为九份 `pipeline_summary.json` 中 `total_token_usage` **相加**(与脚本 `aggregate_site_pipeline_stats.py` 一致): + +| 指标 | 数值 | +|------|------| +| 请求次数 `request_count` | 146 | +| 输入 tokens | 1,816,722 | +| 输出 tokens | 481,691 | +| 合计 tokens | 2,298,413 | + +各份明细(摘自各 `pipeline_summary.json`): + +| 源 jsonl | input | output | total | +|----------|-------|--------|-------| +| `20260310094905_...` | 290,156 | 60,120 | 350,276 | +| `20260311203119_...` | 329,881 | 67,892 | 397,773 | +| `20260312001751_...` | 176,182 | 44,655 | 220,837 | +| `20260312035605_...` | 208,499 | 65,357 | 273,856 | +| `20260312073301_...` | 70,853 | 21,271 | 92,124 | +| `20260312111019_...` | 208,477 | 63,485 | 271,962 | +| `20260312150026_...` | 206,996 | 59,579 | 266,575 | +| `20260312183025_...` | 160,467 | 47,037 | 207,504 | +| `20260312220228_...` | 165,211 | 52,295 | 217,506 | + +LLM 可重试失败次数(`llm_retry_stats.llm_retry_events` 之和):**0**(本批汇总为 0)。 + +## 5. 钱的换算(粗估) + +按 **Claude Sonnet 4.5** 公开价粗估(与 ms-web-kln 报告口径一致,仅作量级参考): + +- 输入:`$3 / 1M tokens` +- 输出:`$15 / 1M tokens` + +则: + +- 输入成本:`1.816722 × 3 ≈ USD 5.45` +- 输出成本:`0.481691 × 15 ≈ USD 7.23` +- **合计:约 USD 12.68** + +说明:实际计费若走内部兼容网关,可能与公开价不一致;此处仅供成本量级对比。 + +## 6. 解析结果 + +### 6.1 磁盘口径(推荐) + +- `manifest` 总行数:**76,360** +- `cluster_*_extract_data/result/*.json` 总数:**76,356** +- 差 **4**:见 **§3.1**,视为无有效 HTML 的跳过行。 + +按「是否产出结果 JSON 文件」: + +- 成功率:**76,356 / 76,360 ≈ 99.995%** + +### 6.2 `pipeline_summary.json` 中的 `parse_success_count` 之和 + +部分 summary 在 **补跑合并** 场景下只包含部分簇条目,**`parse_success_count` 加总可能小于磁盘真实结果数**(例如 `20260312073301_...` 的 summary 曾仅合并部分 cluster)。**验收与对账请以 §6.1 磁盘计数为准**。 + +### 6.3 与 ms-web-kln 的差异 + +- kln 报告含 **QA 双口径**(宽松 / 严格字段回溯);本批 **ms-web-jwn 未跑同套 qa_summary**,若要对齐 kln 的可信度分析,需另行执行字段对齐脚本并落盘报告。 + +## 7. 结论与后续建议 + +**结论(流程是否跑通):** + +- 9 份 `jsonl` 均在各自的 `output/<work_id>_pipeline/` 下具备完整目录结构及 `pipeline_summary.json`。 +- 磁盘结果 JSON 与 manifest 对齐情况见 **§6.1**,整体可视为 **全量清洗流程已跑通**。 + +**建议(可选):** + +1. 
**质检**:参考 [马来西亚语ms-web-kln全量清洗报告.md](/home/luqing/Downloads/web2json-agent/马来西亚语ms-web-kln全量清洗报告.md) 第 7–8 节,对抽样或全量跑 `verify_extract_alignment.py`,区分宽松 / 严格回溯口径。 +2. **summary 合并**:若需单份 `pipeline_summary.json` 完整列出全部簇统计,可在代码已支持 `--merge-summary` 的前提下补跑一次仅写 summary 的流程,或手工合并 JSON。 +3. **Schema**:若某簇 `content` 过长导致 QA 假阴性居多,可考虑拆字段或收紧 predefined schema(与 kln 报告 §10 思路一致)。 + +## 8. 可复用脚本 + +### 8.1 总调度脚本 + +- [run_jsonl_web2json_pipeline.py](/home/luqing/Downloads/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) + +作用简述:拆 `jsonl` → `manifest` → 聚类 → `extract_schema` → `infer_code` → `extract_data_with_code`,并写 `pipeline_summary.json`(含 token、耗时、`llm_retry_stats` 等,视脚本版本)。 + +单文件示例: + +```bash +cd /home/luqing/Downloads/web2json-agent +.venv/bin/python scripts/run_jsonl_web2json_pipeline.py \ + --source-jsonl Prod/ms-web-jwn/你的文件.jsonl \ + --schema-json my_schema.json +``` + +目录批量(按文件名排序依次处理目录下所有 `*.jsonl`): + +```bash +.venv/bin/python scripts/run_jsonl_web2json_pipeline.py --source-dir Prod/ms-web-jwn \ + --schema-json my_schema.json +``` + +补跑未跑满簇、合并 summary 等参数见脚本 `--help`。 + +### 8.2 站点级 token / 耗时汇总 + +- [aggregate_site_pipeline_stats.py](/home/luqing/Downloads/web2json-agent/scripts/aggregate_site_pipeline_stats.py) + +用法示例: + +```bash +cd /home/luqing/Downloads/web2json-agent +.venv/bin/python scripts/aggregate_site_pipeline_stats.py Prod/ms-web-jwn +``` + +全部 jsonl 均有 summary 且无缺失时退出码为 **0**;需同时校验「每簇 HTML 数 = result JSON 数」时加 **`--strict`**。 + +### 8.3 其它 + +- **LLM 重试**:`web2json/utils/llm_retry.py` + 环境变量 `LLM_API_RETRY_*` / `LLM_REQUEST_TIMEOUT`(见 `web2json/config/settings.py`)。 +- **字段回溯 QA**:仍可采用 kln 报告 **§11.3** 中的 `verify_extract_alignment.py` 工作流(若仓库中已存在该脚本)。 + +--- + +*文档生成说明:规模、token、耗时、簇数等来自当前工作区 `Prod/ms-web-jwn` 与 `output/*_pipeline/pipeline_summary.json` 及磁盘计数;若你迁移目录或重跑流水线,请重新执行 `aggregate_site_pipeline_stats.py` 更新数字。* From ccd9f6d65d5db227020bff4b6a76a9e6296bc043 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 17 Apr 2026 14:25:24 +0800 
Subject: [PATCH 5/5] feat(crawl-jsonl): merge-dir layout cluster, cluster_list, slices; docs for ms-web-mma - Add classify_crawl_jsonl_dir, crawl_jsonl helpers (split, manifest-friendly rows) - Slice rows use layout_cluster_id / crawl_source_name / crawl_line_no (no _w2j) - Export APIs from web2json.__init__ - Add ms-web-mma flow doc and Jupyter Spark checklist Made-with: Cursor --- ...6\236\346\223\215\347\211\210checklist.md" | 411 +++++++++++++++++ ...01\347\250\213\350\257\264\346\230\216.md" | 272 ++++++++++++ web2json/__init__.py | 4 + web2json/simple.py | 420 ++++++++++++++++-- web2json/tools/crawl_jsonl.py | 217 +++++++++ 5 files changed, 1296 insertions(+), 28 deletions(-) create mode 100644 "ms-web-mma_Jupyter\345\256\236\346\223\215\347\211\210checklist.md" create mode 100644 "ms-web-mma_\350\201\232\347\261\273\344\270\216schema\345\233\236\345\241\253\346\265\201\347\250\213\350\257\264\346\230\216.md" create mode 100644 web2json/tools/crawl_jsonl.py diff --git "a/ms-web-mma_Jupyter\345\256\236\346\223\215\347\211\210checklist.md" "b/ms-web-mma_Jupyter\345\256\236\346\223\215\347\211\210checklist.md" new file mode 100644 index 0000000..db13061 --- /dev/null +++ "b/ms-web-mma_Jupyter\345\256\236\346\223\215\347\211\210checklist.md" @@ -0,0 +1,411 @@ +# ms-web-mma:Jupyter 实操版 Checklist(Spark 清洗作业) + +本文把 [ms-web-mma_聚类与schema回填流程说明.md](/home/luqing/Downloads/v2/web2json-agent/ms-web-mma_聚类与schema回填流程说明.md) 中的目标流程,展开成一份可在 `http://jupyter.bigdata.shlab.tech/` 上执行的实操 checklist。 + +适用目标: + +- 利用 **web2json** 对 `ms-web-mma` 做统一聚类 +- 按簇抽取 schema / parser / data +- 再回填成发布向 JSONL + +--- + +## 0. 执行原则 + +建议先跑 **主文件**: + +- `20260310094859_353_79bda33fa180eedac40d37876224609d.jsonl` + +跑通主文件后,再放大到整个目录。原因很简单: + +- 主文件有 191 行,是主要数据量 +- 其余 5 个文件都只有 1 行,容易因为聚类噪声而影响你对流程状态的判断 + +--- + +## 1. 
准备环境 + +### 输入 + +- Jupyter 环境:`http://jupyter.bigdata.shlab.tech/` +- 本地代码目录: + [web2json-agent](/home/luqing/Downloads/v2/web2json-agent) +- 流程说明: + [ms-web-mma_聚类与schema回填流程说明.md](/home/luqing/Downloads/v2/web2json-agent/ms-web-mma_聚类与schema回填流程说明.md) +- 数据目录: + [Prod/ms-web-mma](/home/luqing/Downloads/v2/web2json-agent/Prod/ms-web-mma) +- 路径约定: + `Prod/ms-web-mma/s3Path.txt` + +### 要做什么 + +1. 打开 notebook,确认 Python 环境可用。 +2. 确认能访问 `web2json-agent` 项目目录。 +3. 确认能读取 `Prod/ms-web-mma/*.jsonl`。 +4. 读取 `s3Path.txt`,确认输入与输出前缀。 +5. 明确本次跑的是: + - 仅主文件 + - 还是整个目录 + +### 产出 + +- 一份确认过的任务参数清单,例如: + +```text +project_root=/home/luqing/Downloads/v2/web2json-agent +source_dir=/home/luqing/Downloads/v2/web2json-agent/Prod/ms-web-mma +source_files=[20260310094859_353_79bda33fa180eedac40d37876224609d.jsonl] +output_root=s3://xyz2-process-hdd1/nlp/ms-web-mma/v0001 +``` + +### 要验什么 + +- 代码目录能读 +- 源文件能读 +- S3 路径约定明确 +- 本次范围已经锁定,不临时变化 + +--- + +## 2. 启动 Spark Session + +### 输入 + +- notebook 环境 +- 项目根目录 +- 任务名,例如: + `ms-web-mma-cluster-v0001` + +### 要做什么 + +1. 在 notebook 中初始化 Spark Session。 +2. 配置应用名、必要的 executor / memory / shuffle 参数。 +3. 将 `web2json-agent` 项目目录加入 Python path。 +4. 用一个极小样本做试读。 + +### 产出 + +- 一个可用的 SparkSession +- 一次试读结果,确认 JSON 行结构正常 + +### 要验什么 + +- Spark 能成功启动 +- 能成功 `read jsonl` +- 抽样记录中至少存在: + - `html` + - `track_id` 或可替代唯一键 + - `url` + +--- + +## 3. 生成全量索引表 + +### 输入 + +- 目标源文件列表 +- SparkSession + +### 要做什么 + +对所有目标 JSONL 行生成统一索引。建议每一行至少补齐这些字段: + +| 字段 | 说明 | +|---|---| +| `global_index` | 全局唯一顺序号 | +| `source_jsonl` | 源文件路径 | +| `source_name` | 源文件名 | +| `line_no` | 文件内行号 | +| `record_id` | 优先用 `track_id`,缺失时兜底 | +| `html` | 原始 HTML | +| `url` | 原始 URL | + +建议将这一步的结果单独落盘,作为后面所有 join 的主键底表。 + +### 产出 + +- 一份“全量索引表” +- 可选落盘: + - `output/ms-web-mma/v001/index/all_rows_with_index.jsonl` + +### 要验什么 + +- 行数是否等于输入总行数 +- `global_index` 是否唯一 +- `(source_jsonl, line_no)` 是否唯一 +- 每行是否有可用 `record_id` + +--- + +## 4. 
跑统一布局聚类 + +### 输入 + +- 全量索引表 +- 每行的 `html` + +### 要做什么 + +1. 对每行 `html` 调 web2json 的布局特征逻辑。 +2. 对全体样本只做一次全局布局聚类。 +3. 为每条记录生成: + - `layout_cluster_id` +4. 单独标出: + - `noise` 或 `-1` + +### 产出 + +- `cluster_list/cluster_list.jsonl` +- 可选: + - `cluster_list/cluster_info.txt` + +建议 `cluster_list.jsonl` 包含: + +| 字段 | 说明 | +|---|---| +| `global_index` | 全局索引 | +| `layout_cluster_id` | 聚类标签 | +| `source_jsonl` | 源路径 | +| `source_name` | 源文件名 | +| `line_no` | 原始行号 | +| `record_id` | 唯一标识 | + +### 要验什么 + +- `cluster_list` 总行数是否等于输入总行数 +- 是否存在异常大量 `noise` +- `layout_cluster_id` 是否有稳定分布 +- `cluster_info.txt` 能否帮助人工快速判断簇是否合理 + +--- + +## 5. 按簇切片写 JSONL + +### 输入 + +- 全量索引表 +- `cluster_list.jsonl` + +### 要做什么 + +1. 根据 `layout_cluster_id` 把原始行切到不同簇目录。 +2. 每个簇形成一个或多个切片 JSONL。 +3. `noise` 单独处理。 + +建议目录: + +```text +output/ms-web-mma/v001/cluster_list/format_clusters/ + cluster_0/ + cluster_1/ + ... + noise/ +``` + +切片行里建议保留: + +- 原始 crawl 字段 +- `global_index` +- `layout_cluster_id` +- `source_name` +- `line_no` +- `record_id` + +### 产出 + +- `cluster_k/*.jsonl.gz` +- `noise/*.jsonl.gz` + +### 要验什么 + +- 所有簇切片行数总和是否等于输入总行数 +- 同一 `global_index` 是否只出现在一个簇里 +- `noise` 是否单独隔离成功 + +--- + +## 6. 对每个簇独立跑抽取 + +### 输入 + +- `cluster_k/*.jsonl.gz` + +### 要做什么 + +对每个有效簇依次执行: + +1. `extract_schema` +2. `infer_code` +3. `extract_data_with_code` + +每个簇都要形成独立产物目录。 + +建议每簇至少保留: + +| 文件 | 作用 | +|---|---| +| `schema.json` | 最终 schema | +| `final_parser.py` | 最终 parser | +| `result/*.json` | 每页抽取结果 | +| `cluster_k_extract_manifest.jsonl` | 回填主索引 | + +### 产出 + +- 每个簇的抽取产物目录 +- `cluster_k_extract_manifest.jsonl` + +### 要验什么 + +- 每个簇都至少有 schema 和 parser +- `result/*.json` 数量和 manifest 对齐 +- manifest 中能唯一回指 `global_index` 或 `(source_name, line_no)` +- 对于失败页,是否有 `parse_ok=false` 或等价状态 + +--- + +## 7. 回填原始 JSONL + +### 输入 + +- 原始索引表 +- `cluster_list.jsonl` +- 每簇 manifest +- 每簇 `result/*.json` + +### 要做什么 + +基于 join 逻辑把抽取结果写回原始行: + +1. 以 `global_index` 或 `(source_jsonl, line_no)` 做主键对齐。 +2. 
在原始对象基础上增量合并抽取字段: + - `content` + - `title` + - `author` + - `publish_time` + - 其他 schema 定义字段 +3. 把 schema / xpath 元信息写到: + - `remark.extract_schema` +4. 生成: + - `track_loc` + - `doc_loc` + +注意: + +- 不要新增 `_w2j` 顶层字段 +- 原始 crawl 字段尽量保留 + +### 产出 + +- 回填后的“发布向 JSONL” + +### 要验什么 + +- 回填后总行数是否与输入一致 +- 未抽取成功的行是否仍然可追踪 +- `remark.extract_schema` 是否落对 +- `track_loc` / `doc_loc` 是否符合文档约定 + +--- + +## 8. 写发布结果 + +### 输入 + +- 回填后的结果 DataFrame / JSONL +- 输出前缀:`xyz2-process-hdd1/.../nlp/ms-web-mma/v0001` + +### 要做什么 + +1. 按最终发布规范写出 `.jsonl.gz` +2. 目录中保留: + - 发布文件 + - pipeline 产物 + - 可追溯索引 + +### 产出 + +- 发布路径下的最终 `.jsonl.gz` + +### 要验什么 + +- 能按 `doc_loc` 找到对应发布文件 +- gzip 可解压 +- 单行 JSON 格式合法 +- 样本抽查字段完整 + +--- + +## 9. 最小验收清单 + +在整个 notebook 流程结束后,至少要核对下面这些: + +| 项 | 验收问题 | +|---|---| +| 输入行数 | 是否与原始输入一致 | +| `cluster_list` 行数 | 是否与输入一致 | +| 切片总行数 | 是否与输入一致 | +| manifest 对齐 | 是否能唯一回填到原始行 | +| 回填后行数 | 是否与输入一致 | +| `remark.extract_schema` | 是否存在且结构合理 | +| `track_loc` | 是否能回指原始 source | +| `doc_loc` | 是否能回指发布文件 | +| `noise` | 是否单独标记处理 | + +--- + +## 10. 推荐 notebook 拆分 + +建议不要把所有逻辑塞进一个 notebook,最好拆成 4 本: + +### 10.1 `01_build_index_and_classify.ipynb` + +负责: + +- 读源数据 +- 建立索引 +- 跑统一聚类 +- 写 `cluster_list.jsonl` + +### 10.2 `02_split_by_cluster.ipynb` + +负责: + +- 按簇切片 +- 写 `cluster_k/*.jsonl.gz` + +### 10.3 `03_extract_per_cluster.ipynb` + +负责: + +- 对每个 `cluster_k` 跑 + - `extract_schema` + - `infer_code` + - `extract_data_with_code` +- 写 manifest + +### 10.4 `04_merge_backfill_and_publish.ipynb` + +负责: + +- join 回填 +- 写最终发布 JSONL +- 做最终验收 + +--- + +## 11. 建议执行顺序 + +最稳妥的顺序是: + +1. 先只跑主文件 `20260310094859_...jsonl` +2. 跑通 `索引 -> 聚类 -> 切片 -> 单簇抽取 -> 回填` +3. 固化产物格式 +4. 再扩大到整目录 + +--- + +## 12. 
一句话版本 + +```text +先统一聚类,再按簇抽取,最后按索引回填原始行并发布。 +``` diff --git "a/ms-web-mma_\350\201\232\347\261\273\344\270\216schema\345\233\236\345\241\253\346\265\201\347\250\213\350\257\264\346\230\216.md" "b/ms-web-mma_\350\201\232\347\261\273\344\270\216schema\345\233\236\345\241\253\346\265\201\347\250\213\350\257\264\346\230\216.md" new file mode 100644 index 0000000..6876976 --- /dev/null +++ "b/ms-web-mma_\350\201\232\347\261\273\344\270\216schema\345\233\236\345\241\253\346\265\201\347\250\213\350\257\264\346\230\216.md" @@ -0,0 +1,272 @@ +# ms-web-mma:聚类 → 按簇抽取 → 回填原始 JSONL(流程说明稿) + +> 本文档描述针对 `web2json-agent/Prod/ms-web-mma/` 下**全部 crawl JSONL** 的**目标流程**与**数据契约**,便于评审;**暂不涉及具体代码改动**。 + +--- + +## 1. 背景与目标 + +**一句话目标**:对 `ms-web-mma` 全目录 crawl 数据**只做一次**布局聚类;再**按簇**分别跑 schema / parser / 抽取;最后用 **`cluster_list.jsonl` + manifest** 把结果**对齐回填**到原始行,生成带 **`remark`、`track_loc`、`doc_loc`** 的发布向 JSONL(不在行内增加 `_w2j` 类字段)。 + +- **输入**:`Prod/ms-web-mma/` 目录(可递归)下所有 `*.jsonl`,每行一条 JSON,至少包含 **`html`** 字段(及 `track_id` 等溯源字段)。 +- **目标**: + 1. 对所有行的 `html` **统一做一次**布局聚类(不按单个 jsonl 文件串行跑完整流水线)。 + 2. 将数据**按簇切片**成多份 JSONL,**每个簇**分别执行 `extract_schema` → `infer_code` → `extract_data`(或 `extract_data_with_code`)。 + 3. 将每簇抽取得到的**结构化结果**(如 `content`、`title` 等)以及 **schema / xpath 元信息**(如 `remark.extract_schema`)**回填**到**原始行**对应位置,产出**新的 JSONL**(行数与溯源与原始对齐或可追踪)。 + +--- + +## 2. 产物目录建议:`cluster_list/` 文件夹 + +单文件聚类会写出 `cluster_list.jsonl`;全目录合并聚类建议统一到 **`output/ms-web-mma/v001/`**(版本号可换)下,结构示例: + +```text +output/ms-web-mma/v001/ +├── jsonl/ # 可选:发布或中间 gzip 与原始命名对齐 +│ └── 20260310....jsonl.gz +│ └── ... +└── cluster_list/ + ├── cluster_info.txt # 人类可读摘要(可选) + ├── cluster_list.jsonl # 全局索引:global_index ↔ 源文件/行号/簇 id + └── format_clusters/ # 按簇切分后的输入(名称可改为 slices/) + ├── cluster_0/ + │ └── _cluster_0.jsonl.gz # 仅含 cluster_0 的原始行(可选附加调试字段) + ├── cluster_1/ + │ └── ... 
+ └── noise/ + └── <stem>_noise.jsonl.gz +``` + +说明: + +- **`cluster_list/cluster_list.jsonl`**:回填时的**主索引**(见 §4.2)。 +- **`format_clusters/cluster_k/`**(或扁平命名):每个簇一份切片,供该簇 `extract_schema` 起全流程使用;**簇编号建议与 `layout_cluster_id` 一致**(`cluster_0` 而非 `cluster_01`,除非团队另有约定)。 +- 切片文件名的前缀 **`<stem>`**(如 `<stem>_cluster_0.jsonl.gz`、`<stem>_noise.jsonl.gz`)可与源 `jsonl` 主名一致,便于对照(多源合并时也可用 `union` 等统一前缀)。 + +--- + +## 3. 流程总览(四段) + +```text +[1] 合并聚类(全目录 jsonl) + ↓ +[2] 写出 cluster_list/ + 按簇切片 jsonl + ↓ +[3] 对每个 cluster_k 独立跑:extract_schema → infer_code → extract_data(_with_code) + ↓ +[4] 合并回填:按 cluster_list + manifest 对齐,写「新 jsonl」 + - 保留原 crawl 字段;新增/覆盖抽取字段;写入 remark、track_loc、doc_loc +``` + +### 3.1 回填后单行结构示例(目标形态) + +以下 **S3 路径为 ms-web-jwn 历史示例**,仅说明字段关系;**ms-web-mma 落地时请替换为实际桶与前缀**。 + +```json +{ + "track_id": "2201bfce-a0ef-4ca8-90e4-7e34172c9395", + "url": "https://www.heritage.gov.my/index.php/tapak-warisan", + "content": "# Tapak Warisan\n\npage_title: Tapak Warisan\ntotal_records: Display #", + "html": "...", + "content_bytes": 67, + "remark": { + "extract_schema": { + "title": { + "type": "string", + "description": "文章标题", + "value_sample": "Rindukan Muzium. 
Berita baik untuk anda!", + "xpaths": ["//h1[@class='page-title kl-blog-post-title entry-title']/text()"] + }, + "author": { + "type": "string", + "description": "作者姓名", + "value_sample": "defaultweb", + "xpaths": ["//span[@class='itemAuthor ...']/text()"] + }, + "publish_time": { + "type": "string", + "description": "发布时间", + "value_sample": "Rabu, 29 September 2021", + "xpaths": ["//span[@class='itemDateCreated ...']/text()"] + }, + "content": { + "type": "string", + "description": "文章正文内容(完整文本)", + "value_sample": "Muzium Sejarah ...", + "xpaths": ["//div[@class='itemBody ...']/p//text()"] + }, + "content_paragraphs": { + "type": "array", + "description": "文章段落列表", + "value_sample": ["..."], + "xpaths": ["//div[@class='itemBody ...']/p"] + }, + "content_list_items": { + "type": "array", + "description": "文章中的列表项", + "value_sample": ["...", "..."], + "xpaths": ["//div[@class='itemBody ...']//ul/li"] + } + } + }, + "track_loc": [ + "s3://.../prod/ms-web-mma/jsonl/<源>.jsonl?bytes=<offset>,<length>" // input path defined in web2json-agent/Prod/ms-web-mma/s3Path.txt + ], + "doc_loc": "s3://.../nlp/ms-web-mma/v001/ms/<发布文件>.jsonl.gz?bytes=0,0" // output path defined in web2json-agent/Prod/ms-web-mma/s3Path.txt +} +``` + +--- + +## 4. 
各阶段说明 + +### 4.1 阶段一:统一 classify(合并聚类) + +- **输入**:`Prod/ms-web-mma/**/*.jsonl`(或顶层仅 jsonl 的目录)。 +- **行为**:与现有 `classify_crawl_jsonl_dir` 一致:所有行的 `html` 进入**同一套**布局聚类,得到 `layout_cluster_id ∈ {0,1,…} ∪ {-1}`(噪声)。 +- **不要求**在此阶段跑 `extract_schema`。 + +### 4.2 阶段二:切片与 `cluster_list` + +- **`cluster_list.jsonl`**(每行一条建议字段): + +| 字段 | 含义 | +|------|------| +| `global_index` | 全合并后的从 0 开始的序号(或从 1,实现需统一) | +| `layout_cluster_id` | 聚类标签,`-1` 表示噪声 / 无法聚类 | +| `source_jsonl` | 原始文件绝对路径或可解析的相对路径 | +| `source_name` | 文件名 | +| `line_no` | 在该 jsonl 文件内的行号(从 1 计) | +| `record_id` | 业务主键,如 `track_id`;缺失时可用 `line_{line_no}` | + +- **切片文件**:每个 `cluster_k` 一个 jsonl,每行仍是**原始 crawl 行**(JSON 对象),如需调试可在切片阶段可选附加与 `cluster_list` 一致的元信息副本(例如簇 id、源文件名、行号),**不要求**使用 `_w2j` 前缀字段。 + +**噪声** `noise` 可单独一个 `*_noise.jsonl`;是否对该文件也跑抽取由产品决定(默认可跳过或只跑兜底 parser)。 + +### 4.3 阶段三:按簇抽取(现有 web2json API) + +对每个 `cluster_k` 的切片: + +1. **`extract_schema`**:得到 `final_schema.json`(字段定义、xpath、`value_sample` 等)。 +2. **`infer_code`**:生成 `final_parser.py`。 +3. **`extract_data` / `extract_data_with_code`**:对该簇**全部行**解析,得到多份 `result/*.json` 或与行一一对应的结构化结果。 + +**注意**:簇内解析结果的「键」必须与回填时能**对应回** `cluster_list` 中的某一行(见 §5.2)。 + +### 4.4 阶段四:回填组成新 JSONL + +**目标**:在**不丢失原始 crawl 字段**的前提下,把本簇抽取结果写回「原始行」的扩展形态。 + +--- + +## 5. 
回填数据契约(建议) + +### 5.1 新 JSONL 每行建议结构(逻辑) + +在原始对象基础上**增量合并**(字段名可评审后固定): + +- **抽取正文等**:如 `content`、`title`、`author`、`publish_time` 等——与 `final_schema` / parser 输出一致。 +- **schema / xpath 溯源**:放入 **`remark.extract_schema`**(结构见 §3.1),与顶层抽取字段区分,避免与 crawl 原字段无意义冲突。 + +说明: + +- **`remark.extract_schema`**:存放**该簇最终 merged schema**(每字段含 `type` / `description` / `value_sample` / `xpaths`);若体量大,可改为只存 **schema 的 S3/本地路径** + **hash**,正文仍保留顶层抽取值。 +- **非空策略**:仅当某字段在 parser 结果中非空(或满足 QA 规则)时写入顶层;否则可省略或显式 `null`(需统一约定)。 +- **`track_loc` / `doc_loc`**:与现有发布规范对齐(见《多语种网站清洗计划》§4.4);`doc_loc` 指向 gzip 发布产物时需带 `?bytes=`。流水线产物路径如需落盘,可放在 `remark` 或单独 manifest,**不**在行顶层增加 `_w2j`。 + +### 5.2 对齐键(回填时如何「找到原始行」) + +必须能唯一对应: + +- **主键**:`(source_jsonl, line_no)` 或 `record_id`(全局唯一时)。 +- **簇内解析顺序**:若 parser 按切片文件顺序输出,需与 `global_index` 或 `(source_name, line_no)` 建立**显式映射**(建议在抽取阶段输出 **`manifest.jsonl`**:一行对应 `global_index` → 解析结果或 `result_xxx.json`)。 + +推荐在阶段三结束时为每个 cluster 产出一个 **`cluster_k_extract_manifest.jsonl`**: + +```json +{"global_index": 123, "source_name": "part-000.jsonl", "line_no": 45, "record_id": "uuid", "parse_ok": true} +``` + +回填脚本依赖:**`cluster_list.jsonl` + manifest + 每行 parse 结果**(三者 join)。 + +### 5.3 工程与分布式(评审点) + +- **Spark / 分区**:聚类前可分区读 jsonl;**回填**建议按 **原始 `source_name` 或分区键** 写回,避免单 task 超大 shuffle。 +- **幂等**:同一 `global_index` 重复跑应可覆盖或带 `schema_version`。 +- **与 Spark 的分工**:四段流程在集群上的拆分见 **§5.4**;其中布局侧须与 **web2json** 现网逻辑一致(`get_feature` → `fuse_features` → `cluster_html_layouts_optimized` 等)。 + +### 5.4 Spark 阶段划分草图(对齐 web2json 逻辑) + +以下与 §3「四段流程」一一对应;**布局聚类与单机 `classify_crawl_jsonl` / `classify_crawl_jsonl_dir`(`web2json.simple`)同源**,Spark 只负责可并行 I/O 与 join,**不另写一套特征或聚类定义**。 + +**web2json 侧关键符号(须保持一致)** + +| 名称 | 位置 | 作用 | +|------|------|------| +| `get_feature` | `web2json.tools.html_layout_cosin.get_feature` | 输入 HTML 字符串,输出**布局特征 dict**(`tags` / `attrs` 等层级结构)。 | +| `fuse_features` | `web2json.tools.html_layout_cosin.fuse_features`(由 `cluster` 内部调用) | 在统一 
`layer_n`、`k` 下将多页 dict **融合为稠密向量**,供余弦相似度与 DBSCAN。 | +| `cluster_html_layouts_optimized` | `web2json.tools.cluster.cluster_html_layouts_optimized` | 与 `simple._execute_crawl_layout_cluster` 相同入口:`threshold`、`k`、`min_samples`、`use_knn_graph`、`n_neighbors` 等须与线上一致,否则簇 id 不可比。 | +| `classify_crawl_jsonl` / `classify_crawl_jsonl_dir` | `web2json.simple` | 单机全链路:读 JSONL → 上述特征与聚类 → 写 `cluster_list` / 按簇切片。 | +| `extract_schema` / `infer_code` / `extract_data` / `extract_data_with_code` | `web2json.simple` | §4.3 按簇流水线;回填依赖 manifest 与 `cluster_list` join(§5.2)。 | + +**阶段映射(文档 §3 ↔ Spark / 混部)** + +| 文档阶段 | Spark 上适合做的事 | 须沿用 web2json 或外置作业的部分 | +|----------|-------------------|----------------------------------| +| **[1] 合并聚类** | 并行读 JSONL、解析行;在 executor 上对每行 HTML 调用 **`get_feature`**(与 `web2json.tools.cluster._compute_features` 相同)。 | **全局聚类**:在单机路径中为 **`cluster_html_layouts_optimized`**(内部:`fuse_features` → `cosine_similarity` + DBSCAN,或 `use_knn_graph` 近似)。分布式上需**二次开发或外置**:例如先落盘全量 feature dict,再在**大内存单机 / Ray** 调同一套 `fuse_features`+聚类,产出 `global_index → layout_cluster_id` 表再回灌 Spark;**禁止**换用与 `get_feature` 无关的自研特征。 | +| **[2] cluster_list + 切片** | 对标签表生成 `cluster_list.jsonl`;按 `layout_cluster_id` **partitionBy** 写各簇 JSONL(及 `noise`)。可选行内附加 `layout_cluster_id`、`crawl_source_name`、`crawl_line_no`(与 `annotate_slice_rows` 约定一致)。 | 字段语义同 §4.2。 | +| **[3] 按簇抽取** | 若仅用生成好的 `final_parser.py` 做 CPU 解析,可用 **`mapPartitions`** 批量跑解析(等价于 `extract_data_with_code` 的解析段)。 | **`extract_schema` / `infer_code`**(及含 LLM 的 schema 生成)多为 **每簇独立作业**(编排起 Pod/单机),与现网 API 一致;产出每簇 **`cluster_k_extract_manifest.jsonl`**(§5.2)。 | +| **[4] 回填** | **`join`**:`cluster_list` + manifest + 解析结果;合并 `remark`、`track_loc`、`doc_loc`;**按 `source_name` 或分区键写出**,控制 shuffle。 | 逻辑同 §5.1,不新增 `_w2j` 顶层字段。 | + +**流程简图(执行形态)** + +```mermaid +flowchart TB + subgraph P1["[1] 合并聚类"] + S1[Spark: 读 JSONL] + S2["executor: get_feature(html)"] + S3["外置/大内存: fuse_features + cluster_html_layouts_optimized 同参"] + S4[标签表 
global_index / layout_cluster_id] + S1 --> S2 --> S3 --> S4 + end + subgraph P2["[2] 切片"] + T1[Spark: cluster_list + partitionBy 簇] + end + subgraph P3["[3] 抽取"] + U1["每簇: extract_schema → infer_code → extract_data(_with_code)"] + U2[manifest] + U1 --> U2 + end + subgraph P4["[4] 回填"] + V1[Spark: join + 分区写出] + end + S4 --> T1 --> U1 + U2 --> V1 +``` + +**小结**:Spark 擅长 **[2][4]** 与 **[1] 中的读数 + `get_feature`**;**与 `cluster_html_layouts_optimized` 等价的聚类**和 **含 LLM 的 [3] 前段**宜混部或编排,且 **特征与聚类参数必须来自 web2json**,以保证与 `Prod` 本地/单机试跑结果可对齐。 + +--- + +## 6. 待确认清单 + +1. **`format_clusters` 命名**是否改为 `slices/` 或 `by_cluster/`? +2. **回填产物**:按**源文件一对一**(`xxx.with_extract.jsonl`)还是**单文件 merged**? +3. **噪声簇**是否跑抽取,还是仅打标原样输出? +4. **ms-web-mma** 正式 **S3 前缀**与 **doc_loc** 桶是否已定稿(文档中示例路径需替换)? + +--- + +## 7. `Prod/ms-web-mma` 本地目录快照(便于对齐 S3 与聚类输入) + +约定见同目录 **`s3Path.txt`**(Input:`hcorpus-develop-hw60p/.../prod/ms-web-mma/jsonl/`;Output:`xyz2-process-hdd1/.../nlp/ms-web-mma/v0001`)。 + +当前仓库内该目录含 **6 个** `*.jsonl` + `s3Path.txt`: + +| 文件 | 行数(约) | 说明 | +|------|------------|------| +| `20260310094859_353_79bda33fa180eedac40d37876224609d.jsonl` | 191 | 主数据量(约 61MB,单行 HTML 较大) | +| `20260312172301_353_ac06ada8c9d8f53d11ff4ce459ff470e.jsonl` | 1 | 小样本 | +| `20260320141631_353_2f2ece9c1819287308a80962f0f108f5.jsonl` 等 4 个 | 各 1 | 同日多批次试探样(各约百 KB 级) | + +**清洗建议**:聚类/回填以 **191 行主文件**为基准即可;其余 5 个单行文件若仅为调试,可迁到子目录(如 `samples/`)或归档,避免与全量 `discover_jsonl_files` 合并时重复混入(若暂不移除,合并跑目录时需知悉总行数 ≈ **196** 且含重复 URL 风险)。是否搬迁由工程侧决定。 diff --git a/web2json/__init__.py b/web2json/__init__.py index eeeeac5..fa75dee 100644 --- a/web2json/__init__.py +++ b/web2json/__init__.py @@ -18,6 +18,8 @@ infer_code, extract_data_with_code, classify_html_dir, + classify_crawl_jsonl, + classify_crawl_jsonl_dir, # 返回数据类 ExtractDataResult, ExtractSchemaResult, @@ -38,6 +40,8 @@ "infer_code", "extract_data_with_code", "classify_html_dir", + "classify_crawl_jsonl", + "classify_crawl_jsonl_dir", # 返回数据类 "ExtractDataResult", 
"ExtractSchemaResult", diff --git a/web2json/simple.py b/web2json/simple.py index 95b8ebe..6629564 100644 --- a/web2json/simple.py +++ b/web2json/simple.py @@ -5,7 +5,7 @@ import sys import json from pathlib import Path -from typing import Optional, Dict, List, Any +from typing import Optional, Dict, List, Any, Callable from dataclasses import dataclass, asdict import numpy as np from loguru import logger @@ -99,7 +99,7 @@ class Web2JsonConfig: Args: name: 运行名称(在output_path下创建此名称的子目录) - html_path: HTML文件目录 + html_path: HTML 目录、单个 ``.html``/``.htm`` 文件,或 crawl 源 ``.jsonl``(见 ``crawl_html_field``) output_path: 输出主目录(默认为"output") iteration_rounds: 迭代轮数(用于Schema学习的样本数量,默认3) schema: Schema模板(可选,为None时使用auto模式,有值时使用predefined模式) @@ -129,6 +129,9 @@ class Web2JsonConfig: parser_code: Optional[str] = None save: Optional[List[str]] = None remove_null_fields: bool = True + # crawl JSONL:html_path 为 .jsonl 时物化 HTML 所用字段与主键(见 _resolve_pipeline_html_files) + crawl_html_field: str = "html" + crawl_jsonl_id_field: Optional[str] = "track_id" def __post_init__(self): """验证配置""" @@ -207,6 +210,359 @@ def _read_html_files(directory_path: str) -> List[str]: raise ValueError(f"路径既不是文件也不是目录: {directory_path}") +def _resolve_pipeline_html_files( + config: Web2JsonConfig, +) -> tuple[List[str], Optional[Callable[[], None]]]: + """ + 将 ``html_path`` 解析为 HTML 文件路径列表。 + + - 目录 / 单 ``.html``:沿用 ``_read_html_files``。 + - ``.jsonl``:把每行 ``html`` 字段物化到**临时目录**中的 ``.html``,避免在项目里落大量切片文件; + 返回第二个值 ``cleanup``,调用方须在 ``finally`` 中执行以删除临时目录。 + """ + import shutil + import tempfile + from web2json.tools.crawl_jsonl import materialize_jsonl_to_html_dir + + p = Path(config.html_path) + if not p.exists(): + raise FileNotFoundError(f"路径不存在: {config.html_path}") + + if p.is_file() and p.suffix.lower() == ".jsonl": + tmp = Path(tempfile.mkdtemp(prefix="w2j_crawl_jsonl_")) + try: + hf = getattr(config, "crawl_html_field", "html") + idf = getattr(config, "crawl_jsonl_id_field", "track_id") + files = 
materialize_jsonl_to_html_dir( + p, tmp, html_field=hf, id_field=idf + ) + except Exception: + shutil.rmtree(tmp, ignore_errors=True) + raise + if not files: + shutil.rmtree(tmp, ignore_errors=True) + raise ValueError( + f"JSONL 中无可用 html 字段(字段名: {getattr(config, 'crawl_html_field', 'html')}): {config.html_path}" + ) + + def cleanup() -> None: + shutil.rmtree(tmp, ignore_errors=True) + + logger.info( + f" [crawl jsonl] 已将 {len(files)} 条 HTML 物化到临时目录(流水线结束后自动删除)" + ) + return files, cleanup + + if p.is_file(): + if p.suffix.lower() in (".html", ".htm"): + return [str(p.resolve())], None + raise ValueError( + f"不支持的文件: {p}(请使用 HTML 目录、单个 .html 或 crawl .jsonl)" + ) + if p.is_dir(): + return _read_html_files(str(p)), None + raise ValueError(f"路径既不是文件也不是目录: {config.html_path}") + + +def _execute_crawl_layout_cluster( + line_metas: List[Dict[str, Any]], + config: Web2JsonConfig, + *, + output_stem: str, + hf: str, + rid_f: Optional[str], + report_extra: Optional[Dict[str, Any]] = None, + annotate_slice_rows: bool = False, +) -> ClusterResult: + """对已加载的 ``line_metas``(见 ``load_crawl_line_metas_for_file``)执行布局聚类并可选落盘。""" + from web2json.tools.cluster import cluster_html_layouts_optimized + from web2json.tools.crawl_jsonl import split_jsonl_by_cluster_labels, write_jsonl_lines + from web2json.tools.html_layout_cosin import get_feature + + if not line_metas: + raise ValueError("无有效行") + + surrogate_keys: List[str] = [ + f"{m['source_name']}:{m['line_no']}" for m in line_metas + ] + valid_keys: List[str] = [] + valid_contents: List[str] = [] + for m, sk in zip(line_metas, surrogate_keys): + if not m.get("html"): + continue + try: + feature = get_feature(m["html"]) + except Exception as e: + logger.warning(f" 跳过布局特征提取失败: {sk} ({e})") + continue + if not feature: + logger.warning(f" 跳过无有效布局特征: {sk}") + continue + valid_keys.append(sk) + valid_contents.append(m["html"]) + + if not valid_contents: + raise Exception("聚类失败: 没有可用于布局聚类的有效 HTML 页面") + + try: + labels_partial, _sim, 
_clusters = cluster_html_layouts_optimized( + valid_contents, + use_knn_graph=True, + ) + except Exception as e: + raise Exception(f"聚类失败: {e}") from e + + label_map = {k: int(lbl) for k, lbl in zip(valid_keys, labels_partial)} + labels = np.array([label_map.get(sk, -1) for sk in surrogate_keys], dtype=int) + + unique_labels = sorted(set(labels.tolist())) + noise_count = sum(1 for l in labels if l == -1) + cluster_count = len([l for l in unique_labels if l != -1]) + + logger.info("✓ 聚类分析完成") + logger.info(f" 总行数(非空行): {len(line_metas)}") + logger.info(f" 识别出的布局簇数: {cluster_count}") + logger.info(f" 噪声点(未归类): {noise_count}") + + clusters_dict: Dict[str, List[str]] = {} + noise_files: List[str] = [] + for lbl in unique_labels: + ids = [line_metas[i]["rid"] for i in range(len(line_metas)) if labels[i] == lbl] + if not ids: + continue + if lbl == -1: + noise_files = ids + clusters_dict["noise"] = ids + else: + clusters_dict[f"cluster_{lbl}"] = ids + logger.info(f" {'噪声点' if lbl == -1 else f'簇 {lbl}'}: {len(ids)} 条") + + if config.should_save(): + out_root = Path(config.get_full_output_path()) + out_root.mkdir(parents=True, exist_ok=True) + stem = output_stem + + if config.should_save_item("report"): + report_payload: Dict[str, Any] = { + "source": "crawl_jsonl", + "html_field": hf, + "record_id_field": rid_f, + "clusters": {k: v for k, v in clusters_dict.items() if k != "noise"}, + "noise_record_ids": noise_files, + "labels": labels.tolist(), + "record_ids": [m["rid"] for m in line_metas], + "record_keys": surrogate_keys, + "source_jsonl": [m["source_jsonl"] for m in line_metas], + "cluster_count": cluster_count, + "total_records": len(line_metas), + } + if report_extra: + report_payload.update(report_extra) + + report_json = out_root / "cluster_report.json" + with open(report_json, "w", encoding="utf-8") as f: + json.dump(report_payload, f, ensure_ascii=False, indent=2) + logger.info(f" ✓ 报告已保存: {report_json}") + + cluster_list_path = out_root / "cluster_list.jsonl" + 
list_rows: List[Dict[str, Any]] = [] + for i, m in enumerate(line_metas): + lab = int(labels[i]) + list_rows.append( + { + "global_index": i, + "layout_cluster_id": lab, + "source_jsonl": m["source_jsonl"], + "source_name": m["source_name"], + "line_no": m["line_no"], + "record_id": m["rid"], + } + ) + write_jsonl_lines(cluster_list_path, list_rows) + logger.info(f" ✓ 聚类清单已保存: {cluster_list_path}") + + info_txt = out_root / "cluster_info.txt" + with open(info_txt, "w", encoding="utf-8") as f: + f.write("Crawl JSONL 布局聚类结果\n") + f.write("=" * 70 + "\n\n") + if report_extra and report_extra.get("jsonl_files"): + f.write(f"源 JSONL 文件数: {len(report_extra['jsonl_files'])}\n") + f.write(f"总行数: {len(line_metas)}\n") + f.write(f"簇数: {cluster_count}\n") + f.write(f"噪声: {noise_count}\n\n") + for lbl in unique_labels: + ids = [line_metas[i]["rid"] for i in range(len(line_metas)) if labels[i] == lbl] + if lbl == -1: + f.write(f"噪声点: {len(ids)} 条\n") + else: + f.write(f"簇 {lbl}: {len(ids)} 条\n") + for rid in ids[:5]: + f.write(f" - {rid}\n") + if len(ids) > 5: + f.write(f" ... 
还有 {len(ids) - 5} 条\n") + f.write("\n") + logger.info(f" ✓ 摘要已保存: {info_txt}") + + if config.should_save_item("jsonl"): + recs = [] + for i, m in enumerate(line_metas): + row = dict(m["obj"]) + if annotate_slice_rows: + row["layout_cluster_id"] = int(labels[i]) + row["crawl_source_name"] = m["source_name"] + row["crawl_line_no"] = m["line_no"] + recs.append(row) + split_jsonl_by_cluster_labels( + recs, + labels.tolist(), + out_dir=out_root, + stem=stem, + ) + logger.info(f" ✓ 已按簇写出 JSONL 切片到: {out_root}") + + logger.info(f"✓ 结果已保存到: {out_root}") + + clusters_only = {k: v for k, v in clusters_dict.items() if k.startswith("cluster_")} + return ClusterResult( + clusters=clusters_only, + labels=labels, + noise_files=noise_files, + cluster_count=cluster_count, + ) + + +def classify_crawl_jsonl( + config: Web2JsonConfig, + jsonl_path: Optional[str] = None, + *, + html_field: Optional[str] = None, + record_id_field: Optional[str] = None, + annotate_slice_rows: bool = False, +) -> ClusterResult: + """API:对 crawl JSONL 按布局聚类,并按簇写出切片 JSONL(不物化 HTML 到磁盘)。 + + 从每行 JSON 的 ``html``(或 ``html_field``)读入 HTML,布局特征与 + ``classify_html_dir`` 相同。输出文件名形如 + ``{jsonl 主名}_cluster_0.jsonl``、``{主名}_noise.jsonl``。 + + - ``config.html_path`` 或参数 ``jsonl_path``:指向 ``.jsonl`` 文件。 + - ``config.save`` 含 ``report`` 时写 ``cluster_report.json`` / ``cluster_info.txt`` / + ``cluster_list.jsonl``;含 ``jsonl`` 时写各簇切片(推荐 ``['report', 'jsonl']``)。 + + ``ClusterResult.clusters`` 的值为 **record_id** 列表(``track_id`` 或 ``line_{n}``), + 不是文件路径。 + """ + from web2json.tools.crawl_jsonl import load_crawl_line_metas_for_file + + _setup_logger() + path = Path(jsonl_path or config.html_path) + if not path.is_file() or path.suffix.lower() != ".jsonl": + raise ValueError("classify_crawl_jsonl 需要指向一个 .jsonl 文件") + + hf = html_field if html_field is not None else getattr(config, "crawl_html_field", "html") + rid_f = ( + record_id_field + if record_id_field is not None + else getattr(config, "crawl_jsonl_id_field", 
"track_id") + ) + + logger.info(f"[API] classify_crawl_jsonl - 从 JSONL 读 html 并布局聚类") + logger.info(f" JSONL: {path}") + logger.info(f" html 字段: {hf}, id 字段: {rid_f}") + if config.should_save(): + logger.info(f" 保存内容: {', '.join(config.save)}") + logger.info(f" 输出路径: {config.get_full_output_path()}") + + line_metas = load_crawl_line_metas_for_file(path, html_field=hf, record_id_field=rid_f) + report_extra = { + "jsonl_path": str(path.resolve()), + "mode": "single_file", + } + return _execute_crawl_layout_cluster( + line_metas, + config, + output_stem=path.stem, + hf=hf, + rid_f=rid_f, + report_extra=report_extra, + annotate_slice_rows=annotate_slice_rows, + ) + + +def classify_crawl_jsonl_dir( + config: Web2JsonConfig, + jsonl_dir: Optional[str] = None, + *, + recursive: bool = True, + html_field: Optional[str] = None, + record_id_field: Optional[str] = None, + output_stem: str = "ms_web_jwn_union", + annotate_slice_rows: bool = True, +) -> ClusterResult: + """API:将目录下**所有** ``*.jsonl`` 合并为一次布局聚类(不按文件顺序串行跑 schema 流水线)。 + + 适用于 ``Prod/ms-web-jwn`` 等多文件场景:先统一得到 ``cluster_list.jsonl``, + 再按簇分别对切片调用 ``extract_schema`` / ``infer_code`` / ``extract_data_with_code``。 + + - ``jsonl_dir`` 或 ``config.html_path``:目录路径。 + - ``recursive``:是否递归子目录查找 ``*.jsonl``。 + - ``output_stem``:输出切片文件名前缀,如 ``{stem}_cluster_0.jsonl``。 + - ``annotate_slice_rows``:为 ``jsonl`` 切片每行附加 ``layout_cluster_id``、``crawl_source_name``、``crawl_line_no``(与 ``cluster_list`` 对齐,无 ``_w2j`` 前缀)。 + + 需在 ``config.save`` 中包含 ``report``(含 ``cluster_list.jsonl``)与可选 ``jsonl``。 + """ + from web2json.tools.crawl_jsonl import discover_jsonl_files, load_crawl_line_metas_for_file + + _setup_logger() + root = Path(jsonl_dir or config.html_path) + if not root.is_dir(): + raise NotADirectoryError(f"classify_crawl_jsonl_dir 需要目录: {root}") + + hf = html_field if html_field is not None else getattr(config, "crawl_html_field", "html") + rid_f = ( + record_id_field + if record_id_field is not None + else getattr(config, 
"crawl_jsonl_id_field", "track_id") + ) + + files = discover_jsonl_files(root, recursive=recursive) + if not files: + raise FileNotFoundError(f"目录下未找到 .jsonl: {root}") + + logger.info(f"[API] classify_crawl_jsonl_dir - 合并 {len(files)} 个 JSONL 做一次布局聚类") + for fp in files[:20]: + logger.info(f" - {fp}") + if len(files) > 20: + logger.info(f" ... 共 {len(files)} 个文件") + + line_metas: List[Dict[str, Any]] = [] + for fp in files: + line_metas.extend(load_crawl_line_metas_for_file(fp, html_field=hf, record_id_field=rid_f)) + + if not line_metas: + raise ValueError("合并后无任何有效行") + + report_extra = { + "mode": "multi_jsonl_union", + "jsonl_dir": str(root.resolve()), + "jsonl_files": [str(p.resolve()) for p in files], + "jsonl_file_count": len(files), + } + if config.should_save(): + logger.info(f" 保存内容: {', '.join(config.save)}") + logger.info(f" 输出路径: {config.get_full_output_path()}") + + return _execute_crawl_layout_cluster( + line_metas, + config, + output_stem=output_stem, + hf=hf, + rid_f=rid_f, + report_extra=report_extra, + annotate_slice_rows=annotate_slice_rows, + ) + + def _cleanup_unwanted_files(output_path: Path, save_items: List[str], api_type: str = "extract_data"): """ 清理不需要保存的文件,只保留save列表中指定的内容 @@ -336,9 +692,9 @@ def extract_data(config: Web2JsonConfig) -> ExtractDataResult: logger.info(f" 保存内容: {', '.join(config.save)}") logger.info(f" 输出路径: {config.get_full_output_path()}") - # 读取HTML文件 - html_files = _read_html_files(config.html_path) - logger.info(f"找到 {len(html_files)} 个HTML文件") + jsonl_cleanup: Optional[Callable[[], None]] = None + html_files, jsonl_cleanup = _resolve_pipeline_html_files(config) + logger.info(f"找到 {len(html_files)} 个 HTML 文件") # 根据是否需要保存决定使用临时目录还是持久目录 import tempfile @@ -483,6 +839,8 @@ def extract_data(config: Web2JsonConfig) -> ExtractDataResult: ) finally: + if jsonl_cleanup: + jsonl_cleanup() # 根据配置决定清理策略 if use_temp_dir: # 临时目录:完全清理 @@ -539,9 +897,9 @@ def extract_schema(config: Web2JsonConfig) -> ExtractSchemaResult: 
logger.info(f" 保存内容: {', '.join(config.save)}") logger.info(f" 输出路径: {config.get_full_output_path()}") - # 读取HTML文件 - html_files = _read_html_files(config.html_path) - logger.info(f"找到 {len(html_files)} 个HTML文件") + jsonl_cleanup: Optional[Callable[[], None]] = None + html_files, jsonl_cleanup = _resolve_pipeline_html_files(config) + logger.info(f"找到 {len(html_files)} 个 HTML 文件") # 根据是否需要保存决定使用临时目录还是持久目录 import tempfile @@ -643,6 +1001,8 @@ def extract_schema(config: Web2JsonConfig) -> ExtractSchemaResult: ) finally: + if jsonl_cleanup: + jsonl_cleanup() # 根据配置决定清理策略 if use_temp_dir: # 临时目录:完全清理 @@ -714,16 +1074,9 @@ def infer_code(config: Web2JsonConfig) -> InferCodeResult: logger.info(f" 保存内容: {', '.join(config.save)}") logger.info(f" 输出路径: {config.get_full_output_path()}") - # 处理HTML路径(可能是目录或单个文件) - html_file_path = Path(config.html_path) - if html_file_path.is_dir(): - html_files = _read_html_files(config.html_path) - elif html_file_path.is_file(): - html_files = [str(html_file_path.absolute())] - else: - raise FileNotFoundError(f"HTML路径不存在: {config.html_path}") - - logger.info(f"找到 {len(html_files)} 个HTML文件") + jsonl_cleanup: Optional[Callable[[], None]] = None + html_files, jsonl_cleanup = _resolve_pipeline_html_files(config) + logger.info(f"找到 {len(html_files)} 个 HTML 文件") # 根据是否需要保存决定使用临时目录还是持久目录 import tempfile @@ -824,6 +1177,8 @@ def infer_code(config: Web2JsonConfig) -> InferCodeResult: ) finally: + if jsonl_cleanup: + jsonl_cleanup() # 根据配置决定清理策略 if use_temp_dir: # 临时目录:完全清理 @@ -913,16 +1268,9 @@ def extract_data_with_code(config: Web2JsonConfig) -> ParseResult: logger.info(f" 保存内容: {', '.join(config.save)}") logger.info(f" 输出路径: {config.get_full_output_path()}") - # 处理HTML路径(可能是目录或单个文件) - html_file_path = Path(config.html_path) - if html_file_path.is_dir(): - html_files = _read_html_files(config.html_path) - elif html_file_path.is_file(): - html_files = [str(html_file_path.absolute())] - else: - raise FileNotFoundError(f"HTML路径不存在: {config.html_path}") 
- - logger.info(f"找到 {len(html_files)} 个HTML文件") + jsonl_cleanup: Optional[Callable[[], None]] = None + html_files, jsonl_cleanup = _resolve_pipeline_html_files(config) + logger.info(f"找到 {len(html_files)} 个 HTML 文件") # 确定是否需要保存到磁盘 should_save = config.should_save() @@ -978,6 +1326,8 @@ def extract_data_with_code(config: Web2JsonConfig) -> ParseResult: ) finally: + if jsonl_cleanup: + jsonl_cleanup() # 清理临时parser文件 import os if os.path.exists(temp_parser_path): @@ -1034,6 +1384,20 @@ def classify_html_dir(config: Web2JsonConfig) -> ClusterResult: logger.info(f" 保存内容: {', '.join(config.save)}") logger.info(f" 输出路径: {config.get_full_output_path()}") + p_html = Path(config.html_path) + if p_html.is_file() and p_html.suffix.lower() == ".jsonl": + logger.info(" 检测到 crawl .jsonl:改用 classify_crawl_jsonl(按 html 字段聚类并写出切片)") + return classify_crawl_jsonl(config) + + if p_html.is_dir(): + has_html = bool(list(p_html.glob("*.html")) + list(p_html.glob("*.htm"))) + top_jsonl = list(p_html.glob("*.jsonl")) + if top_jsonl and not has_html: + logger.info( + " 检测到目录顶层仅有 .jsonl(无 .html):改用 classify_crawl_jsonl_dir 合并聚类" + ) + return classify_crawl_jsonl_dir(config, jsonl_dir=str(p_html), recursive=False) + # 读取HTML文件 html_files = _read_html_files(config.html_path) logger.info(f"找到 {len(html_files)} 个HTML文件") diff --git a/web2json/tools/crawl_jsonl.py b/web2json/tools/crawl_jsonl.py new file mode 100644 index 0000000..c9d8182 --- /dev/null +++ b/web2json/tools/crawl_jsonl.py @@ -0,0 +1,217 @@ +""" +Crawl JSONL helpers: cluster split & optional materialize to .html for legacy pipeline. + +JSONL rows are expected to be JSON objects with at least a string field for HTML +(default key ``html``). Record identity defaults to ``track_id`` or ``line_{n}``. 
+""" +from __future__ import annotations + +import json +import re +from pathlib import Path +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple + +_SAFE_NAME = re.compile(r"[^a-zA-Z0-9._-]+") + + +def _safe_filename_part(s: str, max_len: int = 80) -> str: + s = _SAFE_NAME.sub("_", s.strip())[:max_len] + return s or "id" + + +def iter_crawl_jsonl_records( + jsonl_path: Path, + *, + html_field: str = "html", + id_field: Optional[str] = "track_id", +) -> Iterator[Tuple[int, str, Dict[str, Any], str]]: + """ + Yield (line_index_1based, record_id, obj, html_string) for each line with usable html. + + Lines that are empty, non-JSON, or missing html are skipped (not yielded). + """ + with jsonl_path.open("r", encoding="utf-8") as f: + for i, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + if not isinstance(obj, dict): + continue + html = obj.get(html_field) + if not isinstance(html, str) or not html.strip(): + continue + if id_field and obj.get(id_field) is not None: + rid = str(obj[id_field]) + else: + rid = f"line_{i}" + yield i, rid, obj, html + + +def materialize_jsonl_to_html_dir( + jsonl_path: Path, + dest_dir: Path, + *, + html_field: str = "html", + id_field: Optional[str] = "track_id", +) -> List[str]: + """ + Write one ``.html`` file per JSONL row (only rows with non-empty ``html_field``). + + Filenames: ``{line_index:06d}_{safe_id}.html`` to preserve order and avoid collisions. + + Returns sorted list of absolute paths to written HTML files. 
+ """ + dest_dir.mkdir(parents=True, exist_ok=True) + out: List[str] = [] + for line_no, rid, _obj, html in iter_crawl_jsonl_records( + jsonl_path, html_field=html_field, id_field=id_field + ): + safe = _safe_filename_part(rid) + name = f"{line_no:06d}_{safe}.html" + p = dest_dir / name + p.write_text(html, encoding="utf-8") + out.append(str(p.resolve())) + out.sort() + return out + + +def discover_jsonl_files(directory: Path, *, recursive: bool = True) -> List[Path]: + """Return sorted ``*.jsonl`` paths under ``directory`` (``rglob`` if ``recursive``).""" + if not directory.is_dir(): + raise NotADirectoryError(f"不是目录: {directory}") + if recursive: + found = sorted(directory.rglob("*.jsonl")) + else: + found = sorted(directory.glob("*.jsonl")) + return [p for p in found if p.is_file()] + + +def load_crawl_line_metas_for_file( + jsonl_path: Path, + *, + html_field: str = "html", + record_id_field: Optional[str] = "track_id", +) -> List[Dict[str, Any]]: + """ + 解析单个 crawl JSONL,每行一条 meta(与 ``classify_crawl_jsonl`` 规则一致)。 + + 每条 meta 含: ``line_no``, ``obj``, ``html`` (可 None), ``rid``, ``source_jsonl``, ``source_name``。 + """ + source_jsonl = str(jsonl_path.resolve()) + source_name = jsonl_path.name + metas: List[Dict[str, Any]] = [] + with jsonl_path.open("r", encoding="utf-8") as f: + for line_no, raw in enumerate(f, 1): + raw = raw.strip() + if not raw: + continue + try: + obj = json.loads(raw) + except json.JSONDecodeError: + metas.append( + { + "line_no": line_no, + "obj": { + "_classify_json_error": "json_decode_error", + "line": line_no, + "source_jsonl": source_jsonl, + }, + "html": None, + "rid": f"{source_name}:{line_no}", + "source_jsonl": source_jsonl, + "source_name": source_name, + } + ) + continue + if not isinstance(obj, dict): + metas.append( + { + "line_no": line_no, + "obj": { + "_classify_json_error": "not_a_json_object", + "line": line_no, + "source_jsonl": source_jsonl, + }, + "html": None, + "rid": f"{source_name}:{line_no}", + "source_jsonl": 
source_jsonl, + "source_name": source_name, + } + ) + continue + html = obj.get(html_field) + if not isinstance(html, str) or not html.strip(): + rid_local = ( + str(obj[record_id_field]) + if record_id_field and obj.get(record_id_field) is not None + else f"line_{line_no}" + ) + metas.append( + { + "line_no": line_no, + "obj": obj, + "html": None, + "rid": rid_local, + "source_jsonl": source_jsonl, + "source_name": source_name, + } + ) + continue + rid_local = ( + str(obj[record_id_field]) + if record_id_field and obj.get(record_id_field) is not None + else f"line_{line_no}" + ) + metas.append( + { + "line_no": line_no, + "obj": obj, + "html": html, + "rid": rid_local, + "source_jsonl": source_jsonl, + "source_name": source_name, + } + ) + return metas + + +def write_jsonl_lines(path: Path, rows: List[Dict[str, Any]]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as f: + for row in rows: + f.write(json.dumps(row, ensure_ascii=False) + "\n") + + +def split_jsonl_by_cluster_labels( + records: List[Dict[str, Any]], + labels: List[int], + *, + out_dir: Path, + stem: str, +) -> Dict[str, Path]: + """ + Write ``{stem}_cluster_{k}.jsonl`` and ``{stem}_noise.jsonl`` for label -1. + + ``records`` and ``labels`` must have the same length. + """ + if len(records) != len(labels): + raise ValueError("records and labels length mismatch") + + buckets: Dict[str, List[Dict[str, Any]]] = {} + for rec, lab in zip(records, labels): + key = "noise" if lab == -1 else f"cluster_{lab}" + buckets.setdefault(key, []).append(rec) + + written: Dict[str, Path] = {} + for key, rows in buckets.items(): + if key == "noise": + path = out_dir / f"{stem}_noise.jsonl" + else: + path = out_dir / f"{stem}_{key}.jsonl" + write_jsonl_lines(path, rows) + written[key] = path + return written