diff --git a/README_JUPYTER.md b/README_JUPYTER.md new file mode 100644 index 0000000..101160a --- /dev/null +++ b/README_JUPYTER.md @@ -0,0 +1,195 @@ +# web2json-agent Jupyter Guide + +这个文档专门基于 [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) 来写,目标是在 Jupyter 里直接跑完整 `jsonl -> html -> classify -> schema -> code -> data` 流水线。 + +它不覆盖项目原始 [README.md](/Users/luqing/Downloads/multiModal/web2json-agent/README.md)。 + +## 这份文档对应哪条执行链路 + +这里用的不是最简单的 `extract_data(...)` 单接口方案,而是项目里的完整脚本流水线: + +- 入口脚本: [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) +- Jupyter 包装: [jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/jupyter_helper.py) +- Notebook helper 实现: [notebooks/jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/jupyter_helper.py) +- 示例 notebook: [notebooks/web2json_quickstart.ipynb](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/web2json_quickstart.ipynb) + +## 流水线做了什么 + +脚本会按下面顺序执行: + +1. 读取 `jsonl` +2. 从每条记录里取出 `html` 字段 +3. 拆成一批 `.html` 文件,并生成 `manifest.jsonl` +4. 对 HTML 做 `classify_html_dir` +5. 对每个 cluster 执行 `extract_schema` +6. 执行 `infer_code` +7. 用生成的 parser 执行 `extract_data_with_code` +8. 输出 `pipeline_summary.json` + +适合这种输入数据: + +- 原始数据是 `jsonl` +- 每行是一条网页记录 +- 每条记录里有 `html` 字段 +- 可能还带 `url`、`track_id`、`status` + +## Jupyter 最短路径 + +### 1. 进入项目目录 + +```bash +cd /Users/luqing/Downloads/multiModal/web2json-agent +``` + +### 2. 安装项目 + +请显式使用 `python3.11`,不要用系统默认的旧版 `python3`。 + +```bash +python3.11 -m pip install . +``` + +### 3. 启动 Jupyter + +```bash +python3.11 -m notebook +``` + +或者: + +```bash +python3.11 -m jupyter lab +``` + +### 4. 
打开示例 notebook + +打开: + +`notebooks/web2json_quickstart.ipynb` + +## Notebook 最小示例 + +### Cell 1: 初始化环境 + +```python +from jupyter_helper import prepare_notebook + +prepare_notebook( + api_key="YOUR_API_KEY", + api_base="https://api.openai.com/v1", +) +``` + +### Cell 2: 运行完整 JSONL pipeline + +```python +from jupyter_helper import run_jsonl_pipeline, summarize_pipeline_result + +result = run_jsonl_pipeline( + source_jsonl="ToClassify/sample.json", + work_id="sample_run", + input_root="input_html", + output_root="output", + html_key="html", + iteration_rounds=3, + cluster_limit=1, +) + +summarize_pipeline_result(result) +``` + +### Cell 3: 查看完整结果 + +```python +result.to_dict() +``` + +## 也可以直接调用原脚本 + +如果你不想通过 helper,也可以在 notebook 里直接 import 原脚本里的函数: + +```python +from scripts.run_jsonl_web2json_pipeline import run_jsonl_pipeline + +result = run_jsonl_pipeline( + source_jsonl="ToClassify/sample.json", + work_id="sample_run", +) +``` + +这就是 [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) 里新增的 notebook-friendly 入口。 + +## 参数说明 + +`run_jsonl_pipeline(...)` 主要参数: + +- `source_jsonl`: 源 `jsonl` 路径 +- `work_id`: 这次运行的标识;为空时按文件名自动生成 +- `input_root`: 拆分后 HTML 的输出根目录,默认 `input_html` +- `output_root`: pipeline 输出根目录,默认 `output` +- `html_key`: `jsonl` 中 HTML 字段名,默认 `html` +- `iteration_rounds`: schema 学习轮数上限,默认 `3` +- `cluster_limit`: 最多处理多少个 cluster,默认 `0`,表示全部 + +## 结果会落到哪里 + +如果你设置: + +```python +result = run_jsonl_pipeline( + source_jsonl="ToClassify/sample.json", + work_id="sample_run", +) +``` + +通常会生成: + +- `input_html/sample_run/` +- `output/sample_run_pipeline/` +- `output/sample_run_pipeline/pipeline_summary.json` + +每个 cluster 下面还会有: + +- schema 输出目录 +- code 输出目录 +- data 输出目录 +- 最终 parser 文件 + +## API Key 配置 + +你可以二选一: + +### 方式 A: 在 notebook 里设置 + +```python +from jupyter_helper import prepare_notebook + +prepare_notebook( + api_key="YOUR_API_KEY", + 
api_base="https://api.openai.com/v1", +) +``` + +### 方式 B: 在项目根目录放 `.env` + +```env +OPENAI_API_KEY=YOUR_API_KEY +OPENAI_API_BASE=https://api.openai.com/v1 +DEFAULT_MODEL=gpt-4.1 +``` + +## 已知前提 + +- Python 要求 `>= 3.10` +- 当前这台机器上默认 `python3` 是旧的 `3.7.3` +- 建议始终显式使用 `python3.11` +- 这条流水线依赖模型 API,可用前需要配置好 key/base + +## 相关文件 + +- [README.md](/Users/luqing/Downloads/multiModal/web2json-agent/README.md) +- [README_JUPYTER.md](/Users/luqing/Downloads/multiModal/web2json-agent/README_JUPYTER.md) +- [scripts/run_jsonl_web2json_pipeline.py](/Users/luqing/Downloads/multiModal/web2json-agent/scripts/run_jsonl_web2json_pipeline.py) +- [jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/jupyter_helper.py) +- [notebooks/jupyter_helper.py](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/jupyter_helper.py) +- [notebooks/web2json_quickstart.ipynb](/Users/luqing/Downloads/multiModal/web2json-agent/notebooks/web2json_quickstart.ipynb) diff --git a/jupyter_helper.py b/jupyter_helper.py new file mode 100644 index 0000000..d82c824 --- /dev/null +++ b/jupyter_helper.py @@ -0,0 +1,3 @@ +"""Compatibility wrapper so notebooks can import jupyter_helper from multiple locations.""" + +from notebooks.jupyter_helper import * # noqa: F401,F403 diff --git a/notebooks/jupyter_helper.py b/notebooks/jupyter_helper.py new file mode 100644 index 0000000..801d70e --- /dev/null +++ b/notebooks/jupyter_helper.py @@ -0,0 +1,136 @@ +"""Utilities for running web2json-agent inside Jupyter notebooks.""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path +from typing import Any, Optional, Sequence + +PROJECT_ROOT = Path(__file__).resolve().parents[1] + + +def prepare_notebook( + api_key: Optional[str] = None, + api_base: Optional[str] = None, + project_root: Optional[str] = None, +) -> Path: + """Prepare the notebook process for local package imports and env loading.""" + root = Path(project_root).expanduser().resolve() if 
project_root else PROJECT_ROOT + + if str(root) not in sys.path: + sys.path.insert(0, str(root)) + + os.chdir(root) + + if api_key: + os.environ["OPENAI_API_KEY"] = api_key + + if api_base: + os.environ["OPENAI_API_BASE"] = api_base + + return root + + +def make_extract_config( + name: str, + html_path: str, + output_path: str = "output", + save: Optional[Sequence[str]] = ("schema", "code", "data"), + schema: Optional[dict[str, Any]] = None, + iteration_rounds: int = 3, + enable_schema_edit: bool = False, + remove_null_fields: bool = True, + parser_code: Optional[str] = None, +): + """Build a Web2JsonConfig with notebook-friendly path resolution.""" + prepare_notebook() + + from web2json import Web2JsonConfig + + html_target = _resolve_project_path(html_path) + output_target = _resolve_project_path(output_path) + + return Web2JsonConfig( + name=name, + html_path=str(html_target), + output_path=str(output_target), + iteration_rounds=iteration_rounds, + schema=schema, + enable_schema_edit=enable_schema_edit, + parser_code=parser_code, + save=list(save) if save is not None else None, + remove_null_fields=remove_null_fields, + ) + + +def preview_records(records: Sequence[dict[str, Any]], limit: int = 3) -> list[dict[str, Any]]: + """Return the first few parsed records so a notebook cell renders them directly.""" + return list(records[:limit]) + + +def print_schema(schema: dict[str, Any]) -> None: + """Pretty print schema content inside notebooks.""" + print(json.dumps(schema, ensure_ascii=False, indent=2)) + + +def summarize_cluster_result(cluster_result: Any) -> dict[str, Any]: + """Convert a cluster result into a compact notebook-friendly summary.""" + return { + "cluster_count": cluster_result.cluster_count, + "clusters": {name: len(files) for name, files in cluster_result.clusters.items()}, + "noise_files": len(cluster_result.noise_files), + } + + +def run_jsonl_pipeline( + source_jsonl: str, + work_id: str = "", + input_root: str = "input_html", + output_root: str 
= "output", + html_key: str = "html", + iteration_rounds: int = 3, + cluster_limit: int = 0, +): + """Run the full JSONL pipeline from a notebook and return the structured summary.""" + prepare_notebook() + + from scripts.run_jsonl_web2json_pipeline import run_jsonl_pipeline as _run_jsonl_pipeline + + return _run_jsonl_pipeline( + source_jsonl=str(_resolve_project_path(source_jsonl)), + work_id=work_id, + input_root=str(_resolve_project_path(input_root)), + output_root=str(_resolve_project_path(output_root)), + html_key=html_key, + iteration_rounds=iteration_rounds, + cluster_limit=cluster_limit, + ) + + +def summarize_pipeline_result(result: Any) -> dict[str, Any]: + """Build a compact summary view for notebook display.""" + return { + "source_jsonl": result.source_jsonl, + "pipeline_root": result.pipeline_root, + "cluster_count": result.cluster_count, + "clusters": [ + { + "cluster_name": cluster["cluster_name"], + "cluster_size": cluster["cluster_size"], + "parse_success_count": cluster["parse_success_count"], + "parse_failed_count": cluster["parse_failed_count"], + } + for cluster in result.clusters + ], + "total_token_usage": result.total_token_usage, + "summary_path": result.summary_path, + } + + +def _resolve_project_path(path_str: str) -> Path: + path = Path(path_str).expanduser() + if path.is_absolute(): + return path + return (PROJECT_ROOT / path).resolve() diff --git a/notebooks/web2json_quickstart.ipynb b/notebooks/web2json_quickstart.ipynb new file mode 100644 index 0000000..da203aa --- /dev/null +++ b/notebooks/web2json_quickstart.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# web2json-agent JSONL Pipeline Quickstart\n", + "\n", + "这个 notebook 基于 `scripts/run_jsonl_web2json_pipeline.py`,按顺序运行下面几个单元即可。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import sys\n", + "\n", + "PROJECT_ROOT = 
Path.cwd()\n", + "if PROJECT_ROOT.name == \"notebooks\":\n", + " PROJECT_ROOT = PROJECT_ROOT.parent\n", + "if str(PROJECT_ROOT) not in sys.path:\n", + " sys.path.insert(0, str(PROJECT_ROOT))\n", + "\n", + "from jupyter_helper import prepare_notebook\n", + "\n", + "PROJECT_ROOT = prepare_notebook(\n", + " api_key=\"YOUR_API_KEY\",\n", + " api_base=\"https://api.openai.com/v1\", # 如果你使用兼容网关,请替换这里\n", + ")\n", + "\n", + "PROJECT_ROOT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from jupyter_helper import run_jsonl_pipeline, summarize_pipeline_result\n", + "\n", + "result = run_jsonl_pipeline(\n", + " source_jsonl=\"ToClassify/sample.json\",\n", + " work_id=\"sample_run\",\n", + " input_root=\"input_html\",\n", + " output_root=\"output\",\n", + " html_key=\"html\",\n", + " iteration_rounds=3,\n", + " cluster_limit=1,\n", + ")\n", + "\n", + "summarize_pipeline_result(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result.to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result.summary_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_dir = PROJECT_ROOT / \"output\" / \"sample_run_pipeline\"\n", + "output_dir" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/build_html_manifest.py b/scripts/build_html_manifest.py new file mode 100644 index 0000000..8d1508f --- /dev/null +++ b/scripts/build_html_manifest.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +""" +从 crawl jsonl 拆分出 HTML 文件,并生成 manifest.jsonl 索引。 + +示例: +python scripts/build_html_manifest.py \ + --source 
ToClassify/example.jsonl \ + --output-dir input_html/example_set +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Split a crawl jsonl into numbered HTML files plus manifest.jsonl." + ) + parser.add_argument( + "--source", + required=True, + help="源 jsonl 文件路径,每行应至少包含 html 字段。", + ) + parser.add_argument( + "--output-dir", + required=True, + help="输出目录,会写入 0001.html... 和 manifest.jsonl。", + ) + parser.add_argument( + "--html-key", + default="html", + help="HTML 内容字段名,默认 html。", + ) + parser.add_argument( + "--start-index", + type=int, + default=1, + help="输出编号起始值,默认 1。", + ) + parser.add_argument( + "--width", + type=int, + default=4, + help="输出文件编号宽度,默认 4,例如 0001.html。", + ) + parser.add_argument( + "--limit", + type=int, + default=0, + help="最多处理多少条记录,0 表示不限制。", + ) + parser.add_argument( + "--skip-empty-html", + action="store_true", + help="遇到缺失或空 html 时跳过该记录;默认直接报错。", + ) + return parser.parse_args() + + +def ensure_text(value: Any) -> str: + if value is None: + return "" + if isinstance(value, str): + return value + return str(value) + + +def main() -> None: + args = parse_args() + + source = Path(args.source) + output_dir = Path(args.output_dir) + + if not source.exists(): + raise FileNotFoundError(f"Source jsonl not found: {source}") + + output_dir.mkdir(parents=True, exist_ok=True) + manifest_path = output_dir / "manifest.jsonl" + + processed_count = 0 + skipped_count = 0 + current_index = args.start_index + + with source.open("r", encoding="utf-8") as src, manifest_path.open( + "w", encoding="utf-8" + ) as manifest_fp: + for source_line, line in enumerate(src, start=1): + if args.limit and processed_count >= args.limit: + break + + line = line.strip() + if not line: + skipped_count += 1 + continue + + try: + record = json.loads(line) + except json.JSONDecodeError as exc: + raise 
ValueError(f"Invalid JSON at line {source_line}: {exc}") from exc + + html = ensure_text(record.get(args.html_key)) + if not html.strip(): + if args.skip_empty_html: + skipped_count += 1 + continue + raise ValueError( + f"Missing or empty '{args.html_key}' at line {source_line}" + ) + + filename = f"{current_index:0{args.width}d}.html" + html_path = output_dir / filename + html_path.write_text(html, encoding="utf-8") + + manifest_record = { + "sample_no": current_index, + "source_line": source_line, + "filename": filename, + "track_id": record.get("track_id"), + "url": record.get("url"), + "status": record.get("status"), + "html_len": len(html), + } + + manifest_fp.write(json.dumps(manifest_record, ensure_ascii=False) + "\n") + + processed_count += 1 + current_index += 1 + + print(f"source: {source}") + print(f"output_dir: {output_dir}") + print(f"manifest: {manifest_path}") + print(f"processed: {processed_count}") + print(f"skipped: {skipped_count}") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_jsonl_web2json_pipeline.py b/scripts/run_jsonl_web2json_pipeline.py new file mode 100644 index 0000000..bf7ff33 --- /dev/null +++ b/scripts/run_jsonl_web2json_pipeline.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +""" +对 jsonl 执行完整 web2json 流水线: +1. 拆分 html + manifest +2. classify_html_dir +3. 对每个 cluster 执行 extract_schema +4. infer_code +5. extract_data_with_code +6. 
汇总 token 使用 +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from web2json import ( + Web2JsonConfig, + classify_html_dir, + extract_schema, + infer_code, + extract_data_with_code, +) +from web2json.utils.llm_client import LLMClient + + +@dataclass +class PipelineRunResult: + source_jsonl: str + manifest: str + html_dir: str + pipeline_root: str + cluster_count: int + clusters: list[dict[str, Any]] + total_token_usage: dict[str, int] + summary_path: str + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run web2json pipeline on a crawl jsonl.") + parser.add_argument("--source-jsonl", required=True, help="源 jsonl 文件路径。") + parser.add_argument( + "--work-id", + default="", + help="输出目录标识。默认根据 jsonl 文件名自动生成。", + ) + parser.add_argument( + "--input-root", + default="input_html", + help="HTML 输出根目录,默认 input_html。", + ) + parser.add_argument( + "--output-root", + default="output", + help="结果输出根目录,默认 output。", + ) + parser.add_argument( + "--html-key", + default="html", + help="jsonl 中 HTML 字段名,默认 html。", + ) + parser.add_argument( + "--iteration-rounds", + type=int, + default=3, + help="schema 学习轮数上限,默认 3。", + ) + parser.add_argument( + "--cluster-limit", + type=int, + default=0, + help="最多处理多少个 cluster,0 表示全部处理。", + ) + return parser.parse_args() + + +def slugify(value: str) -> str: + value = re.sub(r"[^a-zA-Z0-9]+", "_", value).strip("_").lower() + return value or "run" + + +def load_jsonl(path: Path) -> list[tuple[int, dict[str, Any]]]: + rows: list[tuple[int, dict[str, Any]]] = [] + with path.open("r", encoding="utf-8") as fp: + for line_no, line in enumerate(fp, start=1): + line = 
line.strip() + if not line: + continue + try: + rows.append((line_no, json.loads(line))) + except json.JSONDecodeError as exc: + print(f"skip invalid json line {line_no}: {exc}") + return rows + + +def build_html_manifest(source_jsonl: Path, output_dir: Path, html_key: str) -> Path: + rows = load_jsonl(source_jsonl) + output_dir.mkdir(parents=True, exist_ok=True) + manifest_path = output_dir / "manifest.jsonl" + + with manifest_path.open("w", encoding="utf-8") as manifest_fp: + for idx, (source_line, row) in enumerate(rows, start=1): + html = row.get(html_key) + if not isinstance(html, str) or not html.strip(): + continue + + filename = f"{idx:04d}.html" + (output_dir / filename).write_text(html, encoding="utf-8") + + manifest_row = { + "sample_no": idx, + "source_line": source_line, + "filename": filename, + "track_id": row.get("track_id"), + "url": row.get("url"), + "status": row.get("status"), + "html_len": len(html), + } + manifest_fp.write(json.dumps(manifest_row, ensure_ascii=False) + "\n") + + return manifest_path + + +def usage_delta(before: dict[str, int], after: dict[str, int]) -> dict[str, int]: + return { + "total_input_tokens": after["total_input_tokens"] - before["total_input_tokens"], + "total_completion_tokens": after["total_completion_tokens"] - before["total_completion_tokens"], + "total_tokens": after["total_tokens"] - before["total_tokens"], + } + + +def run_jsonl_pipeline( + source_jsonl: str, + work_id: str = "", + input_root: str = "input_html", + output_root: str = "output", + html_key: str = "html", + iteration_rounds: int = 3, + cluster_limit: int = 0, +) -> PipelineRunResult: + source_jsonl_path = Path(source_jsonl).expanduser() + if not source_jsonl_path.is_absolute(): + source_jsonl_path = (PROJECT_ROOT / source_jsonl_path).resolve() + + work_id = work_id or slugify(source_jsonl_path.stem) + + input_root_path = Path(input_root).expanduser() + if not input_root_path.is_absolute(): + input_root_path = (PROJECT_ROOT / 
input_root_path).resolve() + + output_root_path = Path(output_root).expanduser() + if not output_root_path.is_absolute(): + output_root_path = (PROJECT_ROOT / output_root_path).resolve() + + html_dir = input_root_path / work_id + pipeline_root = output_root_path / f"{work_id}_pipeline" + pipeline_root.mkdir(parents=True, exist_ok=True) + + print(f"source_jsonl: {source_jsonl_path}") + print(f"work_id: {work_id}") + print(f"html_dir: {html_dir}") + print(f"pipeline_root: {pipeline_root}") + + manifest_path = build_html_manifest(source_jsonl_path, html_dir, html_key) + print(f"manifest: {manifest_path}") + + classify_config = Web2JsonConfig( + name="classify", + html_path=str(html_dir), + output_path=str(pipeline_root), + save=["report", "files"], + ) + classify_result = classify_html_dir(classify_config) + + clusters_dir = pipeline_root / "classify" / "clusters" + cluster_names = sorted(classify_result.clusters.keys()) + if cluster_limit: + cluster_names = cluster_names[:cluster_limit] + + LLMClient.reset_usage() + cluster_summaries: list[dict[str, Any]] = [] + + for cluster_name in cluster_names: + cluster_html_dir = clusters_dir / cluster_name + cluster_files = classify_result.clusters[cluster_name] + cluster_size = len(cluster_files) + rounds = min(iteration_rounds, cluster_size) + + print(f"\n=== {cluster_name} ({cluster_size} files) ===") + + before_schema = LLMClient.get_total_usage() + schema_result = extract_schema( + Web2JsonConfig( + name=f"{cluster_name}_schema", + html_path=str(cluster_html_dir), + output_path=str(pipeline_root), + iteration_rounds=rounds, + save=["schema"], + ) + ) + after_schema = LLMClient.get_total_usage() + + before_code = LLMClient.get_total_usage() + code_result = infer_code( + Web2JsonConfig( + name=f"{cluster_name}_code", + html_path=str(cluster_html_dir), + output_path=str(pipeline_root), + schema=schema_result.final_schema, + save=["schema", "code"], + ) + ) + after_code = LLMClient.get_total_usage() + + parser_path = 
pipeline_root / f"{cluster_name}_code" / "final_parser.py" + parse_result = extract_data_with_code( + Web2JsonConfig( + name=f"{cluster_name}_extract_data", + html_path=str(cluster_html_dir), + output_path=str(pipeline_root), + parser_code=str(parser_path), + save=["data"], + ) + ) + + cluster_summary = { + "cluster_name": cluster_name, + "cluster_size": cluster_size, + "html_dir": str(cluster_html_dir), + "schema_output": str(pipeline_root / f"{cluster_name}_schema"), + "code_output": str(pipeline_root / f"{cluster_name}_code"), + "data_output": str(pipeline_root / f"{cluster_name}_extract_data"), + "parser_path": str(parser_path), + "schema_fields": list(schema_result.final_schema.keys()), + "schema_token_usage": usage_delta(before_schema, after_schema), + "code_token_usage": usage_delta(before_code, after_code), + "parse_success_count": parse_result.success_count, + "parse_failed_count": parse_result.failed_count, + } + cluster_summaries.append(cluster_summary) + + total_usage = LLMClient.get_total_usage() + summary = { + "source_jsonl": str(source_jsonl_path), + "manifest": str(manifest_path), + "html_dir": str(html_dir), + "pipeline_root": str(pipeline_root), + "cluster_count": len(cluster_names), + "clusters": cluster_summaries, + "total_token_usage": total_usage, + } + + summary_path = pipeline_root / "pipeline_summary.json" + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"\nsummary: {summary_path}") + print(json.dumps(total_usage, ensure_ascii=False, indent=2)) + + return PipelineRunResult( + source_jsonl=summary["source_jsonl"], + manifest=summary["manifest"], + html_dir=summary["html_dir"], + pipeline_root=summary["pipeline_root"], + cluster_count=summary["cluster_count"], + clusters=summary["clusters"], + total_token_usage=summary["total_token_usage"], + summary_path=str(summary_path), + ) + + +def main() -> None: + args = parse_args() + run_jsonl_pipeline( + source_jsonl=args.source_jsonl, + 
work_id=args.work_id, + input_root=args.input_root, + output_root=args.output_root, + html_key=args.html_key, + iteration_rounds=args.iteration_rounds, + cluster_limit=args.cluster_limit, + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/verify_extract_alignment.py b/scripts/verify_extract_alignment.py new file mode 100644 index 0000000..1da78a8 --- /dev/null +++ b/scripts/verify_extract_alignment.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +""" +校验 source jsonl / manifest / html / result json 之间的一致性。 + +示例: +python scripts/verify_extract_alignment.py \ + --source-jsonl ToClassify/source.jsonl \ + --manifest input_html/npi_sample_2000/manifest.jsonl \ + --html-dir input_html/npi_category_detail_cluster_1 \ + --result-dir output/npi_category_detail_cluster_1_code/result \ + --output output/npi_category_detail_cluster_1_code/qa_report.json + +或者直接使用 cluster manifest: +python scripts/verify_extract_alignment.py \ + --source-jsonl ToClassify/source.jsonl \ + --cluster-manifest output/npi_category_detail_cluster_1_code/cluster_manifest.json \ + --manifest input_html/npi_sample_2000/manifest.jsonl +""" + +from __future__ import annotations + +import argparse +import json +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Any +from bs4 import BeautifulSoup + + +@dataclass +class FileReport: + filename: str + source_line: int | None + url: str | None + track_id: str | None + html_exists: bool + result_exists: bool + source_match: bool + html_len_match: bool + field_checks: dict[str, dict[str, Any]] + ok: bool + errors: list[str] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Verify alignment across source jsonl, manifest, html files, and result json." 
+ ) + parser.add_argument("--source-jsonl", required=True, help="原始 crawl jsonl 文件路径。") + parser.add_argument("--manifest", required=True, help="完整 manifest.jsonl 文件路径。") + parser.add_argument( + "--cluster-manifest", + default="", + help="cluster_manifest.json 路径。提供后会从其中自动读取 html-dir / result-dir / 文件子集 / schema-path。", + ) + parser.add_argument("--html-dir", default="", help="HTML 文件目录。") + parser.add_argument("--result-dir", default="", help="解析结果 JSON 目录。") + parser.add_argument("--schema-json", default="", help="cluster 对应的 schema.json 路径。") + parser.add_argument("--output", default="", help="QA 报告输出路径(可选)。") + parser.add_argument( + "--fields", + nargs="*", + default=None, + help="要校验是否出现在 HTML 中的结果字段。未提供时会优先从 schema 自动推断,否则回退到 title content。", + ) + return parser.parse_args() + + +def load_jsonl(path: Path) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as fp: + for line_no, line in enumerate(fp, start=1): + line = line.strip() + if not line: + continue + try: + rows.append(json.loads(line)) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON in {path} line {line_no}: {exc}") from exc + return rows + + +def load_target_files( + manifest_rows: list[dict[str, Any]], cluster_manifest_path: Path | None +) -> tuple[list[dict[str, Any]], str, str, str]: + if not cluster_manifest_path: + return manifest_rows, "", "", "" + + cluster_manifest = json.loads(cluster_manifest_path.read_text(encoding="utf-8")) + wanted = {item["filename"] for item in cluster_manifest.get("files", [])} + filtered_rows = [row for row in manifest_rows if row.get("filename") in wanted] + html_dir = cluster_manifest.get("input_dir", "") + result_dir = cluster_manifest.get("result_dir", "") + schema_path = cluster_manifest.get("schema_path", "") + return filtered_rows, html_dir, result_dir, schema_path + + +def derive_fields_from_schema(schema_path: Path) -> list[str]: + schema = 
json.loads(schema_path.read_text(encoding="utf-8")) + fields: list[str] = [] + for field_name, field_meta in schema.items(): + if not isinstance(field_meta, dict): + continue + if field_meta.get("type") == "string": + fields.append(field_name) + return fields + + +def normalize_text(value: Any) -> str: + if value is None: + return "" + if not isinstance(value, str): + value = str(value) + return " ".join(value.split()) + + +def check_field_in_html(field_value: Any, html_text: str) -> dict[str, Any]: + normalized_value = normalize_text(field_value) + normalized_html = normalize_text(html_text) + normalized_text = normalize_text(BeautifulSoup(html_text, "html.parser").get_text(" ", strip=True)) + + if not normalized_value: + return { + "value_present": False, + "raw_html_match": False, + "text_match": False, + "substring_match": False, + "value_len": 0, + } + + return { + "value_present": True, + "raw_html_match": normalized_value in normalized_html, + "text_match": normalized_value in normalized_text, + "substring_match": normalized_value in normalized_html or normalized_value in normalized_text, + "value_len": len(normalized_value), + } + + +def main() -> None: + args = parse_args() + + source_jsonl = Path(args.source_jsonl) + manifest_path = Path(args.manifest) + cluster_manifest_path = Path(args.cluster_manifest) if args.cluster_manifest else None + output_path = Path(args.output) if args.output else None + + source_rows = load_jsonl(source_jsonl) + manifest_rows = load_jsonl(manifest_path) + target_rows, cluster_html_dir, cluster_result_dir, cluster_schema_path = load_target_files( + manifest_rows, cluster_manifest_path + ) + + html_dir_str = args.html_dir or cluster_html_dir + result_dir_str = args.result_dir or cluster_result_dir + schema_json_str = args.schema_json or cluster_schema_path + if not html_dir_str or not result_dir_str: + raise ValueError("html-dir 和 result-dir 不能为空;可直接传参,或通过 cluster-manifest 提供。") + + html_dir = Path(html_dir_str) + result_dir = 
Path(result_dir_str) + schema_json_path = Path(schema_json_str) if schema_json_str else None + + if args.fields is not None: + fields_to_check = args.fields + elif schema_json_path and schema_json_path.exists(): + fields_to_check = derive_fields_from_schema(schema_json_path) + if not fields_to_check: + fields_to_check = ["title", "content"] + else: + fields_to_check = ["title", "content"] + + reports: list[FileReport] = [] + ok_count = 0 + + for manifest_row in target_rows: + filename = manifest_row["filename"] + source_line = manifest_row.get("source_line") + url = manifest_row.get("url") + track_id = manifest_row.get("track_id") + + html_path = html_dir / filename + result_path = result_dir / filename.replace(".html", ".json") + + errors: list[str] = [] + html_exists = html_path.exists() + result_exists = result_path.exists() + source_match = False + html_len_match = False + field_checks: dict[str, dict[str, Any]] = {} + + html_text = "" + if html_exists: + html_text = html_path.read_text(encoding="utf-8") + else: + errors.append(f"missing_html:{html_path}") + + result_data: dict[str, Any] = {} + if result_exists: + result_data = json.loads(result_path.read_text(encoding="utf-8")) + else: + errors.append(f"missing_result:{result_path}") + + if source_line is not None and 1 <= source_line <= len(source_rows): + source_row = source_rows[source_line - 1] + source_match = ( + source_row.get("track_id") == track_id + and source_row.get("url") == url + ) + if not source_match: + errors.append("source_manifest_mismatch") + + source_html = source_row.get("html", "") + html_len_match = len(source_html) == manifest_row.get("html_len") + if not html_len_match: + errors.append("source_manifest_html_len_mismatch") + + if html_exists and len(html_text) != len(source_html): + errors.append("source_html_file_len_mismatch") + elif html_exists and html_text != source_html: + errors.append("source_html_file_content_mismatch") + else: + errors.append("invalid_source_line") + + if 
html_exists and result_exists: + for field in fields_to_check: + field_checks[field] = check_field_in_html(result_data.get(field), html_text) + if field_checks[field]["value_present"] and not field_checks[field]["substring_match"]: + errors.append(f"field_not_found_in_html:{field}") + + ok = not errors + if ok: + ok_count += 1 + + reports.append( + FileReport( + filename=filename, + source_line=source_line, + url=url, + track_id=track_id, + html_exists=html_exists, + result_exists=result_exists, + source_match=source_match, + html_len_match=html_len_match, + field_checks=field_checks, + ok=ok, + errors=errors, + ) + ) + + summary = { + "source_jsonl": str(source_jsonl), + "manifest": str(manifest_path), + "cluster_manifest": str(cluster_manifest_path) if cluster_manifest_path else "", + "html_dir": str(html_dir), + "result_dir": str(result_dir), + "schema_json": str(schema_json_path) if schema_json_path else "", + "fields_checked": fields_to_check, + "total_files": len(reports), + "ok_files": ok_count, + "failed_files": len(reports) - ok_count, + "reports": [asdict(report) for report in reports], + } + + if output_path: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"qa_report: {output_path}") + + print(f"total_files: {summary['total_files']}") + print(f"ok_files: {summary['ok_files']}") + print(f"failed_files: {summary['failed_files']}") + + for report in reports: + status = "OK" if report.ok else "FAIL" + print(f"{status} {report.filename}") + if report.errors: + print(f" errors: {', '.join(report.errors)}") + + +if __name__ == "__main__": + main() diff --git a/start.sh b/start.sh index f2cd712..f7d735c 100755 --- a/start.sh +++ b/start.sh @@ -3,6 +3,9 @@ # Web2JSON Agent - Startup Script # Starts both the backend API and the frontend UI simultaneously +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PYTHON_BIN="${PYTHON_BIN:-python3.11}" + 
echo "🚀 Starting Web2JSON Agent..." echo "" @@ -21,14 +24,14 @@ fi # Start the backend echo "📡 Starting backend API (port 8000)..." -cd /Users/brown/Projects/AILabProject/web2json-agent +cd "$SCRIPT_DIR" # Create logs directory if it doesn't exist mkdir -p logs # Production Mode: Disable automatic reloading to avoid restarts triggered by changes in the output directory # If you need reload for development, use: --reload --reload-exclude 'output/**' --reload-exclude 'logs/**' -uvicorn web2json_api.main:app --host 0.0.0.0 --port 8000 \ +"$PYTHON_BIN" -m uvicorn web2json_api.main:app --host 0.0.0.0 --port 8000 \ --reload-exclude 'output/**' \ --reload-exclude 'logs/**' \ --reload-exclude '*.log' \ @@ -49,7 +52,7 @@ fi # Start the frontend echo "" echo "🎨 Starting frontend UI (port 5173)..." -cd web2json_ui && npm run dev > ../logs/ui.log 2>&1 & +cd "$SCRIPT_DIR/web2json_ui" && npm run dev > "$SCRIPT_DIR/logs/ui.log" 2>&1 & FRONTEND_PID=$! echo " Frontend PID: $FRONTEND_PID" @@ -70,8 +73,8 @@ echo "Or press Ctrl+C and run: pkill -f 'uvicorn|vite'" echo "" # Save PID -echo $BACKEND_PID > .backend.pid -echo $FRONTEND_PID > .frontend.pid +echo $BACKEND_PID > "$SCRIPT_DIR/.backend.pid" +echo $FRONTEND_PID > "$SCRIPT_DIR/.frontend.pid" # Wait for user interruption wait diff --git a/web2json/simple.py b/web2json/simple.py index 235d7c5..95b8ebe 100644 --- a/web2json/simple.py +++ b/web2json/simple.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Optional, Dict, List, Any from dataclasses import dataclass, asdict +import numpy as np from loguru import logger from web2json.agent import ParserAgent @@ -1050,15 +1051,39 @@ def classify_html_dir(config: Web2JsonConfig) -> ClusterResult: # 执行聚类分析 logger.info("正在进行布局聚类分析...") from web2json.tools.cluster import cluster_html_layouts_optimized + from web2json.tools.html_layout_cosin import get_feature + + valid_html_files = [] + valid_html_contents = [] + invalid_html_files = [] + for file_path, html_content in 
zip(html_files, html_contents): + try: + feature = get_feature(html_content) + except Exception as e: + logger.warning(f" 跳过布局特征提取失败页面: {file_path} ({e})") + invalid_html_files.append(file_path) + continue + if not feature: + logger.warning(f" 跳过无有效布局特征页面: {file_path}") + invalid_html_files.append(file_path) + continue + valid_html_files.append(file_path) + valid_html_contents.append(html_content) + + if not valid_html_contents: + raise Exception("聚类失败: 没有可用于布局聚类的有效HTML页面") try: labels, sim_mat, clusters = cluster_html_layouts_optimized( - html_contents, + valid_html_contents, use_knn_graph=True ) except Exception as e: raise Exception(f"聚类失败: {e}") + label_map = {file_path: int(label) for file_path, label in zip(valid_html_files, labels)} + labels = np.array([label_map.get(file_path, -1) for file_path in html_files], dtype=int) + # 统计聚类结果 unique_labels = sorted(set(labels)) noise_count = sum(1 for l in labels if l == -1) @@ -1160,4 +1185,3 @@ def classify_html_dir(config: Web2JsonConfig) -> ClusterResult: noise_files=noise_files, cluster_count=cluster_count ) - diff --git a/web2json/tools/html_simplifier.py b/web2json/tools/html_simplifier.py index 33f34ae..1a2581c 100644 --- a/web2json/tools/html_simplifier.py +++ b/web2json/tools/html_simplifier.py @@ -265,6 +265,77 @@ def remove_empty_tags( return root +def _contains_token_attr(element: html.HtmlElement, attr_name: str, patterns: List[str]) -> bool: + """检查元素指定属性中是否包含目标模式。""" + attr_value = element.get(attr_name, '') + if not attr_value: + return False + + attr_value = attr_value.lower() + return any(pattern in attr_value for pattern in patterns) + + +def is_sharepoint_html(html_str: str) -> bool: + """粗略识别 SharePoint 页面。""" + markers = [ + 'microsoft sharepoint', + '_sppagecontextinfo', + 's4-workspace', + '_layouts/15', + ] + html_lower = html_str.lower() + return any(marker in html_lower for marker in markers) + + +def remove_sharepoint_noise(root: html.HtmlElement) -> html.HtmlElement: + """ + 删除 SharePoint 
门户模板中的高噪音区域。 + + 主要清理全站导航、页眉页脚、社媒区、noindex 容器和 mega menu。 + """ + class_patterns = [ + 'noindex', + 'mega-menu', + 'mega-sub-menu', + 'mega-menu-wrap', + 'mega-menu-toggle', + 'top-header', + 'button-close-top-header', + 'social-media-header', + 'breadcrumbs', + 'breadcrumb', + 'ms-csrlistview-controldiv', + ] + id_patterns = [ + 'top-header', + 'top-menu', + 'main-menu', + 'mega-menu', + 'social-media-header', + 'navigationmenu', + 'footer', + 'ctl00_placeholdersitename', + ] + tag_names = {'header', 'footer', 'nav'} + + remove_targets = [] + for element in root.iter(): + tag = str(element.tag).lower() if hasattr(element, 'tag') else '' + if tag in tag_names: + remove_targets.append(element) + continue + if _contains_token_attr(element, 'class', class_patterns): + remove_targets.append(element) + continue + if _contains_token_attr(element, 'id', id_patterns): + remove_targets.append(element) + + # 去重,避免重复删除同一元素 + unique_targets = list(dict.fromkeys(remove_targets)) + remove_reversely(unique_targets) + return root + + def clean_attributes( root: html.HtmlElement, keep_attrs: List[str] = None @@ -440,6 +511,11 @@ def simplify_html( clean_attrs=True, keep_attrs=keep_attrs_list ) + if is_sharepoint_html(html_str): + simplified_root = html_to_element(result) + simplified_root = remove_sharepoint_noise(simplified_root) + simplified_root = remove_empty_tags(simplified_root) + result = element_to_html(simplified_root) # 根据aggressive参数选择模式 elif aggressive: # 激进模式:删除所有无用内容 diff --git a/web2json/utils/llm_client.py b/web2json/utils/llm_client.py index 59913a8..46c62e7 100644 --- a/web2json/utils/llm_client.py +++ b/web2json/utils/llm_client.py @@ -17,9 +17,9 @@ env_path = project_root / ".env" load_dotenv(env_path) -# 验证 -if not os.getenv("OPENAI_API_KEY"): - raise ValueError(f".env 文件路径: {env_path}, API Key未加载") +# 验证(延迟到实际使用时) +_api_key_missing = not os.getenv("OPENAI_API_KEY") +_env_path_for_error = env_path # 定义场景类型 ScenarioType = Literal["default", "code_gen", "agent"]
@@ -81,6 +81,11 @@ def __init__(
         if self._initialized:
             return
 
+        # NOTE(review): the lazy key check must not fire when the caller passes
+        # api_key explicitly; the env-var check is only a fallback for that case.
+        if api_key is None and _api_key_missing and not os.getenv("OPENAI_API_KEY"):
+            raise ValueError(f".env 文件路径: {_env_path_for_error}, API Key未加载")
+
         self.api_key = api_key or settings.openai_api_key
         self.api_base = api_base or settings.openai_api_base
         self.model = model or settings.default_model