ccprocessor · yogacc33 · Feb 21, 2025 · Feb 24, 2025 · Feb 24, 2025 · Feb 24, 2025
diff --git a/.codecov.yml b/.codecov.yml
@@ -0,0 +1,19 @@
+coverage:
+  status:
+    patch:  # Only gate on the coverage of lines changed by the PR
+      default:
+        target: 85%  # Coverage target for the changed code
+        threshold: 2%  # Allowed fluctuation before the status fails
+        base: auto     # Compare against the PR base (or the parent commit)
+
+# Paths excluded from coverage; `ignore` must be top-level, not under `coverage`
+ignore:
+  - "tests/**/*"           # All test directories
+  - "**/__pycache__/**/*"  # Bytecode cache files
+  - "**/__init__.py"       # Package initializer files
+
+# Optional: adjust how the PR comment is rendered
+comment:
+  layout: "reach, diff, flags, files"
+  behavior: default
+  require_changes: false
diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml
@@ -24,11 +24,11 @@ jobs:
   pr_ut_test:
     runs-on: ubuntu-latest
     env:
-      LLM_WEB_KIT_CFG_PATH: ${{ github.workspace }}/llm_web_kit/pipeline/pipe_tpl/pipeline_html_tpl.jsonc
+      LLM_WEB_KIT_CFG_PATH: ${{ github.workspace }}/bench/config/ours_config.jsonc
       PYTHONPATH: $PYTHONPATH:${{ github.workspace }}
     strategy:
       matrix:
-        python-version: [3.10.15]
+        python-version: [3.10.16]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4

diff --git a/.github/workflows/pr_ut_test_extra.yml b/.github/workflows/pr_ut_test_extra.yml
@@ -0,0 +1,47 @@
+name: pr_stage_ut_extra
+
+on:
+  pull_request:
+    paths:
+      - 'requirements/**'
+      - 'setup.py'
+  push:
+    branches:
+      - main
+      - dev
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Extra Python versions: runs on PRs touching requirements/** or setup.py, and on every push to main/dev
+  pr_ut_test_extra:
+    runs-on: ubuntu-latest
+    env:
+      LLM_WEB_KIT_CFG_PATH: ${{ github.workspace }}/bench/config/ours_config.jsonc
+      PYTHONPATH: ${{ github.workspace }}  # env values are not shell-expanded, so no `$PYTHONPATH:` prefix
+    strategy:
+      matrix:
+        python-version: ['3.11.11', '3.12.8']
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Build llm_web_kit from source
+        run: |
+          pip install -e .
+          pip list | grep llm_web_kit
+      - name: Install unit tests dependencies
+        run: |
+          pip install -r requirements/runtime.txt
+          pip install -r requirements/dev.txt
+      - name: Run tests and collect coverage
+        run: pytest --cov --cov-report=xml -n auto ./tests/llm_web_kit
+      - name: Upload coverage reports to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@
 venv*/
 envs/
 slurm_logs/
+local_tests/
 
 __pycache__
 *.log
@@ -45,3 +46,5 @@ output/
 coverage.xml
 
 llm_web_kit.egg-info/*
+.llm-web-kit.jsonc
+.llm-web-kit-pageclassify.jsonc
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
-exclude: ^tests/llm_web_kit/pipeline/extractor/html/magic_html/assets/
+exclude: ^tests/llm_web_kit/extractor/html/magic_html/assets/
 
 repos:
   - repo: https://github.com/PyCQA/flake8
     rev: 5.0.4
     hooks:
       - id: flake8
-        args: [ "--max-line-length=2200", "--ignore=E131,E125,W503,W504,E203,E231,E702" ]
+        args: [ "--max-line-length=2200", "--ignore=E131,E125,W503,W504,E203,E231,E702,E128" ]
   - repo: https://github.com/PyCQA/isort
     rev: 5.11.5
     hooks:
@@ -19,19 +19,19 @@ repos:
   #   rev: v2.2.1
   #   hooks:
   #     - id: codespell
-  #       exclude: '^tests/.*/assets/'
+  #       exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
   #       args: ['--skip', '*.json']
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.3.0
     hooks:
       - id: trailing-whitespace
-        exclude: '^tests/.*/assets/'
+        exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
       - id: check-yaml
       - id: end-of-file-fixer
-        exclude: '^tests/.*/assets/'
+        exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
-        exclude: '^tests/.*/assets/'
+        exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: [ "--remove" ]
@@ -46,7 +46,7 @@ repos:
           - mdformat-openmmlab
           - mdformat_frontmatter
           - linkify-it-py
-        exclude: '^tests/.*/assets/'
+        exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
   - repo: https://github.com/myint/docformatter
     rev: v1.3.1
     hooks:

diff --git a/README.md b/README.md
@@ -75,6 +75,26 @@ llm-web-kit is a python library that ..
 
 ## Quick Start
 
+```python
+from typing import Optional
+from llm_web_kit.simple import extract_html_to_md
+from loguru import logger
+
+def extract(url: str, html: str) -> Optional[str]:
+    """Return the Markdown extracted from `html`, or None if extraction fails."""
+    try:
+        # or: mm_nlp_md = extract_html_to_mm_md(url, html)
+        return extract_html_to_md(url, html)
+    except Exception as e:
+        logger.exception(e)
+        return None
+
+if __name__ == "__main__":
+    url = ""
+    html = ""
+    markdown = extract(url, html)
+```
+
 ## Usage
 
 # TODO

diff --git a/bench/Bench.md b/bench/Bench.md
@@ -4,22 +4,65 @@
 
 # 目录结构
 
+- `bench/config/data_config.jsonl`: 数据处理配置文件
+- `bench/config/ours_config.jsonc`: 提取器配置文件
+- `bench/eval/`: 不同评估工具的实现
+- `bench/common/`: 通用评估工具和指标计算
+- `bench/output/`: 评测结果输出目录
+
 数据集：原始网页数据在`bench/data/origin`目录下，GT默认保存在`bench/data/groundtruth`目录下。
-评测结果：评测结果默认保存在`bench/output`目录下日期+随机数的文件夹中，如`20250212_113509_5bbf75c0`。
+评测结果：评测结果默认保存在`bench/output/{task_id}`目录下，其中`task_id`为UUID形式，如`5bbf7c8c-e8f2-11ef-a5a8-acde48001122`。
 
 # 使用方法
 
+## 命令行参数
+
+```bash
+python bench/run.py [--input INPUT_PATH] [--output OUTPUT_PATH] [--tool {ours,magic_html,unstructured}]
+```
+
+参数说明:
+
+- `--input`: 指定HTML文件路径
+- `--output`: 指定输出结果保存路径
+- `--tool`: 选择使用的提取工具，可选值:
+  - `ours`: 使用本项目提供的提取工具(默认)
+  - `magic_html`: 使用magic_html工具进行评估
+  - `unstructured`: 使用unstructured工具进行评估
+
+## 运行示例
+
+1. 使用默认提取器运行评估:
+
+```bash
+python bench/run.py
+```
+
+2. 使用其他提取器进行对比:
+
+```bash
+python bench/run.py --tool magic_html
 ```
-python run.py
+
+3. 指定输入和输出路径:
+
+```bash
+python bench/run.py --input path/to/input.html --output path/to/output
 ```
 
 # 评估报告及评估指标
 
 每一个评测结果包含`summary.json`和`detail.json`两个文件，`summary.json`是整个评测集的汇总结果，`detail.json`是单个网页的详细结果。
 
+主要评估指标包括:
+
+- `type_acc`: 元素类型识别准确率
+- `content_acc`: 元素内容识别准确率
+- 各种元素类型的识别统计
+
 # 评估报告示例
 
-`summary.json`主要展示所有评测数据“评测指标”，“评测耗时”的整体和元素级别的结果：
+`summary.json`主要展示所有评测数据"评测指标"，"评测耗时"的整体和元素级别的结果：
 
 ```json
 {
@@ -52,7 +95,7 @@ python run.py
 }
 ```
 
-`detail.json`主要展示每个评测数据“评测指标”，“评测耗时”等元素级别的结果详情，方便分析哪一个网页的哪一个元素抽取效果不好：
+`detail.json`主要展示每个评测数据"评测指标"，"评测耗时"等元素级别的结果详情，方便分析哪一个网页的哪一个元素抽取效果不好：
 
 ```json
 {
@@ -96,6 +139,60 @@ python run.py
 }
 ```
 
+# 输出文件格式
+
+评估结果将保存在指定的输出目录中，针对不同工具的输出格式如下:
+
+## `ours`工具输出
+
+输出为JSONL格式，每行是一个JSON对象，包含以下字段:
+
+- `url`: 原始网页URL
+- `content`: 提取的内容
+- `main_html`: 提取的主要HTML内容
+- `content_list`: 提取的内容列表
+- `html`: 原始HTML
+- `statics`: 统计信息
+
+## `magic_html`和`unstructured`工具输出
+
+输出为JSONL格式，每行是一个JSON对象，包含以下字段:
+
+- `url`: 原始网页URL
+- `content`: 提取的内容
+- `html`: 原始HTML
+
+# 故障排除
+
+## 常见问题
+
+1. 文件路径问题
+
+   如果遇到文件路径相关错误，请检查:
+
+   - 配置文件中的路径是否正确
+   - 文件路径是否存在
+   - 是否有足够的权限读写相关目录
+
+2. 编码问题
+
+   对于包含XML声明的HTML文件，系统会自动进行转换处理。如果遇到编码相关错误，可以:
+
+   - 确保HTML文件使用UTF-8编码
+   - 检查XML声明是否正确格式化
+
+3. 结果输出问题
+
+   若输出目录创建失败或结果无法写入:
+
+   - 检查目标目录的写入权限
+   - 确保磁盘空间足够
+
 # 如何新增评估数据
 
 评测数据集会根据`pipeline`的功能迭代新增数据，如何快速构建新增数据的`groundtruth`按照下面方法：
+
+1. 准备原始HTML文件，放入`bench/data/origin`目录下
+2. 在`bench/config/data_config.jsonl`中添加新的测试数据条目
+3. 运行评估工具生成初步结果
+4. 人工审核并修正结果作为groundtruth，放入`bench/data/groundtruth`目录