diff --git a/examples/basic_usage.py b/examples/basic_usage.py index 9e5c786..1b9b3ea 100755 --- a/examples/basic_usage.py +++ b/examples/basic_usage.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 """ -WebMainBench 基本使用示例 +WebMainBench Basic Usage Example """ import json from pathlib import Path -# 导入 WebMainBench 模块 +# Import WebMainBench modules from webmainbench import ( DataLoader, DataSaver, BenchmarkDataset, DataSample, ExtractorFactory, Evaluator, @@ -15,44 +15,44 @@ def create_sample_dataset(): - """创建示例数据集""" - - # 创建示例数据 - 包含多种内容类型(代码、公式、表格等) + """Create a sample dataset""" + + # Create sample data - includes multiple content types (code, formulas, tables, etc.) samples = [ { "track_id": "sample-001-programming-tutorial", "html": ''' -

Python编程教程

-

这是一个Python基础教程,展示如何定义函数。

+

Python Programming Tutorial

+

This is a basic Python tutorial demonstrating how to define functions.

def greet(name):
-    """问候函数"""
+    """Greeting function"""
     return f"Hello, {name}!"
 
-# 使用示例
+# Usage example
 result = greet("World")
 print(result)
-

这个函数可以用来问候任何人。

+

This function can be used to greet anyone.

''', - "groundtruth_content": '''# Python编程教程 + "groundtruth_content": '''# Python Programming Tutorial -这是一个Python基础教程,展示如何定义函数。 +This is a basic Python tutorial demonstrating how to define functions. ```python def greet(name): - """问候函数""" + """Greeting function""" return f"Hello, {name}!" -# 使用示例 +# Usage example result = greet("World") print(result) ``` -这个函数可以用来问候任何人。''', +This function can be used to greet anyone.''', "groundtruth_content_list": [ - {"type": "heading", "content": "Python编程教程", "level": 1}, - {"type": "paragraph", "content": "这是一个Python基础教程,展示如何定义函数。"}, - {"type": "code", "content": 'def greet(name):\n """问候函数"""\n return f"Hello, {name}!"\n\n# 使用示例\nresult = greet("World")\nprint(result)'}, - {"type": "paragraph", "content": "这个函数可以用来问候任何人。"} + {"type": "heading", "content": "Python Programming Tutorial", "level": 1}, + {"type": "paragraph", "content": "This is a basic Python tutorial demonstrating how to define functions."}, + {"type": "code", "content": 'def greet(name):\n """Greeting function"""\n return f"Hello, {name}!"\n\n# Usage example\nresult = greet("World")\nprint(result)'}, + {"type": "paragraph", "content": "This function can be used to greet anyone."} ], "url": "https://python-tutorial.example.com/functions", "layout_id": "python-tutorial_1", @@ -70,44 +70,44 @@ def greet(name): { "track_id": "sample-002-math-formulas", "html": ''' -

数学公式示例

-

这里展示一些基本的数学公式。

-

勾股定理:a² + b² = c²

+

Math Formula Examples

+

Here are some basic math formulas.

+

Pythagorean theorem: a² + b² = c²

-

二次方程的解为:

+

The solution to the quadratic equation is:

x = (-b ± √(b² - 4ac)) / 2a

-

欧拉公式是数学中最美丽的公式之一:e^(iπ) + 1 = 0

+

Euler's formula is one of the most beautiful formulas in mathematics: e^(iπ) + 1 = 0

- +
函数导数
FunctionDerivative
2x
sin(x)cos(x)
''', - "groundtruth_content": '''# 数学公式示例 + "groundtruth_content": '''# Math Formula Examples -这里展示一些基本的数学公式。 +Here are some basic math formulas. -勾股定理:$a^2 + b^2 = c^2$ +Pythagorean theorem: $a^2 + b^2 = c^2$ -二次方程的解为: +The solution to the quadratic equation is: $$x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$$ -欧拉公式是数学中最美丽的公式之一:$e^{i\\pi} + 1 = 0$ +Euler's formula is one of the most beautiful formulas in mathematics: $e^{i\\pi} + 1 = 0$ -| 函数 | 导数 | +| Function | Derivative | |------|------| | x² | 2x | | sin(x) | cos(x) |''', "groundtruth_content_list": [ - {"type": "heading", "content": "数学公式示例", "level": 1}, - {"type": "paragraph", "content": "这里展示一些基本的数学公式。"}, - {"type": "paragraph", "content": "勾股定理:a² + b² = c²"}, - {"type": "paragraph", "content": "二次方程的解为:"}, + {"type": "heading", "content": "Math Formula Examples", "level": 1}, + {"type": "paragraph", "content": "Here are some basic math formulas."}, + {"type": "paragraph", "content": "Pythagorean theorem: a² + b² = c²"}, + {"type": "paragraph", "content": "The solution to the quadratic equation is:"}, {"type": "equation-interline", "content": "x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}"}, - {"type": "paragraph", "content": "欧拉公式是数学中最美丽的公式之一:e^(iπ) + 1 = 0"}, - {"type": "table", "content": "| 函数 | 导数 |\n|------|------|\n| x² | 2x |\n| sin(x) | cos(x) |"} + {"type": "paragraph", "content": "Euler's formula is one of the most beautiful formulas in mathematics: e^(iπ) + 1 = 0"}, + {"type": "table", "content": "| Function | Derivative |\n|------|------|\n| x² | 2x |\n| sin(x) | cos(x) |"} ], "url": "https://math-examples.edu/formulas", "layout_id": "math-examples_2", @@ -125,66 +125,66 @@ def greet(name): { "track_id": "sample-003-data-analysis", "html": ''' -

数据分析报告

-

以下是2024年第一季度的销售数据分析。

-

数据处理代码

+

Data Analysis Report

+

The following is a sales data analysis for Q1 2024.

+

Data Processing Code

import pandas as pd
 import numpy as np
 
-# 读取数据
+# Read data
 df = pd.read_csv('sales_q1_2024.csv')
 
-# 计算统计信息
+# Calculate statistics
 monthly_avg = df.groupby('month')['sales'].mean()
-print(f"平均销售额: {monthly_avg}")
-

销售统计

+print(f"Average sales: {monthly_avg}") +

Sales Statistics

- - - - + + + +
月份销售额(万元)增长率
1月120.5+15.2%
2月135.8+12.7%
3月148.3+9.2%
MonthSales (10k)Growth Rate
Jan120.5+15.2%
Feb135.8+12.7%
Mar148.3+9.2%
-

标准差公式:σ = √(Σ(xi - μ)² / n)

-

总体来看,第一季度销售表现良好,呈现稳定增长趋势。

+

Standard deviation formula: σ = √(Σ(xi - μ)² / n)

+

Overall, Q1 sales performance was strong, showing a steady growth trend.

''', - "groundtruth_content": '''# 数据分析报告 + "groundtruth_content": '''# Data Analysis Report -以下是2024年第一季度的销售数据分析。 +The following is a sales data analysis for Q1 2024. -## 数据处理代码 +## Data Processing Code ```python import pandas as pd import numpy as np -# 读取数据 +# Read data df = pd.read_csv('sales_q1_2024.csv') -# 计算统计信息 +# Calculate statistics monthly_avg = df.groupby('month')['sales'].mean() -print(f"平均销售额: {monthly_avg}") +print(f"Average sales: {monthly_avg}") ``` -## 销售统计 +## Sales Statistics -| 月份 | 销售额(万元) | 增长率 | +| Month | Sales (10k) | Growth Rate | |------|-------------|--------| -| 1月 | 120.5 | +15.2% | -| 2月 | 135.8 | +12.7% | -| 3月 | 148.3 | +9.2% | +| Jan | 120.5 | +15.2% | +| Feb | 135.8 | +12.7% | +| Mar | 148.3 | +9.2% | -标准差公式:$\\sigma = \\sqrt{\\frac{\\Sigma(x_i - \\mu)^2}{n}}$ +Standard deviation formula: $\\sigma = \\sqrt{\\frac{\\Sigma(x_i - \\mu)^2}{n}}$ -总体来看,第一季度销售表现良好,呈现稳定增长趋势。''', +Overall, Q1 sales performance was strong, showing a steady growth trend.''', "groundtruth_content_list": [ - {"type": "heading", "content": "数据分析报告", "level": 1}, - {"type": "paragraph", "content": "以下是2024年第一季度的销售数据分析。"}, - {"type": "heading", "content": "数据处理代码", "level": 2}, - {"type": "code", "content": "import pandas as pd\nimport numpy as np\n\n# 读取数据\ndf = pd.read_csv('sales_q1_2024.csv')\n\n# 计算统计信息\nmonthly_avg = df.groupby('month')['sales'].mean()\nprint(f\"平均销售额: {monthly_avg}\")"}, - {"type": "heading", "content": "销售统计", "level": 2}, - {"type": "table", "content": "| 月份 | 销售额(万元) | 增长率 |\n|------|-------------|--------|\n| 1月 | 120.5 | +15.2% |\n| 2月 | 135.8 | +12.7% |\n| 3月 | 148.3 | +9.2% |"}, - {"type": "paragraph", "content": "标准差公式:σ = √(Σ(xi - μ)² / n)"}, - {"type": "paragraph", "content": "总体来看,第一季度销售表现良好,呈现稳定增长趋势。"} + {"type": "heading", "content": "Data Analysis Report", "level": 1}, + {"type": "paragraph", "content": "The following is a sales data analysis for Q1 2024."}, + {"type": "heading", "content": "Data Processing Code", "level": 2}, + {"type": "code", "content": "import pandas as pd\nimport numpy as np\n\n# Read data\ndf = pd.read_csv('sales_q1_2024.csv')\n\n# Calculate statistics\nmonthly_avg = df.groupby('month')['sales'].mean()\nprint(f\"Average sales: {monthly_avg}\")"}, + {"type": "heading", "content": "Sales Statistics", "level": 2}, + {"type": "table", "content": "| Month | Sales (10k) | Growth Rate |\n|------|-------------|--------|\n| Jan | 120.5 | +15.2% |\n| Feb | 135.8 | +12.7% |\n| Mar | 148.3 | +9.2% |"}, + {"type": "paragraph", "content": "Standard deviation formula: σ = √(Σ(xi - μ)² / n)"}, + {"type": "paragraph", "content": "Overall, Q1 sales performance was strong, showing a steady growth trend."} ], "url": "https://data-report.company.com/q1-2024-analysis", "layout_id": "data-report_3", @@ -202,68 +202,68 @@ def greet(name): { "track_id": "sample-004-algorithm-explanation", "html": ''' -

算法复杂度分析

-

这里介绍常见算法的时间复杂度。

-

快速排序实现

+

Algorithm Complexity Analysis

+

Here we introduce the time complexity of common algorithms.

+

Quicksort Implementation

def quicksort(arr):
     if len(arr) <= 1:
         return arr
-    
+
     pivot = arr[len(arr) // 2]
     left = [x for x in arr if x < pivot]
     middle = [x for x in arr if x == pivot]
     right = [x for x in arr if x > pivot]
-    
+
     return quicksort(left) + middle + quicksort(right)
-

复杂度对比

+

Complexity Comparison

- - - - + + + +
算法最好情况平均情况最坏情况
快速排序O(n log n)O(n log n)O(n²)
归并排序O(n log n)O(n log n)O(n log n)
冒泡排序O(n)O(n²)O(n²)
AlgorithmBest CaseAverage CaseWorst Case
QuicksortO(n log n)O(n log n)O(n²)
Merge SortO(n log n)O(n log n)O(n log n)
Bubble SortO(n)O(n²)O(n²)
-

Master定理:T(n) = aT(n/b) + f(n)

-

其中 a ≥ 1, b > 1 是常数,f(n) 是正函数。

+

Master Theorem: T(n) = aT(n/b) + f(n)

+

Where a ≥ 1, b > 1 are constants, and f(n) is a positive function.

''', - "groundtruth_content": '''# 算法复杂度分析 + "groundtruth_content": '''# Algorithm Complexity Analysis -这里介绍常见算法的时间复杂度。 +Here we introduce the time complexity of common algorithms. -## 快速排序实现 +## Quicksort Implementation ```python def quicksort(arr): if len(arr) <= 1: return arr - + pivot = arr[len(arr) // 2] left = [x for x in arr if x < pivot] middle = [x for x in arr if x == pivot] right = [x for x in arr if x > pivot] - + return quicksort(left) + middle + quicksort(right) ``` -## 复杂度对比 +## Complexity Comparison -| 算法 | 最好情况 | 平均情况 | 最坏情况 | +| Algorithm | Best Case | Average Case | Worst Case | |------|----------|----------|----------| -| 快速排序 | O(n log n) | O(n log n) | O(n²) | -| 归并排序 | O(n log n) | O(n log n) | O(n log n) | -| 冒泡排序 | O(n) | O(n²) | O(n²) | +| Quicksort | O(n log n) | O(n log n) | O(n²) | +| Merge Sort | O(n log n) | O(n log n) | O(n log n) | +| Bubble Sort | O(n) | O(n²) | O(n²) | -Master定理:$T(n) = aT(n/b) + f(n)$ +Master Theorem: $T(n) = aT(n/b) + f(n)$ -其中 $a \\geq 1, b > 1$ 是常数,$f(n)$ 是正函数。''', +Where $a \\geq 1, b > 1$ are constants, and $f(n)$ is a positive function.''', "groundtruth_content_list": [ - {"type": "heading", "content": "算法复杂度分析", "level": 1}, - {"type": "paragraph", "content": "这里介绍常见算法的时间复杂度。"}, - {"type": "heading", "content": "快速排序实现", "level": 2}, + {"type": "heading", "content": "Algorithm Complexity Analysis", "level": 1}, + {"type": "paragraph", "content": "Here we introduce the time complexity of common algorithms."}, + {"type": "heading", "content": "Quicksort Implementation", "level": 2}, {"type": "code", "content": "def quicksort(arr):\n if len(arr) <= 1:\n return arr\n \n pivot = arr[len(arr) // 2]\n left = [x for x in arr if x < pivot]\n middle = [x for x in arr if x == pivot]\n right = [x for x in arr if x > pivot]\n \n return quicksort(left) + middle + quicksort(right)"}, - {"type": "heading", "content": "复杂度对比", "level": 2}, - {"type": "table", "content": "| 算法 | 最好情况 | 平均情况 | 最坏情况 |\n|------|----------|----------|----------|\n| 快速排序 | O(n log n) | O(n log n) | O(n²) |\n| 归并排序 | O(n log n) | O(n log n) | O(n log n) |\n| 冒泡排序 | O(n) | O(n²) | O(n²) |"}, + {"type": "heading", "content": "Complexity Comparison", "level": 2}, + {"type": "table", "content": "| Algorithm | Best Case | Average Case | Worst Case |\n|------|----------|----------|----------|\n| Quicksort | O(n log n) | O(n log n) | O(n²) |\n| Merge Sort | O(n log n) | O(n log n) | O(n log n) |\n| Bubble Sort | O(n) | O(n²) | O(n²) |"}, {"type": "equation-inline", "content": "T(n) = aT(n/b) + f(n)"}, - {"type": "paragraph", "content": "其中 a ≥ 1, b > 1 是常数,f(n) 是正函数。"} + {"type": "paragraph", "content": "Where a ≥ 1, b > 1 are constants, and f(n) is a positive function."} ], "url": "https://algorithm-guide.cs.edu/complexity-analysis", "layout_id": "algorithm-guide_4", @@ -280,8 +280,8 @@ def quicksort(arr): } ] - # 创建数据集 - dataset = BenchmarkDataset(name="sample_dataset", description="示例评测数据集") + # Create dataset + dataset = BenchmarkDataset(name="sample_dataset", description="Sample evaluation dataset") for sample_data in samples: sample = DataSample.from_dict(sample_data) @@ -291,140 +291,140 @@ def quicksort(arr): def demo_basic_mock_evaluation(): - """演示基本评测流程""" - - print("=== WebMainBench 基本使用示例 ===\n") - - # 设置日志 + """Demonstrate the basic evaluation workflow""" + + print("=== WebMainBench Basic Usage Example ===\n") + + # Set up logging setup_logging(level="INFO") - - # 1. 创建或加载数据集 - print("1. 创建示例数据集...") + + # 1. Create or load dataset + print("1. Creating sample dataset...") dataset = create_sample_dataset() - print(f"数据集包含 {len(dataset)} 个样本") - print(f"数据集统计: {dataset.get_statistics()}\n") - - # 2. 保存数据集到文件 + print(f"Dataset contains {len(dataset)} samples") + print(f"Dataset statistics: {dataset.get_statistics()}\n") + + # 2. Save dataset to file data_dir = Path("data") data_dir.mkdir(exist_ok=True) - + dataset_path = data_dir / "sample_dataset.jsonl" DataSaver.save_jsonl(dataset, dataset_path, include_results=False) - print(f"数据集已保存到: {dataset_path}\n") - - # 3. 重新加载数据集 - print("2. 重新加载数据集...") + print(f"Dataset saved to: {dataset_path}\n") + + # 3. Reload dataset + print("2. Reloading dataset...") loaded_dataset = DataLoader.load_jsonl(dataset_path) - print(f"加载的数据集包含 {len(loaded_dataset)} 个样本\n") - - # 4. 列出可用的抽取器 - print("3. 可用的抽取器:") + print(f"Loaded dataset contains {len(loaded_dataset)} samples\n") + + # 4. List available extractors + print("3. Available extractors:") available_extractors = ExtractorFactory.list_available() for extractor_name in available_extractors: print(f" - {extractor_name}") print() - - # 5. 创建评测器 - print("4. 创建评测器...") + + # 5. Create evaluator + print("4. Creating evaluator...") evaluator = Evaluator() - print(f"可用的评测指标: {evaluator.metric_calculator.list_available_metrics()}\n") - - # 6. 创建一个模拟抽取器进行演示 - print("5. 创建模拟抽取器...") - + print(f"Available evaluation metrics: {evaluator.metric_calculator.list_available_metrics()}\n") + + # 6. Create a mock extractor for demonstration + print("5. Creating mock extractor...") + from webmainbench.extractors import BaseExtractor, ExtractionResult - + class MockExtractor(BaseExtractor): - """模拟抽取器,用于演示""" - + """Mock extractor for demonstration""" + def _setup(self): pass - + def _extract_content(self, html, url=None): - # 简单的模拟抽取逻辑 - if "标题" in html: - content = "# 提取的标题\n\n提取的正文内容。" + # Simple mock extraction logic + if "heading" in html.lower() or "title" in html.lower(): + content = "# Extracted Title\n\nExtracted body content." content_list = [ - {"type": "heading", "content": "提取的标题", "level": 1}, - {"type": "paragraph", "content": "提取的正文内容。"} + {"type": "heading", "content": "Extracted Title", "level": 1}, + {"type": "paragraph", "content": "Extracted body content."} ] else: - content = "提取的内容" - content_list = [{"type": "paragraph", "content": "提取的内容"}] - + content = "Extracted content" + content_list = [{"type": "paragraph", "content": "Extracted content"}] + return ExtractionResult( content=content, content_list=content_list, success=True, confidence_score=0.85 ) - - # 注册模拟抽取器 + + # Register mock extractor ExtractorFactory.register("mock", MockExtractor) mock_extractor = ExtractorFactory.create("mock") - print("模拟抽取器已创建\n") - - # 7. 运行评测 - print("6. 运行评测...") + print("Mock extractor created\n") + + # 7. Run evaluation + print("6. Running evaluation...") result = evaluator.evaluate( dataset=loaded_dataset, extractor=mock_extractor, - max_samples=2 # 限制样本数量用于演示 + max_samples=2 # Limit sample count for demonstration ) - - # 8. 显示结果 - print("\n7. 评测结果:") + + # 8. Display results + print("\n7. Evaluation results:") print("=" * 50) formatted_results = format_results(result.to_dict()) print(formatted_results) - - # 9. 保存结果 + + # 9. Save results results_dir = Path("results") results_dir.mkdir(exist_ok=True) - + results_path = results_dir / "mock_evaluation_results.json" DataSaver.save_evaluation_results(result, results_path) - print(f"\n结果已保存到: {results_path}") - - # 10. 生成报告 + print(f"\nResults saved to: {results_path}") + + # 10. Generate report report_path = results_dir / "mock_evaluation_report.csv" DataSaver.save_summary_report(result, report_path) - print(f"报告已保存到: {report_path}") + print(f"Report saved to: {report_path}") def demo_llm_webkit_evaluation(): - """演示LLM-WebKit抽取器的6项指标评测""" - - print("=== LLM-WebKit Extractor 6项指标评测示例 ===\n") - - # 设置日志 + """Demonstrate 6-metric evaluation with LLM-WebKit extractor""" + + print("=== LLM-WebKit Extractor 6-Metric Evaluation Example ===\n") + + # Set up logging setup_logging(level="INFO") - - # 1. 创建包含各种内容类型的测试数据集 - print("1. 创建包含多种内容类型的测试数据集...") - + + # 1. Create test dataset with various content types + print("1. Creating test dataset with multiple content types...") + samples = [] - - # 样本1: 包含文本和代码 + + # Sample 1: text and code samples.append(DataSample( id="text_code_sample", html=""" -

Python编程示例

-

这是一段关于Python编程的介绍文本。

+

Python Programming Example

+

This is an introductory text about Python programming.


 def hello_world():
     print("Hello, World!")
     return True
             
-

以上代码展示了一个简单的Python函数。

+

The code above demonstrates a simple Python function.

""", - groundtruth_content="""# Python编程示例 + groundtruth_content="""# Python Programming Example -这是一段关于Python编程的介绍文本。 +This is an introductory text about Python programming. ```python def hello_world(): @@ -432,38 +432,38 @@ def hello_world(): return True ``` -以上代码展示了一个简单的Python函数。""", +The code above demonstrates a simple Python function.""", groundtruth_content_list=[ - {"type": "heading", "content": "Python编程示例", "level": 1}, - {"type": "text", "content": "这是一段关于Python编程的介绍文本。"}, + {"type": "heading", "content": "Python Programming Example", "level": 1}, + {"type": "text", "content": "This is an introductory text about Python programming."}, {"type": "code", "content": "def hello_world():\n print(\"Hello, World!\")\n return True", "language": "python"}, - {"type": "text", "content": "以上代码展示了一个简单的Python函数。"} + {"type": "text", "content": "The code above demonstrates a simple Python function."} ] )) - - # 样本2: 包含表格 + + # Sample 2: table samples.append(DataSample( id="table_sample", html=""" -

销售数据统计

+

Sales Data Summary

- - - + + + - + - + @@ -472,277 +472,276 @@ def hello_world(): """, - groundtruth_content="""## 销售数据统计 + groundtruth_content="""## Sales Data Summary -| 产品 | 销量 | 收入 | +| Product | Sales | Revenue | |------|------|------| -| 产品A | 100 | 1000 | -| 产品B | 200 | 3000 |""", +| Product A | 100 | 1000 | +| Product B | 200 | 3000 |""", groundtruth_content_list=[ - {"type": "heading", "content": "销售数据统计", "level": 2}, - {"type": "table", "content": "| 产品 | 销量 | 收入 |\n|------|------|------|\n| 产品A | 100 | 1000 |\n| 产品B | 200 | 3000 |"} + {"type": "heading", "content": "Sales Data Summary", "level": 2}, + {"type": "table", "content": "| Product | Sales | Revenue |\n|------|------|------|\n| Product A | 100 | 1000 |\n| Product B | 200 | 3000 |"} ] )) - - # 样本3: 包含公式 + + # Sample 3: formulas samples.append(DataSample( id="formula_sample", html=""" -

数学公式示例

-

这是一个行内公式: $E = mc^2$

-

这是一个行间公式:

+

Math Formula Example

+

This is an inline formula: $E = mc^2$

+

This is a block formula:

$$\\int_{-\\infty}^{\\infty} e^{-x^2} dx = \\sqrt{\\pi}$$
""", - groundtruth_content="""## 数学公式示例 + groundtruth_content="""## Math Formula Example -这是一个行内公式: $E = mc^2$ +This is an inline formula: $E = mc^2$ -这是一个行间公式: +This is a block formula: $$\\int_{-\\infty}^{\\infty} e^{-x^2} dx = \\sqrt{\\pi}$$""", groundtruth_content_list=[ - {"type": "heading", "content": "数学公式示例", "level": 2}, - {"type": "text", "content": "这是一个行内公式: $E = mc^2$"}, - {"type": "text", "content": "这是一个行间公式:"}, + {"type": "heading", "content": "Math Formula Example", "level": 2}, + {"type": "text", "content": "This is an inline formula: $E = mc^2$"}, + {"type": "text", "content": "This is a block formula:"}, {"type": "formula", "content": "\\int_{-\\infty}^{\\infty} e^{-x^2} dx = \\sqrt{\\pi}"} ] )) - - # 创建数据集并添加样本 - dataset = BenchmarkDataset(name="llm_webkit_test", description="LLM-WebKit 6项指标测试数据集") + + # Create dataset and add samples + dataset = BenchmarkDataset(name="llm_webkit_test", description="LLM-WebKit 6-metric test dataset") for sample in samples: dataset.add_sample(sample) - - print(f"测试数据集包含 {len(dataset)} 个样本") - print(f"样本类型: 文本+代码, 表格, 公式\n") - - # 2. 创建LLM-WebKit抽取器 - print("2. 创建LLM-WebKit抽取器...") - - # 显示所有可用的抽取器 + + print(f"Test dataset contains {len(dataset)} samples") + print(f"Sample types: text+code, table, formula\n") + + # 2. Create LLM-WebKit extractor + print("2. Creating LLM-WebKit extractor...") + + # Show all available extractors available_extractors = ExtractorFactory.list_available() - print(f"可用的抽取器: {available_extractors}") - - # 直接创建LLM-WebKit抽取器,设置模型路径 + print(f"Available extractors: {available_extractors}") + + # Create LLM-WebKit extractor directly with model path config = { "model_path": "/Users/chupei/model/checkpoint-3296" } extractor = ExtractorFactory.create("llm-webkit", config=config) - print(f"✅ LLM-WebKit抽取器创建成功,模型路径: {config['model_path']}") - + print(f"LLM-WebKit extractor created successfully, model path: {config['model_path']}") + print() - - # 3. 创建评测器并显示所有可用指标 - print("3. 创建评测器...") + + # 3. Create evaluator and show all available metrics + print("3. Creating evaluator...") evaluator = Evaluator() available_metrics = evaluator.metric_calculator.list_available_metrics() - print(f"✅ 可用的评测指标 ({len(available_metrics)}项):") - - # 按照6项指标分类显示 + print(f"Available evaluation metrics ({len(available_metrics)} total):") + + # Display by the 6 metric categories target_metrics = ["overall", "text_edit", "code_edit", "table_edit", "table_TEDS", "formula_edit"] - + for metric in target_metrics: if metric in available_metrics: - print(f" ✅ {metric}") + print(f" {metric}") else: - print(f" ❌ {metric} (未注册)") - + print(f" {metric} (not registered)") + print() - - # 4. 运行评测 - print("4. 开始评测...") + + # 4. Run evaluation + print("4. Starting evaluation...") print("=" * 60) - + result = evaluator.evaluate( dataset=dataset, extractor=extractor, - max_samples=None # 评测所有样本 + max_samples=None # Evaluate all samples ) - - # 5. 显示详细的6项指标结果 - print("\n5. 📊 6项指标详细评测结果:") + + # 5. Display detailed 6-metric results + print("\n5. 6-metric detailed evaluation results:") print("=" * 60) - + results_dict = result.to_dict() - - # 从overall_metrics中提取指标结果 + + # Extract metric results from overall_metrics metrics = results_dict.get('overall_metrics', {}) - - # 按照指标分类显示 - print(f"\n🏆 综合指标:") + + # Display by metric category + print(f"\nOverall metrics:") if 'overall' in metrics: - print(f" overall (综合得分): {metrics['overall']:.4f}") + print(f" overall (combined score): {metrics['overall']:.4f}") else: - print(" overall: 未计算") - - print(f"\n📝 文本相关指标:") + print(" overall: not calculated") + + print(f"\nText-related metrics:") if 'text_edit' in metrics: - print(f" text_edit (文本编辑距离): {metrics['text_edit']:.4f}") + print(f" text_edit (text edit distance): {metrics['text_edit']:.4f}") else: - print(" text_edit: 未计算") + print(" text_edit: not calculated") if 'code_edit' in metrics: - print(f" code_edit (代码编辑距离): {metrics['code_edit']:.4f}") + print(f" code_edit (code edit distance): {metrics['code_edit']:.4f}") else: - print(" code_edit: 未计算") - - print(f"\n📊 表格相关指标:") + print(" code_edit: not calculated") + + print(f"\nTable-related metrics:") if 'table_edit' in metrics: - print(f" table_edit (表格编辑距离): {metrics['table_edit']:.4f}") + print(f" table_edit (table edit distance): {metrics['table_edit']:.4f}") else: - print(" table_edit: 未计算") + print(" table_edit: not calculated") if 'table_TEDS' in metrics: - print(f" table_TEDS (表格结构相似度): {metrics['table_TEDS']:.4f}") + print(f" table_TEDS (table structure similarity): {metrics['table_TEDS']:.4f}") else: - print(" table_TEDS: 未计算") - - print(f"\n🧮 公式相关指标:") + print(" table_TEDS: not calculated") + + print(f"\nFormula-related metrics:") if 'formula_edit' in metrics: - print(f" formula_edit (公式编辑距离): {metrics['formula_edit']:.4f}") + print(f" formula_edit (formula edit distance): {metrics['formula_edit']:.4f}") else: - print(" formula_edit: 未计算") - - print(f"\n📈 详细统计:") - print(f" 总样本数: {len(dataset)}") + print(" formula_edit: not calculated") + + print(f"\nDetailed statistics:") + print(f" Total samples: {len(dataset)}") success_count = len([s for s in results_dict.get('sample_results', []) if s.get('extraction_success', False)]) failure_count = len(dataset) - success_count - print(f" 成功样本数: {success_count}") - print(f" 失败样本数: {failure_count}") - - # 6. 保存结果到文件 + print(f" Successful samples: {success_count}") + print(f" Failed samples: {failure_count}") + + # 6. Save results to file print("\n" + "=" * 60) - print("6. 保存评测结果...") - + print("6. Saving evaluation results...") + results_dir = Path("results") results_dir.mkdir(exist_ok=True) - - # 保存详细结果 + + # Save detailed results results_path = results_dir / "llm_webkit_evaluation_results.json" - DataSaver.save_evaluation_results(result, results_path) # 直接传递result对象 - print(f"✅ 详细结果已保存到: {results_path}") - - # 生成CSV报告 + DataSaver.save_evaluation_results(result, results_path) # Pass result object directly + print(f"Detailed results saved to: {results_path}") + + # Generate CSV report report_path = results_dir / "llm_webkit_evaluation_report.csv" - DataSaver.save_summary_report(result, report_path) # 直接传递result对象 - print(f"✅ CSV报告已保存到: {report_path}") - + DataSaver.save_summary_report(result, report_path) # Pass result object directly + print(f"CSV report saved to: {report_path}") + print("\n" + "=" * 60) - print("✅ LLM-WebKit 6项指标评测完成!") + print("LLM-WebKit 6-metric evaluation complete!") def demo_dataset_with_extraction(): - """演示保存带有抽取内容的数据集""" - print("=== 演示:保存带有抽取内容的数据集 ===") - + """Demonstrate saving a dataset with extracted content""" + print("=== Demo: Saving a Dataset with Extracted Content ===") + from webmainbench import DataLoader, DataSaver, Evaluator, ExtractorFactory from pathlib import Path - - # 配置文件路径 + + # Configure file paths data_dir = Path("data") dataset_path = data_dir / "sample_dataset.jsonl" # dataset_path = "/Users/chupei/Downloads/WebMainBench_dataset_merge_2549.jsonl" - - print(f"📂 数据集文件: {dataset_path}") - - # 🔧 创建llm-webkit抽取器(统一使用) + + print(f"Dataset file: {dataset_path}") + + # Create llm-webkit extractor (used uniformly) extractor_config = {"model_path": "/Users/chupei/model/checkpoint-3296"} extractor = ExtractorFactory.create("llm-webkit", config=extractor_config) - print(f"🤖 使用抽取器: {extractor.name}") - - # 创建评测器 + print(f"Using extractor: {extractor.name}") + + # Create evaluator evaluator = Evaluator() - - # 🔧 选择评测模式:内存模式 vs 批处理模式 - USE_BATCHED_MODE = True # 设置为True使用批处理模式(适用于大数据集) - + + # Choose evaluation mode: in-memory mode vs batched mode + USE_BATCHED_MODE = True # Set to True to use batched mode (suitable for large datasets) + if USE_BATCHED_MODE: - print("🔄 使用批处理模式(内存优化)") - - # 🚀 批处理评测(适用于大数据集) + print("Using batched mode (memory-optimized)") + + # Batched evaluation (suitable for large datasets) result = evaluator.evaluate_batched( jsonl_file_path=dataset_path, - extractor=extractor, # 直接传递extractor对象 - batch_size=10, # 小批次 - max_samples=20 # 演示用 + extractor=extractor, # Pass extractor object directly + batch_size=10, # Small batch size + max_samples=20 # For demonstration ) - print(f"✅ 批处理评测完成,总体得分: {result.overall_metrics.get('overall', 0):.4f}") - - # 为了保存带有抽取内容的数据集,需要重新加载原始数据集 - # 注:这里只是短暂加载用于保存,不影响前面的内存优化评测 + print(f"Batched evaluation complete, overall score: {result.overall_metrics.get('overall', 0):.4f}") + + # To save the dataset with extraction content, reload the original dataset temporarily + # Note: this is only a brief load for saving and does not affect the memory-optimized evaluation above dataset = DataLoader.load_jsonl(dataset_path, include_results=False) dataset.name = result.dataset_name - + else: - print("🔄 使用传统内存模式") - - # 从文件加载数据集 - print(f"📂 从文件加载数据集: {dataset_path}") + print("Using traditional in-memory mode") + + # Load dataset from file + print(f"Loading dataset from file: {dataset_path}") dataset = DataLoader.load_jsonl(dataset_path, include_results=False) dataset.name = "WebMainBench_with_extraction" - dataset.description = "演示抽取内容保存的测试数据集" - - print(f"📊 加载数据集完成,包含 {len(dataset.samples)} 个样本") - - # 运行评测 + dataset.description = "Test dataset demonstrating extraction content saving" + + print(f"Dataset loaded, contains {len(dataset.samples)} samples") + + # Run evaluation result = evaluator.evaluate(dataset, extractor) - - print(f"✅ 评测完成,总体得分: {result.overall_metrics.get('overall', 0):.4f}") - - # 保存带有抽取内容的数据集 + + print(f"Evaluation complete, overall score: {result.overall_metrics.get('overall', 0):.4f}") + + # Save dataset with extracted content results_dir = Path("results") enriched_dataset_path = results_dir / f"{dataset.name}_with_{extractor.name}_extraction.jsonl" - + DataSaver.save_dataset_with_extraction( results=result, - dataset=dataset, + dataset=dataset, file_path=enriched_dataset_path, extractor_name=extractor.name ) - - print(f"💾 已保存带有抽取内容的数据集到: {enriched_dataset_path}") - - # 保存评测结果和摘要报告 + + print(f"Dataset with extracted content saved to: {enriched_dataset_path}") + + # Save evaluation results and summary report evaluation_results_path = results_dir / f"{dataset.name}_{extractor.name}_evaluation_results.json" summary_report_path = results_dir / f"{dataset.name}_{extractor.name}_evaluation_report.csv" - + DataSaver.save_evaluation_results(result, evaluation_results_path) DataSaver.save_summary_report(result, summary_report_path) - - print(f"📊 已保存评测结果到: {evaluation_results_path}") - print(f"📈 已保存摘要报告到: {summary_report_path}") - - # 显示保存的字段信息 - print("\n📋 保存的新字段包括:") - print(f" - {extractor.name}_content: 抽取的内容") - print(f" - {extractor.name}_content_list: 抽取的结构化内容列表") - print(f" - {extractor.name}_success: 抽取是否成功") - print(f" - {extractor.name}_time: 抽取耗时") - print(f" - {extractor.name}_*_score: 各项指标分数") + + print(f"Evaluation results saved to: {evaluation_results_path}") + print(f"Summary report saved to: {summary_report_path}") + + # Display saved field info + print("\nNewly saved fields include:") + print(f" - {extractor.name}_content: extracted content") + print(f" - {extractor.name}_content_list: extracted structured content list") + print(f" - {extractor.name}_success: whether extraction succeeded") + print(f" - {extractor.name}_time: extraction time") + print(f" - {extractor.name}_*_score: metric scores") def demo_multi_extraction(): - """演示保存带有多个抽取器抽取内容的数据集(支持批处理模式)""" - print("=== 演示:保存带有多个抽取器抽取内容的数据集 ===") + """Demonstrate saving a dataset with content from multiple extractors (supports batched mode)""" + print("=== Demo: Saving a Dataset with Multiple Extractor Results ===") from webmainbench import DataLoader, DataSaver, Evaluator, ExtractorFactory from pathlib import Path import time - - # 设置日志 + # Set up logging setup_logging(level="INFO") - # 配置文件路径 + # Configure file paths data_dir = Path("../data") # dataset_path = data_dir / "sample_dataset.jsonl" dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_1904_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl" - print(f"📂 数据集文件: {dataset_path}") + print(f"Dataset file: {dataset_path}") - # 🔧 定义要使用的抽取器列表及配置 + # Define list of extractors and their configurations extractors_info = [ {"name": "resiliparse", "config": { "main_content": True, @@ -755,68 +754,68 @@ def demo_multi_extraction(): {"name": "magic-html", "config": {}}, ] - # 🔧 选择评测模式:内存模式 vs 批处理模式 - USE_BATCHED_MODE = True # 大数据集建议设为True - BATCH_SIZE = 10 # 批处理大小 - MAX_SAMPLES = None # 演示用(全量评测可设为None) + # Choose evaluation mode: in-memory mode vs batched mode + USE_BATCHED_MODE = True # Recommended True for large datasets + BATCH_SIZE = 10 # Batch size + MAX_SAMPLES = None # For demonstration (set None for full evaluation) - # 创建结果目录 + # Create results directory results_dir = Path("results") results_dir.mkdir(exist_ok=True) - # 存储所有抽取器的评测结果和性能数据 + # Store evaluation results and performance data for all extractors all_results = [] extractor_performance = [] - # 为每个抽取器运行评测 + # Run evaluation for each extractor for info in extractors_info: extractor_name = info["name"] config = info["config"] try: - # 创建抽取器实例 + # Create extractor instance extractor = ExtractorFactory.create(extractor_name, config=config) - print(f"\n🤖 使用抽取器: {extractor.name}") + print(f"\nUsing extractor: {extractor.name}") except Exception as e: - print(f"⚠️ {extractor_name} 抽取器创建失败: {e}") + print(f"Failed to create extractor {extractor_name}: {e}") continue - # 记录总耗时 + # Record total elapsed time start_time = time.time() - # 初始化评测器 + # Initialize evaluator evaluator = Evaluator() - # 选择批处理模式或传统模式 + # Choose batched or traditional mode if USE_BATCHED_MODE: - print(f"🔄 使用批处理模式(批大小: {BATCH_SIZE},最大样本: {MAX_SAMPLES or '全部'})") - # 批处理评测(内存优化) + print(f"Using batched mode (batch size: {BATCH_SIZE}, max samples: {MAX_SAMPLES or 'all'})") + # Batched evaluation (memory-optimized) result = evaluator.evaluate_batched( jsonl_file_path=dataset_path, extractor=extractor, batch_size=BATCH_SIZE, max_samples=MAX_SAMPLES ) - # 为保存数据集,临时加载原始数据(不影响内存优化) + # Temporarily load original data for saving (does not affect memory-optimized evaluation) dataset = DataLoader.load_jsonl(dataset_path, include_results=False, max_samples=MAX_SAMPLES) dataset.name = result.dataset_name else: - print("🔄 使用传统内存模式") - # 加载完整数据集到内存 + print("Using traditional in-memory mode") + # Load full dataset into memory dataset = DataLoader.load_jsonl(dataset_path, include_results=False, max_samples=MAX_SAMPLES) dataset.name = "WebMainBench_with_multi_extraction" - dataset.description = "多抽取器内容保存演示数据集" - print(f"📊 加载数据集完成,包含 {len(dataset.samples)} 个样本") + dataset.description = "Multi-extractor content saving demo dataset" + print(f"Dataset loaded, contains {len(dataset.samples)} samples") - # 传统模式评测 + # Traditional mode evaluation result = evaluator.evaluate(dataset, extractor) - # 计算耗时指标 + # Calculate elapsed time metrics total_time = time.time() - start_time total_samples = len(dataset.samples) avg_time_per_sample = total_time / total_samples if total_samples else 0 - # 保存性能数据 + # Save performance data extractor_performance.append({ "name": extractor_name, "total_samples": total_samples, @@ -824,19 +823,19 @@ def demo_multi_extraction(): "avg_time_per_sample": avg_time_per_sample }) - # 输出评测结果 - print(f"⏱️ 总耗时: {total_time:.4f}秒(单样本平均: {avg_time_per_sample:.4f}秒)") - print(f"📊 核心指标:") + # Output evaluation results + print(f"Total time: {total_time:.4f}s (avg per sample: {avg_time_per_sample:.4f}s)") + print(f"Core metrics:") print(f" code_edit: {result.overall_metrics.get('code_edit', 0):.4f}") print(f" formula_edit: {result.overall_metrics.get('formula_edit', 0):.4f}") print(f" table_TEDS: {result.overall_metrics.get('table_TEDS', 0):.4f}") print(f" table_edit: {result.overall_metrics.get('table_edit', 0):.4f}") print(f" text_edit: {result.overall_metrics.get('text_edit', 0):.4f}") - print(f"✅ 总体得分: {result.overall_metrics.get('overall', 0):.4f}") + print(f"Overall score: {result.overall_metrics.get('overall', 0):.4f}") all_results.append(result) - # 保存带有当前抽取器内容的数据集 + # Save dataset with current extractor's content enriched_dataset_path = results_dir / f"{dataset.name}_{extractor.name}_extraction_infer.jsonl" DataSaver.save_dataset_with_extraction( results=result, @@ -844,153 +843,153 @@ def demo_multi_extraction(): file_path=enriched_dataset_path, extractor_name=extractor.name ) - print(f"💾 已保存抽取内容到: {enriched_dataset_path}") + print(f"Extracted content saved to: {enriched_dataset_path}") - # 保存单个抽取器的评测结果 + # Save individual extractor evaluation results eval_results_path = results_dir / f"{dataset.name}_{extractor.name}_evaluation_results.json" DataSaver.save_evaluation_results(result, eval_results_path) - print(f"📋 已保存评测结果到: {eval_results_path}") + print(f"Evaluation results saved to: {eval_results_path}") - # 保存所有抽取器的汇总报告 + # Save summary report for all extractors if all_results: summary_path = results_dir / f"{dataset.name}_multi_extractors_summary_report.csv" DataSaver.save_summary_report(all_results, summary_path) - print(f"\n📈 已保存汇总报告到: {summary_path}") + print(f"\nSummary report saved to: {summary_path}") - # 展示性能对比 + # Display performance comparison if extractor_performance: - print("\n⚡ 抽取器性能对比:") + print("\nExtractor performance comparison:") for perf in extractor_performance: print(f" {perf['name']}:") - print(f" 样本数: {perf['total_samples']}") - print(f" 总耗时: {perf['total_time']:.4f}秒") - print(f" 单样本耗时: {perf['avg_time_per_sample']:.4f}秒") - print(f" 效率: {1 / perf['avg_time_per_sample']:.2f}样本/秒") + print(f" Samples: {perf['total_samples']}") + print(f" Total time: {perf['total_time']:.4f}s") + print(f" Time per sample: {perf['avg_time_per_sample']:.4f}s") + print(f" Throughput: {1 / perf['avg_time_per_sample']:.2f} samples/s") - # 展示保存的字段信息 - print("\n📋 保存的新字段说明:") + # Display saved field information + print("\nSaved new field descriptions:") for info in extractors_info: name = info["name"] - print(f" {name}相关字段:") - print(f" - {name}_content: 抽取的原始内容") - print(f" - {name}_content_list: 结构化内容列表(含type字段)") - print(f" - {name}_success: 抽取是否成功(布尔值)") - print(f" - {name}_time: 单样本抽取耗时(秒)") - print(f" - {name}_*_score: 各指标得分(如{name}_text_edit)") + print(f" {name} related fields:") + print(f" - {name}_content: extracted raw content") + print(f" - {name}_content_list: structured content list (with type field)") + print(f" - {name}_success: whether extraction succeeded (boolean)") + print(f" - {name}_time: per-sample extraction time (seconds)") + print(f" - {name}_*_score: metric scores (e.g. {name}_text_edit)") def demo_llm_webkit_with_preprocessed_html_evaluation(): - """演示LLM-WebKit预处理HTML功能的评测""" - - print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n") - - # 设置日志 + """Demonstrate evaluation of LLM-WebKit preprocessed HTML feature""" + + print("\n=== LLM-WebKit Preprocessed HTML Feature Demo ===\n") + + # Set up logging setup_logging(level="INFO") - - # 1. 从真实数据集加载包含预处理HTML的数据 - print("1. 从真实数据集加载预处理HTML数据...") + + # 1. Load preprocessed HTML data from the real dataset + print("1. Loading preprocessed HTML data from the real dataset...") dataset_path = Path("data/track_id_diff_result_56.jsonl") - print(f"📂 数据集文件: {dataset_path}") - - # 加载数据集 + print(f"Dataset file: {dataset_path}") + + # Load dataset dataset = DataLoader.load_jsonl(dataset_path, include_results=False) dataset.name = "real_preprocessed_html_test" - dataset.description = "基于真实数据的预处理HTML功能测试" - - print(f"✅ 真实数据集加载成功,包含 {len(dataset)} 个样本") - print("📋 真实数据样本包含:") - print(" - html: 原始网页HTML") - print(" - llm_webkit_html: LLM预处理后的简化HTML(包含_item_id标记)") - print(" - groundtruth_content: 人工标注的标准答案") - print(" - llm_webkit_md: LLM提取的markdown内容") - - - # 2. 创建预处理HTML模式的LLM-WebKit抽取器 - print("2. 创建预处理HTML模式的LLM-WebKit抽取器...") - + dataset.description = "Preprocessed HTML feature test based on real data" + + print(f"Real dataset loaded successfully, contains {len(dataset)} samples") + print("Real data samples include:") + print(" - html: raw web page HTML") + print(" - llm_webkit_html: LLM-preprocessed simplified HTML (with _item_id markers)") + print(" - groundtruth_content: manually annotated ground truth") + print(" - llm_webkit_md: LLM-extracted markdown content") + + + # 2. Create LLM-WebKit extractor in preprocessed HTML mode + print("2. Creating LLM-WebKit extractor in preprocessed HTML mode...") + config = { - "use_preprocessed_html": True, # 🔑 关键配置:启用预处理HTML模式 - "preprocessed_html_field": "llm_webkit_html" # 指定预处理HTML字段名 + "use_preprocessed_html": True, # Key config: enable preprocessed HTML mode + "preprocessed_html_field": "llm_webkit_html" # Specify preprocessed HTML field name } - + extractor = ExtractorFactory.create("llm-webkit", config=config) - - # 4. 运行评测 - print("4. 开始评测...") + + # 4. Run evaluation + print("4. Starting evaluation...") print("=" * 50) - + evaluator = Evaluator() result = evaluator.evaluate( dataset=dataset, extractor=extractor, max_samples=None ) - - # 5. 显示评测结果 - print("\n5. 📊 预处理HTML模式评测结果:") + + # 5. Display evaluation results + print("\n5. Preprocessed HTML mode evaluation results:") print("=" * 50) - + results_dict = result.to_dict() metrics = results_dict.get('overall_metrics', {}) - - # 显示关键指标 - print(f"\n🏆 综合指标:") + + # Display key metrics + print(f"\nOverall metrics:") print(f" overall: {metrics.get('overall', 0):.4f}") - - print(f"\n📝 内容提取质量:") + + print(f"\nContent extraction quality:") print(f" text_edit: {metrics.get('text_edit', 0):.4f}") print(f" code_edit: {metrics.get('code_edit', 0):.4f}") print(f" table_edit: {metrics.get('table_edit', 0):.4f}") print(f" table_TEDS: {metrics.get('table_TEDS', 0):.4f}") - - print(f"\n⚡ 性能统计:") + + print(f"\nPerformance statistics:") sample_results = results_dict.get('sample_results', []) if sample_results: extraction_times = [s.get('extraction_time', 0) for s in sample_results if s.get('extraction_success')] if extraction_times: avg_time = sum(extraction_times) / len(extraction_times) - print(f" 平均提取时间: {avg_time:.3f}秒") - print(f" 处理速度: {1/avg_time:.1f}样本/秒") - + print(f" Average extraction time: {avg_time:.3f}s") + print(f" Processing speed: {1/avg_time:.1f} samples/s") + success_count = len([s for s in sample_results if s.get('extraction_success', False)]) - print(f" 成功样本数: {success_count}/{len(dataset)}") - - # 7. 保存结果 - print(f"\n7. 💾 保存评测结果...") - + print(f" Successful samples: {success_count}/{len(dataset)}") + + # 7. Save results + print(f"\n7. Saving evaluation results...") + results_dir = Path("results") results_dir.mkdir(exist_ok=True) - # 新增:保存带抽取结果的增强数据集(JSONL格式) + # Save enhanced dataset with extraction results (JSONL format) jsonl_dataset_path = results_dir / f"{extractor.name}_preprocessed_html_dataset_with_results.jsonl" DataSaver.save_dataset_with_extraction( results=result, - dataset=dataset, # 原始数据集对象 + dataset=dataset, # Original dataset object file_path=jsonl_dataset_path, - extractor_name="llm-webkit" # 抽取器名称前缀 + extractor_name="llm-webkit" # Extractor name prefix ) - print(f"✅ 带抽取结果的JSONL数据集已保存到: {jsonl_dataset_path}") + print(f"JSONL dataset with extraction results saved to: {jsonl_dataset_path}") results_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_results.json" report_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_report.csv" - + DataSaver.save_evaluation_results(result, results_path) DataSaver.save_summary_report(result, report_path) - - print(f"✅ 详细结果已保存到: {results_path}") - print(f"✅ CSV报告已保存到: {report_path}") + + print(f"Detailed results saved to: {results_path}") + print(f"CSV report saved to: {report_path}") if __name__ == "__main__": try: # demo_basic_mock_evaluation() - # demo_llm_webkit_evaluation() # 使用LLM-WebKit评测示例 + # demo_llm_webkit_evaluation() # LLM-WebKit evaluation example demo_llm_webkit_with_preprocessed_html_evaluation() # demo_extractor_comparison() - # demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集 - # demo_multi_extraction() # 演示多个抽取器同时评测 - print("\n✅ 示例运行完成!") - + # demo_dataset_with_extraction() # Demo saving dataset with extracted content + # demo_multi_extraction() # Demo evaluating with multiple extractors simultaneously + print("\nExample completed!") + except Exception as e: - print(f"\n❌ 运行出错: {e}") + print(f"\nRuntime error: {e}") import traceback traceback.print_exc() \ No newline at end of file diff --git a/examples/demo.py b/examples/demo.py index b460ad5..0c016bd 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -1,15 +1,15 @@ from webmainbench import DataLoader, Evaluator, ExtractorFactory from pathlib import Path -# 1. 加载评测数据集 +# 1. Load evaluation dataset dataset = DataLoader.load_jsonl(Path("data/sample_dataset.jsonl")) -# 2. 创建抽取器 +# 2. Create extractor extractor = ExtractorFactory.create("llm-webkit") -# 3. 运行评测 +# 3. Run evaluation evaluator = Evaluator() result = evaluator.evaluate(dataset, extractor) -# 4. 查看结果 +# 4. View results print(f"Overall Score: {result}") diff --git a/examples/llm_webkit_usage.py b/examples/llm_webkit_usage.py index 4300f55..229cde0 100644 --- a/examples/llm_webkit_usage.py +++ b/examples/llm_webkit_usage.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -LLM-WebKit Extractor使用示例 +LLM-WebKit Extractor Usage Example -本示例展示如何使用集成了VLLM推理能力的LLM-WebKit extractor。 +This example demonstrates how to use the LLM-WebKit extractor integrated with VLLM inference capabilities. """ import time @@ -10,138 +10,138 @@ def main(): - print("🚀 LLM-WebKit Extractor 使用示例\n") - - # 1. 创建带有自定义配置的extractor + print("LLM-WebKit Extractor Usage Example\n") + + # 1. Create extractor with custom configuration config = { - "model_path": "/Users/chupei/model/checkpoint-3296", # 替换为您的模型路径 - "use_logits_processor": True, # 启用JSON格式约束 - "temperature": 0.0, # 确定性输出 - "max_item_count": 500, # 处理的最大item数量 - "max_output_tokens": 4096, # 最大输出token数 - "dtype": "bfloat16", # 模型精度 - "tensor_parallel_size": 1 # 张量并行大小 + "model_path": "/Users/chupei/model/checkpoint-3296", # Replace with your model path + "use_logits_processor": True, # Enable JSON format constraint + "temperature": 0.0, # Deterministic output + "max_item_count": 500, # Maximum number of items to process + "max_output_tokens": 4096, # Maximum output tokens + "dtype": "bfloat16", # Model precision + "tensor_parallel_size": 1 # Tensor parallel size } - + try: extractor = ExtractorFactory.create("llm-webkit", config=config) - print(f"✅ Extractor创建成功: {extractor.description}") - print(f"📋 版本: {extractor.version}") - print(f"⚙️ 配置: {extractor.inference_config.__dict__}\n") - + print(f"Extractor created successfully: {extractor.description}") + print(f"Version: {extractor.version}") + print(f"Config: {extractor.inference_config.__dict__}\n") + except Exception as e: - print(f"❌ Extractor创建失败: {e}") - print("💡 请确保已安装所需依赖:") + print(f"Extractor creation failed: {e}") + print("Please ensure the required dependencies are installed:") print(" pip install vllm transformers torch llm_web_kit") return - - # 2. 准备测试HTML(包含_item_id属性的结构化HTML) + + # 2. Prepare test HTML (structured HTML with _item_id attributes) test_html = """ - 测试文章 - 人工智能的发展趋势 + Test Article - AI Development Trends - +
-

人工智能的发展趋势

-

作者:张三 | 发布时间:2024-01-15 | 阅读量:1,234

+

AI Development Trends

+

Author: John Doe | Published: 2024-01-15 | Views: 1,234

- +
-

人工智能(AI)技术正在快速发展,对各行各业产生深远影响。本文将探讨AI的主要发展趋势和未来展望。

- -

1. 机器学习的进步

-

深度学习和大语言模型的突破使得AI系统能够理解和生成更自然的语言,在对话、翻译、创作等领域表现出色。

- -

2. 自动化应用

-

从制造业的机器人到软件开发的代码生成,AI正在各个领域实现流程自动化,提高效率并降低成本。

- -

3. 个性化服务

-

基于用户数据的个性化推荐和服务正变得越来越精准,为用户提供更好的体验。

+

Artificial Intelligence (AI) technology is rapidly advancing, with far-reaching impacts across all industries. This article explores the major development trends and future prospects of AI.

+ +

1. Advances in Machine Learning

+

Breakthroughs in deep learning and large language models have enabled AI systems to understand and generate more natural language, excelling in dialogue, translation, and creative tasks.

+ +

2. Automation Applications

+

From robots in manufacturing to code generation in software development, AI is automating processes across domains, improving efficiency and reducing costs.

+ +

3. Personalized Services

+

Personalized recommendations and services based on user data are becoming increasingly precise, providing better user experiences.

- + - + """ - - # 3. 执行内容提取 - print("🔍 开始内容提取...") + + # 3. Execute content extraction + print("Starting content extraction...") start_time = time.time() try: result = extractor.extract(test_html) end_time = time.time() - print(f"⏱️ 提取耗时: {end_time - start_time:.2f}秒\n") - - # 4. 显示提取结果 + print(f"⏱️ Extraction time: {end_time - start_time:.2f}s\n") + + # 4. Display extraction results if result.success: - print("✅ 内容提取成功!\n") - - print("📄 提取的主要内容:") + print("✅ Content extracted successfully!\n") + + print("📄 Extracted main content:") print("=" * 50) print(result.content[:500] + "..." if len(result.content) > 500 else result.content) print("=" * 50) - - print(f"\n📊 提取统计:") - print(f" • 内容长度: {len(result.content)} 字符") - print(f" • 置信度: {result.confidence_score:.3f}") - print(f" • 标题: {result.title}") - print(f" • 语言: {result.language}") - print(f" • 提取时间: {result.extraction_time:.3f}秒") - + + print(f"\n📊 Extraction statistics:") + print(f" • Content length: {len(result.content)} characters") + print(f" • Confidence: {result.confidence_score:.3f}") + print(f" • Title: {result.title}") + print(f" • Language: {result.language}") + print(f" • Extraction time: {result.extraction_time:.3f}s") + if result.content_list: - print(f" • 结构化内容块: {len(result.content_list)}个") - for i, item in enumerate(result.content_list[:3]): # 显示前3个 + print(f" • Structured content blocks: {len(result.content_list)}") + for i, item in enumerate(result.content_list[:3]): # Show first 3 print(f" [{i+1}] {item.get('type', 'unknown')}: {item.get('content', '')[:50]}...") - + else: - print("❌ 内容提取失败") - print(f"错误信息: {result.error_message}") + print("❌ Content extraction failed") + print(f"Error message: {result.error_message}") if result.error_traceback: - print(f"错误详情:\n{result.error_traceback}") - + print(f"Error details:\n{result.error_traceback}") + except Exception as e: - print(f"❌ 提取过程中发生异常: {e}") - - print("\n🎯 高级功能说明:") - print("• 智能分类: 使用LLM理解HTML元素语义,准确区分主要内容和辅助内容") - print("• 格式约束: 通过logits processor确保LLM输出有效的JSON格式") - print("• 性能优化: 自动跳过过于复杂的HTML,支持延迟加载模型") - print("• 详细反馈: 提供分类结果、置信度和性能指标") + print(f"❌ Exception during extraction: {e}") + + print("\n🎯 Advanced feature notes:") + print("• Smart classification: Uses LLM to understand HTML element semantics, accurately distinguishing main content from auxiliary content") + print("• Format constraint: Uses logits processor to ensure valid JSON output from the LLM") + print("• Performance optimization: Automatically skips overly complex HTML, supports lazy model loading") + print("• Detailed feedback: Provides classification results, confidence scores, and performance metrics") if __name__ == "__main__": main() - - print("\n💡 使用提示:") - print("1. 确保已安装所需依赖: vllm, transformers, torch, llm_web_kit") - print("2. 设置正确的模型路径") - print("3. 根据硬件资源调整tensor_parallel_size和dtype") - print("4. 对于大规模HTML,适当调整max_item_count限制") - print("5. 使用use_logits_processor=True确保输出格式可靠性") \ No newline at end of file + + print("\n💡 Usage tips:") + print("1. Ensure required dependencies are installed: vllm, transformers, torch, llm_web_kit") + print("2. Set the correct model path") + print("3. Adjust tensor_parallel_size and dtype based on hardware resources") + print("4. For large-scale HTML, adjust max_item_count accordingly") + print("5. Use use_logits_processor=True to ensure reliable output format") \ No newline at end of file diff --git a/examples/magic_html_extract_demo.py b/examples/magic_html_extract_demo.py index 726c054..ef90532 100644 --- a/examples/magic_html_extract_demo.py +++ b/examples/magic_html_extract_demo.py @@ -1,68 +1,68 @@ import time from webmainbench.extractors import ExtractorFactory -# 配置 MagicHTML 抽取器(这里可根据需要添加更多配置) +# Configure MagicHTML extractor (add more configuration as needed) config = {} try: - # 创建 MagicHTML 抽取器实例 + # Create MagicHTML extractor instance extractor = ExtractorFactory.create("magic-html", config=config) - print(f"✅ Extractor创建成功: {extractor.description}") - print(f"📋 版本: {extractor.version}") - print(f"⚙️ 配置: {extractor.get_config()}\n") + print(f"✅ Extractor created successfully: {extractor.description}") + print(f"📋 Version: {extractor.version}") + print(f"⚙️ Config: {extractor.get_config()}\n") except Exception as e: - print(f"❌ Extractor创建失败: {e}") + print(f"❌ Failed to create extractor: {e}") -# 测试 HTML +# Test HTML test_html = """ -

Python编程教程

-

这是一个Python基础教程,展示如何定义函数。

+

Python Programming Tutorial

+

This is a basic Python tutorial demonstrating how to define functions.

def greet(name):
-    ""问候函数""
+    ""Greeting function""
     return f"Hello, {name}!"
 
-# 使用示例
+# Usage example
 result = greet("World")
 print(result)
-

这个函数可以用来问候任何人。

+

This function can be used to greet anyone.

""" -print("🔍 开始内容提取...") +print("🔍 Starting content extraction...") start_time = time.time() try: result = extractor.extract(test_html) end_time = time.time() - print(f"⏱️ 提取耗时: {end_time - start_time:.2f}秒\n") + print(f"⏱️ Extraction time: {end_time - start_time:.2f}s\n") - # 显示提取结果 + # Display extraction results if result.success: - print("✅ 内容提取成功!\n") + print("✅ Content extracted successfully!\n") - print("📄 提取的主要内容:") + print("📄 Extracted main content:") print("=" * 50) print(result.content[:500] + "..." if len(result.content) > 500 else result.content) print("=" * 50) - print(f"\n📊 提取统计:") - print(f" • 内容长度: {len(result.content)} 字符") - print(f" • 标题: {result.title}") - print(f" • 语言: {result.language}") - print(f" • 提取时间: {result.extraction_time:.3f}秒") + print(f"\n📊 Extraction statistics:") + print(f" • Content length: {len(result.content)} characters") + print(f" • Title: {result.title}") + print(f" • Language: {result.language}") + print(f" • Extraction time: {result.extraction_time:.3f}s") if result.content_list: - print(f" • 结构化内容块: {len(result.content_list)}个") - for i, item in enumerate(result.content_list[:3]): # 显示前3个 + print(f" • Structured content blocks: {len(result.content_list)}") + for i, item in enumerate(result.content_list[:3]): # Show first 3 print(f" [{i + 1}] {item.get('type', 'unknown')}: {item.get('content', '')[:50]}...") else: - print("❌ 内容提取失败") - print(f"错误信息: {result.error_message}") + print("❌ Content extraction failed") + print(f"Error message: {result.error_message}") if result.error_traceback: - print(f"错误详情:\n{result.error_traceback}") + print(f"Error details:\n{result.error_traceback}") except Exception as e: - print(f"❌ 提取过程中发生异常: {e}") \ No newline at end of file + print(f"❌ Exception during extraction: {e}") diff --git a/examples/main_html_eval.py b/examples/main_html_eval.py index cdeee0c..b29d395 100755 --- a/examples/main_html_eval.py +++ b/examples/main_html_eval.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 """ -WebMainBench 基本使用示例 +WebMainBench Basic Usage Example """ import json from pathlib import Path -# 导入 WebMainBench 模块 +# Import WebMainBench modules from webmainbench import ( DataLoader, DataSaver, BenchmarkDataset, DataSample, ExtractorFactory, MainHTMLEvaluator, @@ -16,17 +16,17 @@ def load_benchdata(dataset_path: str) -> BenchmarkDataset: dataset_path = Path(dataset_path) - print(f"📂 数据集文件: {dataset_path}") - + print(f"📂 Dataset file: {dataset_path}") + if not dataset_path.exists(): - print(f"❌ 数据文件不存在: {dataset_path}") - print("请确保已运行数据提取命令创建样本数据集") + print(f"❌ Data file does not exist: {dataset_path}") + print("Please ensure the data extraction command has been run to create the sample dataset") return - - # 加载数据集 + + # Load dataset dataset = DataLoader.load_jsonl(dataset_path, include_results=False) dataset.name = "real_preprocessed_html_test" - dataset.description = "基于真实数据的预处理HTML功能测试" + dataset.description = "Preprocessed HTML feature test based on real data" return dataset @@ -39,104 +39,103 @@ def save_results(result_file: Path, results: list[dict]): with result_file.open("w", encoding="utf-8") as f: for res in results: f.write(json.dumps(res, ensure_ascii=False) + "\n") - - + + def demo_llm_webkit_with_preprocessed_html_evaluation(model_path: str): - """演示LLM-WebKit预处理HTML功能的评测""" - - print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n") - - # 设置日志 + """Demonstrate evaluation of the LLM-WebKit preprocessed HTML feature""" + + print("\n=== LLM-WebKit Preprocessed HTML Feature Demo ===\n") + + # Set up logging setup_logging(level="INFO") - - # 1. 从真实数据集加载包含预处理HTML的数据 - print("1. 从真实数据集加载预处理HTML数据...") - - # 使用DataLoader加载真实的样本数据 - + + # 1. Load preprocessed HTML data from the real dataset + print("1. Loading preprocessed HTML data from the real dataset...") + + # Load real sample data using DataLoader dataset = load_benchdata("data/WebMainBench_llm-webkit_v1_WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl") - print(f"✅ 真实数据集加载成功,包含 {len(dataset)} 个样本") - + print(f"✅ Real dataset loaded successfully, contains {len(dataset)} samples") + + + + # 2. Create LLM-WebKit extractor in preprocessed HTML mode + print("2. Creating LLM-WebKit extractor in preprocessed HTML mode...") - - # 2. 创建预处理HTML模式的LLM-WebKit抽取器 - print("2. 创建预处理HTML模式的LLM-WebKit抽取器...") - extractor = load_extractor(model_path) - print(f"✅ 抽取器创建成功") - print(f"📋 配置信息:") - print(f" - 跳过LLM推理: 是(直接处理预处理HTML)") + print(f"✅ Extractor created successfully") + print(f"📋 Configuration:") + print(f" - Skip LLM inference: Yes (process preprocessed HTML directly)") print() - - # 4. 运行评测 - print("4. 开始评测...") + + # 4. Run evaluation + print("4. Starting evaluation...") print("=" * 50) - + evaluator = MainHTMLEvaluator() result = evaluator.evaluate( dataset=dataset, extractor=extractor, max_samples=None ) - - # 5. 显示评测结果 - print("\n5. 📊 预处理HTML模式评测结果:") + + # 5. Display evaluation results + print("\n5. 📊 Preprocessed HTML mode evaluation results:") print("=" * 50) - + results_dict = result.to_dict() metrics = results_dict.get('overall_metrics', {}) - - # 显示关键指标 - print(f"\n🏆 综合指标:") + + # Display key metrics + print(f"\n🏆 Overall metrics:") for key in metrics.keys(): print(f" {key}: {metrics[key]:.4f}") - - print(f"\n⚡ 性能统计:") + + print(f"\n⚡ Performance statistics:") sample_results = results_dict.get('sample_results', []) if sample_results: extraction_times = [s.get('extraction_time', 0) for s in sample_results if s.get('extraction_success')] if extraction_times: avg_time = sum(extraction_times) / len(extraction_times) - print(f" 平均提取时间: {avg_time:.3f}秒") - print(f" 处理速度: {1/avg_time:.1f}样本/秒") - + print(f" Average extraction time: {avg_time:.3f}s") + print(f" Processing speed: {1/avg_time:.1f} samples/s") + success_count = len([s for s in sample_results if s.get('extraction_success', False)]) - print(f" 成功样本数: {success_count}/{len(dataset)}") - - # 7. 保存结果 - print(f"\n6. 💾 保存评测结果...") - + print(f" Successful samples: {success_count}/{len(dataset)}") + + # 7. Save results + print(f"\n6. 💾 Saving evaluation results...") + results_dir = Path("results") results_dir.mkdir(exist_ok=True) - # 新增:保存带抽取结果的增强数据集(JSONL格式) + # Save enhanced dataset with extraction results (JSONL format) jsonl_dataset_path = results_dir / f"{extractor.name}_preprocessed_html_dataset_with_results.jsonl" save_results(jsonl_dataset_path, result.sample_results) - print(f"✅ 结果已保存到: {jsonl_dataset_path}") - - - print(f"✅ 带抽取结果的JSONL数据集已保存到: {jsonl_dataset_path}") + print(f"✅ Results saved to: {jsonl_dataset_path}") + + + print(f"✅ JSONL dataset with extraction results saved to: {jsonl_dataset_path}") results_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_results.json" report_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_report.csv" - + DataSaver.save_evaluation_results(result, results_path) DataSaver.save_summary_report(result, report_path) - - print(f"✅ 详细结果已保存到: {results_path}") - print(f"✅ CSV报告已保存到: {report_path}") - + + print(f"✅ Detailed results saved to: {results_path}") + print(f"✅ CSV report saved to: {report_path}") + if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description="WebMainBench 基本使用示例") - parser.add_argument("--model_path", required=True, help="LLM model路径") + parser = argparse.ArgumentParser(description="WebMainBench Basic Usage Example") + parser.add_argument("--model_path", required=True, help="LLM model path") args = parser.parse_args() try: demo_llm_webkit_with_preprocessed_html_evaluation(args.model_path) - print("\n✅ 示例运行完成!") - + print("\n✅ Example completed!") + except Exception as e: - print(f"\n❌ 运行出错: {e}") + print(f"\n❌ Runtime error: {e}") import traceback - traceback.print_exc() \ No newline at end of file + traceback.print_exc() diff --git a/examples/multi_extractor_compare.py b/examples/multi_extractor_compare.py index 9b3a56f..6b3390d 100644 --- a/examples/multi_extractor_compare.py +++ b/examples/multi_extractor_compare.py @@ -1,56 +1,56 @@ from webmainbench import DataLoader, Evaluator, ExtractorFactory, DataSaver from pathlib import Path -# 如需调用LLM修正抽取结果,在 webmainbench/config.py 中配置 LLM api +# To use LLM to correct extraction results, configure the LLM API in webmainbench/config.py def all_extractor_comparison(): - """演示多抽取器对比""" - - print("\n=== 多抽取器对比演示 ===\n") - - # 创建数据集 + """Demonstrate multi-extractor comparison""" + + print("\n=== Multi-Extractor Comparison Demo ===\n") + + # Create dataset dataset_path = Path("../data/WebMainBench_llm-webkit_v1_WebMainBench_7887_within_formula.jsonl") dataset = DataLoader.load_jsonl(dataset_path) - # 创建webkit抽取器 + # Create webkit extractor config = { - "use_preprocessed_html": True, # 🔑 关键配置:启用预处理HTML模式 - "preprocessed_html_field": "llm_webkit_html" # 指定预处理HTML字段名 + "use_preprocessed_html": True, # Key config: enable preprocessed HTML mode + "preprocessed_html_field": "llm_webkit_html" # Specify the preprocessed HTML field name } webkit_extractor = ExtractorFactory.create("llm-webkit", config=config) - # 创建magic-extractor抽取器 + # Create magic-extractor extractor magic_extractor = ExtractorFactory.create("magic-html") - # 创建trafilatura抽取器,抽取成markdown + # Create trafilatura extractor, extract to markdown trafilatura_extractor = ExtractorFactory.create("trafilatura") - # 创建trafilatura抽取器,抽取成txt + # Create trafilatura extractor, extract to txt trafilatura_txt_extractor = ExtractorFactory.create("trafilatura_txt") - # 创建resiliparse抽取器 + # Create resiliparse extractor resiliparse_extractor = ExtractorFactory.create("resiliparse") - - # 运行对比 + + # Run comparison evaluator = Evaluator() extractors = [webkit_extractor, magic_extractor, trafilatura_extractor,trafilatura_txt_extractor, resiliparse_extractor] # extractors = [webkit_extractor] - + results = evaluator.compare_extractors( dataset=dataset, extractors=extractors ) - - # 显示对比结果 - print("对比结果:") + + # Display comparison results + print("Comparison results:") print("-" * 40) for extractor_name, result in results.items(): overall_score = result.overall_metrics.get('overall', 0) print(f"{extractor_name}: {overall_score:.4f}") - - # 保存多抽取器对比榜单 + + # Save multi-extractor comparison leaderboard all_results = [] for extractor_name, result in results.items(): all_results.append(result.to_dict()) - + results_dir = Path("results") results_dir.mkdir(exist_ok=True) leaderboard_path = results_dir / "leaderboard.csv" @@ -60,10 +60,10 @@ def all_extractor_comparison(): DataSaver.save_evaluation_results(all_results, evaluation_results_path) DataSaver.save_dataset_with_extraction( results=all_results, - dataset=dataset, # 原始数据集对象 + dataset=dataset, # Original dataset object file_path=jsonl_dataset_path ) - print(f"\n📊 榜单已保存到: {leaderboard_path}") + print(f"\nLeaderboard saved to: {leaderboard_path}") if __name__ == "__main__": diff --git a/examples/resiliparse_extract_demo.py b/examples/resiliparse_extract_demo.py index ba33a14..17c941f 100644 --- a/examples/resiliparse_extract_demo.py +++ b/examples/resiliparse_extract_demo.py @@ -1,7 +1,7 @@ import time from webmainbench.extractors import ExtractorFactory -# 配置 Resiliparse 抽取器 +# Configure Resiliparse extractor config = { "main_content": True, "alt_texts": True, @@ -14,66 +14,66 @@ } try: - # 创建 Resiliparse 抽取器实例 + # Create Resiliparse extractor instance extractor = ExtractorFactory.create("resiliparse", config=config) - print(f"✅ Extractor创建成功: {extractor.description}") - print(f"📋 版本: {extractor.version}") - print(f"⚙️ 配置: {extractor.get_config()}\n") + print(f"✅ Extractor created successfully: {extractor.description}") + print(f"📋 Version: {extractor.version}") + print(f"⚙️ Config: {extractor.get_config()}\n") except Exception as e: - print(f"❌ Extractor创建失败: {e}") + print(f"❌ Failed to create extractor: {e}") -# 测试 HTML +# Test HTML test_html = """ -

Python编程教程

-

这是一个Python基础教程,展示如何定义函数。

+

Python Programming Tutorial

+

This is a basic Python tutorial demonstrating how to define functions.

def greet(name):
-    ""问候函数""
+    ""Greeting function""
     return f"Hello, {name}!"
 
-# 使用示例
+# Usage example
 result = greet("World")
 print(result)
-

这个函数可以用来问候任何人。

+

This function can be used to greet anyone.

""" -print("🔍 开始内容提取...") +print("🔍 Starting content extraction...") start_time = time.time() try: result = extractor.extract(test_html) end_time = time.time() - print(f"⏱️ 提取耗时: {end_time - start_time:.2f}秒\n") + print(f"⏱️ Extraction time: {end_time - start_time:.2f}s\n") - # 显示提取结果 + # Display extraction results if result.success: - print("✅ 内容提取成功!\n") + print("✅ Content extracted successfully!\n") - print("📄 提取的主要内容:") + print("📄 Extracted main content:") print("=" * 50) print(result.content[:500] + "..." if len(result.content) > 500 else result.content) print("=" * 50) - print(f"\n📊 提取统计:") - print(f" • 内容长度: {len(result.content)} 字符") - print(f" • 标题: {result.title}") - print(f" • 语言: {result.language}") - print(f" • 提取时间: {result.extraction_time:.3f}秒") + print(f"\n📊 Extraction statistics:") + print(f" • Content length: {len(result.content)} characters") + print(f" • Title: {result.title}") + print(f" • Language: {result.language}") + print(f" • Extraction time: {result.extraction_time:.3f}s") if result.content_list: - print(f" • 结构化内容块: {len(result.content_list)}个") - for i, item in enumerate(result.content_list[:3]): # 显示前3个 + print(f" • Structured content blocks: {len(result.content_list)}") + for i, item in enumerate(result.content_list[:3]): # Show first 3 print(f" [{i + 1}] {item.get('type', 'unknown')}: {item.get('content', '')[:50]}...") else: - print("❌ 内容提取失败") - print(f"错误信息: {result.error_message}") + print("❌ Content extraction failed") + print(f"Error message: {result.error_message}") if result.error_traceback: - print(f"错误详情:\n{result.error_traceback}") + print(f"Error details:\n{result.error_traceback}") except Exception as e: - print(f"❌ 提取过程中发生异常: {e}") + print(f"❌ Exception during extraction: {e}") diff --git a/examples/teds_usage.py b/examples/teds_usage.py index 000f288..70d4e51 100644 --- a/examples/teds_usage.py +++ b/examples/teds_usage.py @@ -1,8 +1,9 @@ #!/usr/bin/env python3 """ -WebMainBench TEDS 算法使用示例 +WebMainBench TEDS Algorithm Usage Example -展示如何在评估中使用 TEDS (Tree-Edit Distance based Similarity) 算法进行表格评估 +Demonstrates how to use the TEDS (Tree-Edit Distance based Similarity) algorithm +for table evaluation in assessments. """ import sys @@ -18,268 +19,268 @@ def demo_teds_configuration(): - """演示如何配置 TEDS 算法""" - print("=== 🔧 TEDS 配置示例 ===\n") - - # 方法1: 使用 TableTEDSMetric 指标 - print("**方法1: 使用专用的 TableTEDSMetric 指标**") + """Demonstrate how to configure the TEDS algorithm""" + print("=== TEDS Configuration Example ===\n") + + # Method 1: Use the TableTEDSMetric metric + print("**Method 1: Use the dedicated TableTEDSMetric metric**") evaluation_config = { "metrics": { "table_extraction": { - "use_teds": True, # 启用 TEDS 算法 - "structure_only": False # 同时考虑结构和内容 + "use_teds": True, # Enable TEDS algorithm + "structure_only": False # Consider both structure and content } } } - print("配置:", evaluation_config) + print("Config:", evaluation_config) print() - - # 方法2: 直接使用 TEDS 指标 - print("**方法2: 直接使用独立的 TEDS 指标**") + + # Method 2: Use TEDS metric directly + print("**Method 2: Use the standalone TEDS metric directly**") teds_config = { "metrics": { "teds": { "structure_only": False, "ignore_nodes": ["tbody", "thead", "tfoot"] }, - "s_teds": { # 结构化 TEDS + "s_teds": { # Structural TEDS "structure_only": True } } } - print("配置:", teds_config) + print("Config:", teds_config) print() def demo_teds_comparison(): - """演示 TEDS 与简单算法的对比""" - print("=== ⚖️ TEDS vs 简单算法对比 ===\n") - - # 准备测试数据 + """Demonstrate comparison of TEDS vs simple algorithm""" + print("=== TEDS vs Simple Algorithm Comparison ===\n") + + # Prepare test data test_cases = [ { - "name": "完全匹配的表格", + "name": "Perfectly matching table", "extracted": """
产品销量收入ProductSalesRevenue
产品AProduct A 100 1000
产品BProduct B 200 3000
- - - + + +
产品价格
苹果5元
橙子3元
ProductPrice
Apple$5
Orange$3
""", "groundtruth": """ - - - + + +
产品价格
苹果5元
橙子3元
ProductPrice
Apple$5
Orange$3
""" }, { - "name": "缺少行的表格", + "name": "Table with missing row", "extracted": """ - - + +
产品价格
苹果5元
ProductPrice
Apple$5
""", "groundtruth": """ - - - - + + + +
产品价格
苹果5元
橙子3元
香蕉4元
ProductPrice
Apple$5
Orange$3
Banana$4
""" }, { - "name": "结构不同的表格", + "name": "Table with different structure", "extracted": """ - - + +
产品价格
苹果5元
ProductPrice
Apple$5
""", "groundtruth": """ - - + +
产品价格库存
苹果5元100
ProductPriceStock
Apple$5100
""" } ] - - print("| 测试用例 | 简单算法 | TEDS算法 | S-TEDS | 差异 |") + + print("| Test case | Simple | TEDS | S-TEDS | Diff |") print("|---------|---------|---------|--------|------|") - + for case in test_cases: - # 简单算法评估 + # Simple algorithm evaluation simple_evaluator = Evaluator(task_config={ "metrics": { "table_extraction": {"use_teds": False} } }) - - # TEDS 算法评估 + + # TEDS algorithm evaluation teds_evaluator = Evaluator(task_config={ "metrics": { "table_extraction": {"use_teds": True} } }) - - # 创建模拟数据 + + # Create mock data sample = DataSample( id=f"test_{case['name']}", - html="
测试HTML
", - content="测试内容", + html="
Test HTML
", + content="Test content", content_list=[{"table": case["groundtruth"]}] ) - + extraction_result = ExtractionResult( extractor_name="test", - extracted_content="测试内容", + extracted_content="Test content", extracted_content_list=[{"table": case["extracted"]}] ) - - # 计算得分 + + # Calculate scores try: simple_result = simple_evaluator.evaluate_single(sample, extraction_result) teds_result = teds_evaluator.evaluate_single(sample, extraction_result) - + simple_score = simple_result.overall_metrics.get("table_extraction", 0.0) teds_score = teds_result.overall_metrics.get("table_extraction", 0.0) - - # S-TEDS (结构化) 评估 + + # S-TEDS (structure-only) evaluation s_teds = StructureTEDSMetric("s_teds") s_teds_result = s_teds.calculate(case["extracted"], case["groundtruth"]) s_teds_score = s_teds_result.score - + diff = abs(simple_score - teds_score) - + print(f"| {case['name'][:10]}... | {simple_score:.4f} | {teds_score:.4f} | {s_teds_score:.4f} | {diff:.4f} |") - + except Exception as e: - print(f"| {case['name'][:10]}... | 错误 | 错误 | 错误 | - |") - print(f" 错误信息: {e}") - + print(f"| {case['name'][:10]}... | Error | Error | Error | - |") + print(f" Error message: {e}") + print() def demo_advanced_teds_features(): - """演示 TEDS 的高级功能""" - print("=== 🚀 TEDS 高级功能演示 ===\n") - - # 1. 处理 Markdown 表格 - print("**1. Markdown 表格支持**") + """Demonstrate advanced TEDS features""" + print("=== TEDS Advanced Feature Demo ===\n") + + # 1. Handle Markdown tables + print("**1. Markdown Table Support**") teds = TEDSMetric("teds") - + markdown_table = """ - | 姓名 | 年龄 | 职业 | + | Name | Age | Occupation | |------|------|------| - | 张三 | 25 | 工程师 | - | 李四 | 30 | 设计师 | + | Alice | 25 | Engineer | + | Bob | 30 | Designer | """ - + html_table = """ - - - + + +
姓名年龄职业
张三25工程师
李四30设计师
NameAgeOccupation
Alice25Engineer
Bob30Designer
""" - + result = teds.calculate(markdown_table, html_table) - print(f"Markdown vs HTML 表格 TEDS 得分: {result.score:.4f}") - print(f"详细信息: {result.details}") + print(f"Markdown vs HTML table TEDS score: {result.score:.4f}") + print(f"Details: {result.details}") print() - - # 2. 复杂表格结构 - print("**2. 复杂表格结构支持 (colspan, rowspan)**") + + # 2. Complex table structure + print("**2. Complex Table Structure Support (colspan, rowspan)**") complex_table1 = """ - - - - + + + +
学生信息
姓名成绩
张三95
李四87
Student Info
NameScore
Alice95
Bob87
""" - + complex_table2 = """ - - - - + + + +
类别详情
姓名成绩
张三95
李四87
CategoryDetails
NameScore
Alice95
Bob87
""" - + result = teds.calculate(complex_table1, complex_table2) - print(f"复杂表格结构 TEDS 得分: {result.score:.4f}") - print(f"编辑距离: {result.details.get('edit_distance')}") - print(f"节点数量: 预测={result.details.get('predicted_nodes')}, 真实={result.details.get('groundtruth_nodes')}") + print(f"Complex table structure TEDS score: {result.score:.4f}") + print(f"Edit distance: {result.details.get('edit_distance')}") + print(f"Node count: predicted={result.details.get('predicted_nodes')}, groundtruth={result.details.get('groundtruth_nodes')}") print() - - # 3. 结构化 vs 内容敏感评估 - print("**3. 结构化 vs 内容敏感评估对比**") + + # 3. Structure-only vs content-sensitive evaluation + print("**3. Structure-only vs Content-sensitive Evaluation Comparison**") content_teds = TEDSMetric("content_teds", {"structure_only": False}) structure_teds = StructureTEDSMetric("structure_teds") - + table_diff_content = """ - +
AB
数据1数据2
Data1Data2
""" - + table_same_structure = """ - +
XY
值1值2
Value1Value2
""" - + content_result = content_teds.calculate(table_diff_content, table_same_structure) structure_result = structure_teds.calculate(table_diff_content, table_same_structure) - - print(f"内容敏感 TEDS 得分: {content_result.score:.4f}") - print(f"仅结构 S-TEDS 得分: {structure_result.score:.4f}") - print(f"说明: S-TEDS 忽略文本内容差异,只关注表格结构") + + print(f"Content-sensitive TEDS score: {content_result.score:.4f}") + print(f"Structure-only S-TEDS score: {structure_result.score:.4f}") + print(f"Note: S-TEDS ignores text content differences and only focuses on table structure") print() def demo_evaluation_workflow(): - """演示完整的评估工作流程""" - print("=== 📋 完整评估工作流程 ===\n") - - print("**步骤 1: 准备数据**") - # 模拟评估数据 + """Demonstrate the complete evaluation workflow""" + print("=== Complete Evaluation Workflow ===\n") + + print("**Step 1: Prepare data**") + # Simulated evaluation data sample_data = DataSample( id="sample_001", html="""
-

产品价格表

+

Product Price List

- - - - + + + +
产品价格库存
iPhone5999元50
iPad3999元30
MacBook12999元10
ProductPriceStock
iPhone$59950
iPad$39930
MacBook$129910
""", - content="产品价格表\n\n| 产品 | 价格 | 库存 |\n|------|------|------|\n| iPhone | 5999元 | 50 |\n| iPad | 3999元 | 30 |\n| MacBook | 12999元 | 10 |", + content="Product Price List\n\n| Product | Price | Stock |\n|------|------|------|\n| iPhone | $599 | 50 |\n| iPad | $399 | 30 |\n| MacBook | $1299 | 10 |", content_list=[ { "type": "title", - "content": "产品价格表" + "content": "Product Price List" }, { "type": "table", - "content": "| 产品 | 价格 | 库存 |\n|------|------|------|\n| iPhone | 5999元 | 50 |\n| iPad | 3999元 | 30 |\n| MacBook | 12999元 | 10 |" + "content": "| Product | Price | Stock |\n|------|------|------|\n| iPhone | $599 | 50 |\n| iPad | $399 | 30 |\n| MacBook | $1299 | 10 |" } ] ) - print("✅ 数据准备完成") - - print("\n**步骤 2: 配置 TEDS 评估器**") + print("Data preparation complete") + + print("\n**Step 2: Configure TEDS evaluator**") evaluation_config = { "metrics": { "overall": "edit_distance", @@ -289,74 +290,74 @@ def demo_evaluation_workflow(): } } } - + evaluator = Evaluator(task_config=evaluation_config) - print("✅ 评估器配置完成") - - print("\n**步骤 3: 模拟抽取结果**") - # 模拟一个有轻微错误的抽取结果 + print("Evaluator configuration complete") + + print("\n**Step 3: Simulate extraction results**") + # Simulate extraction result with minor errors extraction_result = ExtractionResult( extractor_name="TestExtractor", - extracted_content="产品价格表\n\n| 产品 | 价格 |\n|------|------|\n| iPhone | 5999元 |\n| iPad | 3999元 |", # 缺少库存列和MacBook行 + extracted_content="Product Price List\n\n| Product | Price |\n|------|------|\n| iPhone | $599 |\n| iPad | $399 |", # Missing stock column and MacBook row extracted_content_list=[ { - "type": "title", - "content": "产品价格表" + "type": "title", + "content": "Product Price List" }, { "type": "table", - "content": "| 产品 | 价格 |\n|------|------|\n| iPhone | 5999元 |\n| iPad | 3999元 |" + "content": "| Product | Price |\n|------|------|\n| iPhone | $599 |\n| iPad | $399 |" } ] ) - print("✅ 模拟抽取结果生成") - - print("\n**步骤 4: 执行评估**") + print("Simulated extraction result generated") + + print("\n**Step 4: Run evaluation**") evaluation_result = evaluator.evaluate_single(sample_data, extraction_result) - - print(f"📊 评估结果:") - print(f" - 整体得分: {evaluation_result.overall_metrics.get('overall', 'N/A'):.4f}") - print(f" - 表格抽取 (TEDS): {evaluation_result.overall_metrics.get('table_extraction', 'N/A'):.4f}") - print(f" - 成功率: {evaluation_result.metadata.get('success_rate', 'N/A'):.2%}") - - # 显示详细的 TEDS 信息 + + print(f"Evaluation results:") + print(f" - Overall score: {evaluation_result.overall_metrics.get('overall', 'N/A'):.4f}") + print(f" - Table extraction (TEDS): {evaluation_result.overall_metrics.get('table_extraction', 'N/A'):.4f}") + print(f" - Success rate: {evaluation_result.metadata.get('success_rate', 'N/A'):.2%}") + + # Display detailed TEDS information if evaluation_result.detailed_metrics: for metric_name, metric_result in evaluation_result.detailed_metrics.items(): if 'teds' in metric_name.lower(): - print(f"\n🔍 {metric_name} 详细信息:") + print(f"\n{metric_name} details:") details = metric_result.details - print(f" - 算法: {details.get('algorithm', 'N/A')}") - print(f" - 编辑距离: {details.get('edit_distance', 'N/A')}") - print(f" - 节点数量 (预测/真实): {details.get('predicted_nodes', 'N/A')}/{details.get('groundtruth_nodes', 'N/A')}") - - print("\n✅ 评估完成") + print(f" - Algorithm: {details.get('algorithm', 'N/A')}") + print(f" - Edit distance: {details.get('edit_distance', 'N/A')}") + print(f" - Node count (predicted/groundtruth): {details.get('predicted_nodes', 'N/A')}/{details.get('groundtruth_nodes', 'N/A')}") + + print("\nEvaluation complete") if __name__ == "__main__": - print("🚀 WebMainBench TEDS 算法使用示例\n") + print("WebMainBench TEDS Algorithm Usage Example\n") print("=" * 60) - + try: demo_teds_configuration() print("=" * 60) - + demo_teds_comparison() print("=" * 60) - + demo_advanced_teds_features() print("=" * 60) - + demo_evaluation_workflow() - - print("\n🎉 所有演示完成!") - print("\n💡 要点总结:") - print(" 1. TEDS 算法提供更学术严谨的表格评估") - print(" 2. 支持 HTML、Markdown 等多种表格格式") - print(" 3. 可配置结构化评估 (S-TEDS) 或内容敏感评估") - print(" 4. 能够准确识别表格结构差异和内容差异") - print(" 5. 与现有评估流程完全兼容") - + + print("\nAll demos complete!") + print("\nKey takeaways:") + print(" 1. TEDS algorithm provides more academically rigorous table evaluation") + print(" 2. Supports multiple table formats including HTML and Markdown") + print(" 3. Configurable structure-only evaluation (S-TEDS) or content-sensitive evaluation") + print(" 4. Accurately identifies table structure differences and content differences") + print(" 5. Fully compatible with existing evaluation workflows") + except Exception as e: - print(f"\n❌ 演示过程中发生错误: {e}") + print(f"\nError during demo: {e}") import traceback - traceback.print_exc() \ No newline at end of file + traceback.print_exc() \ No newline at end of file diff --git a/examples/test_model.py b/examples/test_model.py index 59b88aa..c7bc994 100644 --- a/examples/test_model.py +++ b/examples/test_model.py @@ -1,16 +1,16 @@ from webmainbench import DataLoader, Evaluator, ExtractorFactory -# 1. 加载评测数据集 +# 1. Load evaluation dataset dataset = DataLoader.load_jsonl("WebMainBench/data/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_2549_llm_webkit.jsonl") -# 2. 创建抽取器 +# 2. Create extractor extractor = ExtractorFactory.create("test-model") -# 3. 运行评测 +# 3. Run evaluation evaluator = Evaluator() result = evaluator.evaluate(dataset, extractor) -# 4. 查看结果 +# 4. View results print(f"Overall Score: {result.overall_metrics}") print(f"Category Metrics: {result.category_metrics}") print(f"Error Analysis: {result.error_analysis}") diff --git a/examples/test_table_extract.py b/examples/test_table_extract.py index 00b978c..102409d 100644 --- a/examples/test_table_extract.py +++ b/examples/test_table_extract.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -脚本:仅提取 WebMainBench 数据集中的表格内容到 table.md +Script: Extract only table content from the WebMainBench dataset into table.md """ import json @@ -8,52 +8,52 @@ import os from pathlib import Path -# 添加父目录到 sys.path 以便导入 webmainbench +# Add parent directory to sys.path for importing webmainbench sys.path.append(str(Path(__file__).parent.parent)) from webmainbench.metrics.base import BaseMetric def extract_only_tables_from_dataset(): - """只提取 WebMainBench 数据集中的表格内容并输出到 table.md(table为空的不记录)""" + """Extract only table content from the WebMainBench dataset and output to table.md (items with empty tables are not recorded)""" - # 路径配置 + # Path configuration dataset_path = "/home/zhangshuo/Desktop/vscodeworkspace/WebMainBench/data/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl" output_path = "table.md" - # 检查数据集文件是否存在 + # Check if the dataset file exists if not os.path.exists(dataset_path): - print(f"错误:未找到数据集文件 {dataset_path}") + print(f"Error: dataset file not found: {dataset_path}") return extracted_tables = [] line_ids = [] - # 按行读取 JSONL 文件 + # Read JSONL file line by line with open(dataset_path, 'r', encoding='utf-8') as f: for line_num, line in enumerate(f, 1): try: data = json.loads(line.strip()) - # 提取ID和内容 + # Extract ID and content item_id = data.get('track_id', f'line_{line_num}') content = data.get('llm_webkit_md', '') - # 使用 _extract_from_markdown 提取 + # Use _extract_from_markdown to extract if content: extracted = BaseMetric._extract_from_markdown(content) table_content = extracted.get("table", "") - # 只记录table不为空的项 + # Only record items with non-empty table if table_content and table_content.strip(): extracted_tables.append(table_content) line_ids.append((item_id, line_num)) except json.JSONDecodeError as e: - print(f"解析JSON出错,行{line_num}: {e}") + print(f"JSON parse error at line {line_num}: {e}") continue except Exception as e: - print(f"处理第{line_num}行时出错: {e}") + print(f"Error processing line {line_num}: {e}") continue - # 写入 table.md 文件,只输出 table 字段 + # Write to table.md, output only the table field with open(output_path, 'w', encoding='utf-8') as f: f.write("# Extracted Table Content from WebMainBench Dataset\n\n") f.write(f"Total items processed: {len(extracted_tables)}\n\n") @@ -68,8 +68,8 @@ def extract_only_tables_from_dataset(): f.write("\n```\n\n") f.write("---\n\n") - print(f"表格提取完成!共处理 {len(extracted_tables)} 条数据。") - print(f"表格内容已保存到: {output_path}") + print(f"Table extraction complete! Processed {len(extracted_tables)} items.") + print(f"Table content saved to: {output_path}") if __name__ == "__main__": extract_only_tables_from_dataset() diff --git a/examples/trafilatura_extract_demo.py b/examples/trafilatura_extract_demo.py index 1ee9f3c..f031306 100644 --- a/examples/trafilatura_extract_demo.py +++ b/examples/trafilatura_extract_demo.py @@ -1,20 +1,20 @@ import time from webmainbench.extractors import ExtractorFactory -# 配置 Trafilatura 抽取器(这里可根据需要添加更多配置) +# Configure Trafilatura extractor (add more configuration as needed) config = {} try: - # 创建 Trafilatura 抽取器实例 + # Create Trafilatura extractor instance extractor = ExtractorFactory.create("trafilatura", config=config) - print(f"✅ Extractor创建成功: {extractor.description}") - print(f"📋 版本: {extractor.version}") - print(f"⚙️ 配置: {extractor.get_config()}\n") + print(f"✅ Extractor created successfully: {extractor.description}") + print(f"📋 Version: {extractor.version}") + print(f"⚙️ Config: {extractor.get_config()}\n") except Exception as e: - print(f"❌ Extractor创建失败: {e}") + print(f"❌ Failed to create extractor: {e}") -# 测试 HTML +# Test HTML test_html = """