Here we introduce the time complexity of common algorithms.
+
Quicksort Implementation
def quicksort(arr):
if len(arr) <= 1:
return arr
-
+
pivot = arr[len(arr) // 2]
left = [x for x in arr if x < pivot]
middle = [x for x in arr if x == pivot]
right = [x for x in arr if x > pivot]
-
+
return quicksort(left) + middle + quicksort(right)
-
复杂度对比
+
Complexity Comparison
-
算法
最好情况
平均情况
最坏情况
-
快速排序
O(n log n)
O(n log n)
O(n²)
-
归并排序
O(n log n)
O(n log n)
O(n log n)
-
冒泡排序
O(n)
O(n²)
O(n²)
+
Algorithm
Best Case
Average Case
Worst Case
+
Quicksort
O(n log n)
O(n log n)
O(n²)
+
Merge Sort
O(n log n)
O(n log n)
O(n log n)
+
Bubble Sort
O(n)
O(n²)
O(n²)
-
Master定理:T(n) = aT(n/b) + f(n)
-
其中 a ≥ 1, b > 1 是常数,f(n) 是正函数。
+
Master Theorem: T(n) = aT(n/b) + f(n)
+
Where a ≥ 1, b > 1 are constants, and f(n) is a positive function.
''',
- "groundtruth_content": '''# 算法复杂度分析
+ "groundtruth_content": '''# Algorithm Complexity Analysis
-这里介绍常见算法的时间复杂度。
+Here we introduce the time complexity of common algorithms.
-## 快速排序实现
+## Quicksort Implementation
```python
def quicksort(arr):
if len(arr) <= 1:
return arr
-
+
pivot = arr[len(arr) // 2]
left = [x for x in arr if x < pivot]
middle = [x for x in arr if x == pivot]
right = [x for x in arr if x > pivot]
-
+
return quicksort(left) + middle + quicksort(right)
```
-## 复杂度对比
+## Complexity Comparison
-| 算法 | 最好情况 | 平均情况 | 最坏情况 |
+| Algorithm | Best Case | Average Case | Worst Case |
|------|----------|----------|----------|
-| 快速排序 | O(n log n) | O(n log n) | O(n²) |
-| 归并排序 | O(n log n) | O(n log n) | O(n log n) |
-| 冒泡排序 | O(n) | O(n²) | O(n²) |
+| Quicksort | O(n log n) | O(n log n) | O(n²) |
+| Merge Sort | O(n log n) | O(n log n) | O(n log n) |
+| Bubble Sort | O(n) | O(n²) | O(n²) |
-Master定理:$T(n) = aT(n/b) + f(n)$
+Master Theorem: $T(n) = aT(n/b) + f(n)$
-其中 $a \\geq 1, b > 1$ 是常数,$f(n)$ 是正函数。''',
+Where $a \\geq 1, b > 1$ are constants, and $f(n)$ is a positive function.''',
"groundtruth_content_list": [
- {"type": "heading", "content": "算法复杂度分析", "level": 1},
- {"type": "paragraph", "content": "这里介绍常见算法的时间复杂度。"},
- {"type": "heading", "content": "快速排序实现", "level": 2},
+ {"type": "heading", "content": "Algorithm Complexity Analysis", "level": 1},
+ {"type": "paragraph", "content": "Here we introduce the time complexity of common algorithms."},
+ {"type": "heading", "content": "Quicksort Implementation", "level": 2},
{"type": "code", "content": "def quicksort(arr):\n if len(arr) <= 1:\n return arr\n \n pivot = arr[len(arr) // 2]\n left = [x for x in arr if x < pivot]\n middle = [x for x in arr if x == pivot]\n right = [x for x in arr if x > pivot]\n \n return quicksort(left) + middle + quicksort(right)"},
- {"type": "heading", "content": "复杂度对比", "level": 2},
- {"type": "table", "content": "| 算法 | 最好情况 | 平均情况 | 最坏情况 |\n|------|----------|----------|----------|\n| 快速排序 | O(n log n) | O(n log n) | O(n²) |\n| 归并排序 | O(n log n) | O(n log n) | O(n log n) |\n| 冒泡排序 | O(n) | O(n²) | O(n²) |"},
+ {"type": "heading", "content": "Complexity Comparison", "level": 2},
+ {"type": "table", "content": "| Algorithm | Best Case | Average Case | Worst Case |\n|------|----------|----------|----------|\n| Quicksort | O(n log n) | O(n log n) | O(n²) |\n| Merge Sort | O(n log n) | O(n log n) | O(n log n) |\n| Bubble Sort | O(n) | O(n²) | O(n²) |"},
{"type": "equation-inline", "content": "T(n) = aT(n/b) + f(n)"},
- {"type": "paragraph", "content": "其中 a ≥ 1, b > 1 是常数,f(n) 是正函数。"}
+ {"type": "paragraph", "content": "Where a ≥ 1, b > 1 are constants, and f(n) is a positive function."}
],
"url": "https://algorithm-guide.cs.edu/complexity-analysis",
"layout_id": "algorithm-guide_4",
@@ -280,8 +280,8 @@ def quicksort(arr):
}
]
- # 创建数据集
- dataset = BenchmarkDataset(name="sample_dataset", description="示例评测数据集")
+ # Create dataset
+ dataset = BenchmarkDataset(name="sample_dataset", description="Sample evaluation dataset")
for sample_data in samples:
sample = DataSample.from_dict(sample_data)
@@ -291,140 +291,140 @@ def quicksort(arr):
def demo_basic_mock_evaluation():
- """演示基本评测流程"""
-
- print("=== WebMainBench 基本使用示例 ===\n")
-
- # 设置日志
+ """Demonstrate the basic evaluation workflow"""
+
+ print("=== WebMainBench Basic Usage Example ===\n")
+
+ # Set up logging
setup_logging(level="INFO")
-
- # 1. 创建或加载数据集
- print("1. 创建示例数据集...")
+
+ # 1. Create or load dataset
+ print("1. Creating sample dataset...")
dataset = create_sample_dataset()
- print(f"数据集包含 {len(dataset)} 个样本")
- print(f"数据集统计: {dataset.get_statistics()}\n")
-
- # 2. 保存数据集到文件
+ print(f"Dataset contains {len(dataset)} samples")
+ print(f"Dataset statistics: {dataset.get_statistics()}\n")
+
+ # 2. Save dataset to file
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
-
+
dataset_path = data_dir / "sample_dataset.jsonl"
DataSaver.save_jsonl(dataset, dataset_path, include_results=False)
- print(f"数据集已保存到: {dataset_path}\n")
-
- # 3. 重新加载数据集
- print("2. 重新加载数据集...")
+ print(f"Dataset saved to: {dataset_path}\n")
+
+ # 3. Reload dataset
+ print("2. Reloading dataset...")
loaded_dataset = DataLoader.load_jsonl(dataset_path)
- print(f"加载的数据集包含 {len(loaded_dataset)} 个样本\n")
-
- # 4. 列出可用的抽取器
- print("3. 可用的抽取器:")
+ print(f"Loaded dataset contains {len(loaded_dataset)} samples\n")
+
+ # 4. List available extractors
+ print("3. Available extractors:")
available_extractors = ExtractorFactory.list_available()
for extractor_name in available_extractors:
print(f" - {extractor_name}")
print()
-
- # 5. 创建评测器
- print("4. 创建评测器...")
+
+ # 5. Create evaluator
+ print("4. Creating evaluator...")
evaluator = Evaluator()
- print(f"可用的评测指标: {evaluator.metric_calculator.list_available_metrics()}\n")
-
- # 6. 创建一个模拟抽取器进行演示
- print("5. 创建模拟抽取器...")
-
+ print(f"Available evaluation metrics: {evaluator.metric_calculator.list_available_metrics()}\n")
+
+ # 6. Create a mock extractor for demonstration
+ print("5. Creating mock extractor...")
+
from webmainbench.extractors import BaseExtractor, ExtractionResult
-
+
class MockExtractor(BaseExtractor):
- """模拟抽取器,用于演示"""
-
+ """Mock extractor for demonstration"""
+
def _setup(self):
pass
-
+
def _extract_content(self, html, url=None):
- # 简单的模拟抽取逻辑
- if "标题" in html:
- content = "# 提取的标题\n\n提取的正文内容。"
+ # Simple mock extraction logic
+ if "heading" in html.lower() or "title" in html.lower():
+ content = "# Extracted Title\n\nExtracted body content."
content_list = [
- {"type": "heading", "content": "提取的标题", "level": 1},
- {"type": "paragraph", "content": "提取的正文内容。"}
+ {"type": "heading", "content": "Extracted Title", "level": 1},
+ {"type": "paragraph", "content": "Extracted body content."}
]
else:
- content = "提取的内容"
- content_list = [{"type": "paragraph", "content": "提取的内容"}]
-
+ content = "Extracted content"
+ content_list = [{"type": "paragraph", "content": "Extracted content"}]
+
return ExtractionResult(
content=content,
content_list=content_list,
success=True,
confidence_score=0.85
)
-
- # 注册模拟抽取器
+
+ # Register mock extractor
ExtractorFactory.register("mock", MockExtractor)
mock_extractor = ExtractorFactory.create("mock")
- print("模拟抽取器已创建\n")
-
- # 7. 运行评测
- print("6. 运行评测...")
+ print("Mock extractor created\n")
+
+ # 7. Run evaluation
+ print("6. Running evaluation...")
result = evaluator.evaluate(
dataset=loaded_dataset,
extractor=mock_extractor,
- max_samples=2 # 限制样本数量用于演示
+ max_samples=2 # Limit sample count for demonstration
)
-
- # 8. 显示结果
- print("\n7. 评测结果:")
+
+ # 8. Display results
+ print("\n7. Evaluation results:")
print("=" * 50)
formatted_results = format_results(result.to_dict())
print(formatted_results)
-
- # 9. 保存结果
+
+ # 9. Save results
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
-
+
results_path = results_dir / "mock_evaluation_results.json"
DataSaver.save_evaluation_results(result, results_path)
- print(f"\n结果已保存到: {results_path}")
-
- # 10. 生成报告
+ print(f"\nResults saved to: {results_path}")
+
+ # 10. Generate report
report_path = results_dir / "mock_evaluation_report.csv"
DataSaver.save_summary_report(result, report_path)
- print(f"报告已保存到: {report_path}")
+ print(f"Report saved to: {report_path}")
def demo_llm_webkit_evaluation():
- """演示LLM-WebKit抽取器的6项指标评测"""
-
- print("=== LLM-WebKit Extractor 6项指标评测示例 ===\n")
-
- # 设置日志
+ """Demonstrate 6-metric evaluation with LLM-WebKit extractor"""
+
+ print("=== LLM-WebKit Extractor 6-Metric Evaluation Example ===\n")
+
+ # Set up logging
setup_logging(level="INFO")
-
- # 1. 创建包含各种内容类型的测试数据集
- print("1. 创建包含多种内容类型的测试数据集...")
-
+
+ # 1. Create test dataset with various content types
+ print("1. Creating test dataset with multiple content types...")
+
samples = []
-
- # 样本1: 包含文本和代码
+
+ # Sample 1: text and code
samples.append(DataSample(
id="text_code_sample",
html="""
-
Python编程示例
-
这是一段关于Python编程的介绍文本。
+
Python Programming Example
+
This is an introductory text about Python programming.
Artificial Intelligence (AI) technology is rapidly advancing, with far-reaching impacts across all industries. This article explores the major development trends and future prospects of AI.
+
+
1. Advances in Machine Learning
+
Breakthroughs in deep learning and large language models have enabled AI systems to understand and generate more natural language, excelling in dialogue, translation, and creative tasks.
+
+
2. Automation Applications
+
From robots in manufacturing to code generation in software development, AI is automating processes across domains, improving efficiency and reducing costs.
+
+
3. Personalized Services
+
Personalized recommendations and services based on user data are becoming increasingly precise, providing better user experiences.
-
+
-
+
"""
-
- # 3. 执行内容提取
- print("🔍 开始内容提取...")
+
+ # 3. Execute content extraction
+    print("🔍 Starting content extraction...")
start_time = time.time()
try:
result = extractor.extract(test_html)
end_time = time.time()
- print(f"⏱️ 提取耗时: {end_time - start_time:.2f}秒\n")
-
- # 4. 显示提取结果
+ print(f"⏱️ Extraction time: {end_time - start_time:.2f}s\n")
+
+ # 4. Display extraction results
if result.success:
- print("✅ 内容提取成功!\n")
-
- print("📄 提取的主要内容:")
+ print("✅ Content extracted successfully!\n")
+
+ print("📄 Extracted main content:")
print("=" * 50)
print(result.content[:500] + "..." if len(result.content) > 500 else result.content)
print("=" * 50)
-
- print(f"\n📊 提取统计:")
- print(f" • 内容长度: {len(result.content)} 字符")
- print(f" • 置信度: {result.confidence_score:.3f}")
- print(f" • 标题: {result.title}")
- print(f" • 语言: {result.language}")
- print(f" • 提取时间: {result.extraction_time:.3f}秒")
-
+
+ print(f"\n📊 Extraction statistics:")
+ print(f" • Content length: {len(result.content)} characters")
+ print(f" • Confidence: {result.confidence_score:.3f}")
+ print(f" • Title: {result.title}")
+ print(f" • Language: {result.language}")
+ print(f" • Extraction time: {result.extraction_time:.3f}s")
+
if result.content_list:
- print(f" • 结构化内容块: {len(result.content_list)}个")
- for i, item in enumerate(result.content_list[:3]): # 显示前3个
+ print(f" • Structured content blocks: {len(result.content_list)}")
+ for i, item in enumerate(result.content_list[:3]): # Show first 3
print(f" [{i+1}] {item.get('type', 'unknown')}: {item.get('content', '')[:50]}...")
-
+
else:
- print("❌ 内容提取失败")
- print(f"错误信息: {result.error_message}")
+ print("❌ Content extraction failed")
+ print(f"Error message: {result.error_message}")
if result.error_traceback:
- print(f"错误详情:\n{result.error_traceback}")
-
+ print(f"Error details:\n{result.error_traceback}")
+
except Exception as e:
- print(f"❌ 提取过程中发生异常: {e}")
-
- print("\n🎯 高级功能说明:")
- print("• 智能分类: 使用LLM理解HTML元素语义,准确区分主要内容和辅助内容")
- print("• 格式约束: 通过logits processor确保LLM输出有效的JSON格式")
- print("• 性能优化: 自动跳过过于复杂的HTML,支持延迟加载模型")
- print("• 详细反馈: 提供分类结果、置信度和性能指标")
+ print(f"❌ Exception during extraction: {e}")
+
+ print("\n🎯 Advanced feature notes:")
+ print("• Smart classification: Uses LLM to understand HTML element semantics, accurately distinguishing main content from auxiliary content")
+ print("• Format constraint: Uses logits processor to ensure valid JSON output from the LLM")
+ print("• Performance optimization: Automatically skips overly complex HTML, supports lazy model loading")
+ print("• Detailed feedback: Provides classification results, confidence scores, and performance metrics")
if __name__ == "__main__":
main()
-
- print("\n💡 使用提示:")
- print("1. 确保已安装所需依赖: vllm, transformers, torch, llm_web_kit")
- print("2. 设置正确的模型路径")
- print("3. 根据硬件资源调整tensor_parallel_size和dtype")
- print("4. 对于大规模HTML,适当调整max_item_count限制")
- print("5. 使用use_logits_processor=True确保输出格式可靠性")
\ No newline at end of file
+
+ print("\n💡 Usage tips:")
+ print("1. Ensure required dependencies are installed: vllm, transformers, torch, llm_web_kit")
+ print("2. Set the correct model path")
+ print("3. Adjust tensor_parallel_size and dtype based on hardware resources")
+ print("4. For large-scale HTML, adjust max_item_count accordingly")
+ print("5. Use use_logits_processor=True to ensure reliable output format")
\ No newline at end of file
diff --git a/examples/magic_html_extract_demo.py b/examples/magic_html_extract_demo.py
index 726c054..ef90532 100644
--- a/examples/magic_html_extract_demo.py
+++ b/examples/magic_html_extract_demo.py
@@ -1,68 +1,68 @@
import time
from webmainbench.extractors import ExtractorFactory
-# 配置 MagicHTML 抽取器(这里可根据需要添加更多配置)
+# Configure MagicHTML extractor (add more configuration as needed)
config = {}
try:
- # 创建 MagicHTML 抽取器实例
+ # Create MagicHTML extractor instance
extractor = ExtractorFactory.create("magic-html", config=config)
- print(f"✅ Extractor创建成功: {extractor.description}")
- print(f"📋 版本: {extractor.version}")
- print(f"⚙️ 配置: {extractor.get_config()}\n")
+ print(f"✅ Extractor created successfully: {extractor.description}")
+ print(f"📋 Version: {extractor.version}")
+ print(f"⚙️ Config: {extractor.get_config()}\n")
except Exception as e:
- print(f"❌ Extractor创建失败: {e}")
+ print(f"❌ Failed to create extractor: {e}")
-# 测试 HTML
+# Test HTML
test_html = """
-
Python编程教程
-
这是一个Python基础教程,展示如何定义函数。
+
Python Programming Tutorial
+
This is a basic Python tutorial demonstrating how to define functions.
def greet(name):
- ""问候函数""
+    """Greeting function"""
return f"Hello, {name}!"
-# 使用示例
+# Usage example
result = greet("World")
print(result)
-
这个函数可以用来问候任何人。
+
This function can be used to greet anyone.
"""
-print("🔍 开始内容提取...")
+print("🔍 Starting content extraction...")
start_time = time.time()
try:
result = extractor.extract(test_html)
end_time = time.time()
- print(f"⏱️ 提取耗时: {end_time - start_time:.2f}秒\n")
+ print(f"⏱️ Extraction time: {end_time - start_time:.2f}s\n")
- # 显示提取结果
+ # Display extraction results
if result.success:
- print("✅ 内容提取成功!\n")
+ print("✅ Content extracted successfully!\n")
- print("📄 提取的主要内容:")
+ print("📄 Extracted main content:")
print("=" * 50)
print(result.content[:500] + "..." if len(result.content) > 500 else result.content)
print("=" * 50)
- print(f"\n📊 提取统计:")
- print(f" • 内容长度: {len(result.content)} 字符")
- print(f" • 标题: {result.title}")
- print(f" • 语言: {result.language}")
- print(f" • 提取时间: {result.extraction_time:.3f}秒")
+ print(f"\n📊 Extraction statistics:")
+ print(f" • Content length: {len(result.content)} characters")
+ print(f" • Title: {result.title}")
+ print(f" • Language: {result.language}")
+ print(f" • Extraction time: {result.extraction_time:.3f}s")
if result.content_list:
- print(f" • 结构化内容块: {len(result.content_list)}个")
- for i, item in enumerate(result.content_list[:3]): # 显示前3个
+ print(f" • Structured content blocks: {len(result.content_list)}")
+ for i, item in enumerate(result.content_list[:3]): # Show first 3
print(f" [{i + 1}] {item.get('type', 'unknown')}: {item.get('content', '')[:50]}...")
else:
- print("❌ 内容提取失败")
- print(f"错误信息: {result.error_message}")
+ print("❌ Content extraction failed")
+ print(f"Error message: {result.error_message}")
if result.error_traceback:
- print(f"错误详情:\n{result.error_traceback}")
+ print(f"Error details:\n{result.error_traceback}")
except Exception as e:
- print(f"❌ 提取过程中发生异常: {e}")
\ No newline at end of file
+ print(f"❌ Exception during extraction: {e}")
diff --git a/examples/main_html_eval.py b/examples/main_html_eval.py
index cdeee0c..b29d395 100755
--- a/examples/main_html_eval.py
+++ b/examples/main_html_eval.py
@@ -1,12 +1,12 @@
#!/usr/bin/env python3
"""
-WebMainBench 基本使用示例
+WebMainBench Basic Usage Example
"""
import json
from pathlib import Path
-# 导入 WebMainBench 模块
+# Import WebMainBench modules
from webmainbench import (
DataLoader, DataSaver, BenchmarkDataset, DataSample,
ExtractorFactory, MainHTMLEvaluator,
@@ -16,17 +16,17 @@
def load_benchdata(dataset_path: str) -> BenchmarkDataset:
dataset_path = Path(dataset_path)
- print(f"📂 数据集文件: {dataset_path}")
-
+ print(f"📂 Dataset file: {dataset_path}")
+
if not dataset_path.exists():
- print(f"❌ 数据文件不存在: {dataset_path}")
- print("请确保已运行数据提取命令创建样本数据集")
+ print(f"❌ Data file does not exist: {dataset_path}")
+ print("Please ensure the data extraction command has been run to create the sample dataset")
return
-
- # 加载数据集
+
+ # Load dataset
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
dataset.name = "real_preprocessed_html_test"
- dataset.description = "基于真实数据的预处理HTML功能测试"
+ dataset.description = "Preprocessed HTML feature test based on real data"
return dataset
@@ -39,104 +39,103 @@ def save_results(result_file: Path, results: list[dict]):
with result_file.open("w", encoding="utf-8") as f:
for res in results:
f.write(json.dumps(res, ensure_ascii=False) + "\n")
-
-
+
+
def demo_llm_webkit_with_preprocessed_html_evaluation(model_path: str):
- """演示LLM-WebKit预处理HTML功能的评测"""
-
- print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n")
-
- # 设置日志
+ """Demonstrate evaluation of the LLM-WebKit preprocessed HTML feature"""
+
+ print("\n=== LLM-WebKit Preprocessed HTML Feature Demo ===\n")
+
+ # Set up logging
setup_logging(level="INFO")
-
- # 1. 从真实数据集加载包含预处理HTML的数据
- print("1. 从真实数据集加载预处理HTML数据...")
-
- # 使用DataLoader加载真实的样本数据
-
+
+ # 1. Load preprocessed HTML data from the real dataset
+ print("1. Loading preprocessed HTML data from the real dataset...")
+
+ # Load real sample data using DataLoader
dataset = load_benchdata("data/WebMainBench_llm-webkit_v1_WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
- print(f"✅ 真实数据集加载成功,包含 {len(dataset)} 个样本")
-
+ print(f"✅ Real dataset loaded successfully, contains {len(dataset)} samples")
+
+
+
+ # 2. Create LLM-WebKit extractor in preprocessed HTML mode
+ print("2. Creating LLM-WebKit extractor in preprocessed HTML mode...")
-
- # 2. 创建预处理HTML模式的LLM-WebKit抽取器
- print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")
-
extractor = load_extractor(model_path)
- print(f"✅ 抽取器创建成功")
- print(f"📋 配置信息:")
- print(f" - 跳过LLM推理: 是(直接处理预处理HTML)")
+ print(f"✅ Extractor created successfully")
+ print(f"📋 Configuration:")
+ print(f" - Skip LLM inference: Yes (process preprocessed HTML directly)")
print()
-
- # 4. 运行评测
- print("4. 开始评测...")
+
+ # 4. Run evaluation
+ print("4. Starting evaluation...")
print("=" * 50)
-
+
evaluator = MainHTMLEvaluator()
result = evaluator.evaluate(
dataset=dataset,
extractor=extractor,
max_samples=None
)
-
- # 5. 显示评测结果
- print("\n5. 📊 预处理HTML模式评测结果:")
+
+ # 5. Display evaluation results
+ print("\n5. 📊 Preprocessed HTML mode evaluation results:")
print("=" * 50)
-
+
results_dict = result.to_dict()
metrics = results_dict.get('overall_metrics', {})
-
- # 显示关键指标
- print(f"\n🏆 综合指标:")
+
+ # Display key metrics
+ print(f"\n🏆 Overall metrics:")
for key in metrics.keys():
print(f" {key}: {metrics[key]:.4f}")
-
- print(f"\n⚡ 性能统计:")
+
+ print(f"\n⚡ Performance statistics:")
sample_results = results_dict.get('sample_results', [])
if sample_results:
extraction_times = [s.get('extraction_time', 0) for s in sample_results if s.get('extraction_success')]
if extraction_times:
avg_time = sum(extraction_times) / len(extraction_times)
- print(f" 平均提取时间: {avg_time:.3f}秒")
- print(f" 处理速度: {1/avg_time:.1f}样本/秒")
-
+ print(f" Average extraction time: {avg_time:.3f}s")
+ print(f" Processing speed: {1/avg_time:.1f} samples/s")
+
success_count = len([s for s in sample_results if s.get('extraction_success', False)])
- print(f" 成功样本数: {success_count}/{len(dataset)}")
-
- # 7. 保存结果
- print(f"\n6. 💾 保存评测结果...")
-
+ print(f" Successful samples: {success_count}/{len(dataset)}")
+
+ # 7. Save results
+ print(f"\n6. 💾 Saving evaluation results...")
+
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
- # 新增:保存带抽取结果的增强数据集(JSONL格式)
+ # Save enhanced dataset with extraction results (JSONL format)
jsonl_dataset_path = results_dir / f"{extractor.name}_preprocessed_html_dataset_with_results.jsonl"
save_results(jsonl_dataset_path, result.sample_results)
- print(f"✅ 结果已保存到: {jsonl_dataset_path}")
-
-
- print(f"✅ 带抽取结果的JSONL数据集已保存到: {jsonl_dataset_path}")
+ print(f"✅ Results saved to: {jsonl_dataset_path}")
+
+
+ print(f"✅ JSONL dataset with extraction results saved to: {jsonl_dataset_path}")
results_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_results.json"
report_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_report.csv"
-
+
DataSaver.save_evaluation_results(result, results_path)
DataSaver.save_summary_report(result, report_path)
-
- print(f"✅ 详细结果已保存到: {results_path}")
- print(f"✅ CSV报告已保存到: {report_path}")
-
+
+ print(f"✅ Detailed results saved to: {results_path}")
+ print(f"✅ CSV report saved to: {report_path}")
+
if __name__ == "__main__":
import argparse
- parser = argparse.ArgumentParser(description="WebMainBench 基本使用示例")
- parser.add_argument("--model_path", required=True, help="LLM model路径")
+ parser = argparse.ArgumentParser(description="WebMainBench Basic Usage Example")
+ parser.add_argument("--model_path", required=True, help="LLM model path")
args = parser.parse_args()
try:
demo_llm_webkit_with_preprocessed_html_evaluation(args.model_path)
- print("\n✅ 示例运行完成!")
-
+ print("\n✅ Example completed!")
+
except Exception as e:
- print(f"\n❌ 运行出错: {e}")
+ print(f"\n❌ Runtime error: {e}")
import traceback
- traceback.print_exc()
\ No newline at end of file
+ traceback.print_exc()
diff --git a/examples/multi_extractor_compare.py b/examples/multi_extractor_compare.py
index 9b3a56f..6b3390d 100644
--- a/examples/multi_extractor_compare.py
+++ b/examples/multi_extractor_compare.py
@@ -1,56 +1,56 @@
from webmainbench import DataLoader, Evaluator, ExtractorFactory, DataSaver
from pathlib import Path
-# 如需调用LLM修正抽取结果,在 webmainbench/config.py 中配置 LLM api
+# To use LLM to correct extraction results, configure the LLM API in webmainbench/config.py
def all_extractor_comparison():
- """演示多抽取器对比"""
-
- print("\n=== 多抽取器对比演示 ===\n")
-
- # 创建数据集
+ """Demonstrate multi-extractor comparison"""
+
+ print("\n=== Multi-Extractor Comparison Demo ===\n")
+
+ # Create dataset
dataset_path = Path("../data/WebMainBench_llm-webkit_v1_WebMainBench_7887_within_formula.jsonl")
dataset = DataLoader.load_jsonl(dataset_path)
- # 创建webkit抽取器
+ # Create webkit extractor
config = {
- "use_preprocessed_html": True, # 🔑 关键配置:启用预处理HTML模式
- "preprocessed_html_field": "llm_webkit_html" # 指定预处理HTML字段名
+ "use_preprocessed_html": True, # Key config: enable preprocessed HTML mode
+ "preprocessed_html_field": "llm_webkit_html" # Specify the preprocessed HTML field name
}
webkit_extractor = ExtractorFactory.create("llm-webkit", config=config)
- # 创建magic-extractor抽取器
+    # Create the magic-html extractor
magic_extractor = ExtractorFactory.create("magic-html")
- # 创建trafilatura抽取器,抽取成markdown
+ # Create trafilatura extractor, extract to markdown
trafilatura_extractor = ExtractorFactory.create("trafilatura")
- # 创建trafilatura抽取器,抽取成txt
+ # Create trafilatura extractor, extract to txt
trafilatura_txt_extractor = ExtractorFactory.create("trafilatura_txt")
- # 创建resiliparse抽取器
+ # Create resiliparse extractor
resiliparse_extractor = ExtractorFactory.create("resiliparse")
-
- # 运行对比
+
+ # Run comparison
evaluator = Evaluator()
extractors = [webkit_extractor, magic_extractor, trafilatura_extractor,trafilatura_txt_extractor, resiliparse_extractor]
# extractors = [webkit_extractor]
-
+
results = evaluator.compare_extractors(
dataset=dataset,
extractors=extractors
)
-
- # 显示对比结果
- print("对比结果:")
+
+ # Display comparison results
+ print("Comparison results:")
print("-" * 40)
for extractor_name, result in results.items():
overall_score = result.overall_metrics.get('overall', 0)
print(f"{extractor_name}: {overall_score:.4f}")
-
- # 保存多抽取器对比榜单
+
+ # Save multi-extractor comparison leaderboard
all_results = []
for extractor_name, result in results.items():
all_results.append(result.to_dict())
-
+
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
leaderboard_path = results_dir / "leaderboard.csv"
@@ -60,10 +60,10 @@ def all_extractor_comparison():
DataSaver.save_evaluation_results(all_results, evaluation_results_path)
DataSaver.save_dataset_with_extraction(
results=all_results,
- dataset=dataset, # 原始数据集对象
+ dataset=dataset, # Original dataset object
file_path=jsonl_dataset_path
)
- print(f"\n📊 榜单已保存到: {leaderboard_path}")
+    print(f"\n📊 Leaderboard saved to: {leaderboard_path}")
if __name__ == "__main__":
diff --git a/examples/resiliparse_extract_demo.py b/examples/resiliparse_extract_demo.py
index ba33a14..17c941f 100644
--- a/examples/resiliparse_extract_demo.py
+++ b/examples/resiliparse_extract_demo.py
@@ -1,7 +1,7 @@
import time
from webmainbench.extractors import ExtractorFactory
-# 配置 Resiliparse 抽取器
+# Configure Resiliparse extractor
config = {
"main_content": True,
"alt_texts": True,
@@ -14,66 +14,66 @@
}
try:
- # 创建 Resiliparse 抽取器实例
+ # Create Resiliparse extractor instance
extractor = ExtractorFactory.create("resiliparse", config=config)
- print(f"✅ Extractor创建成功: {extractor.description}")
- print(f"📋 版本: {extractor.version}")
- print(f"⚙️ 配置: {extractor.get_config()}\n")
+ print(f"✅ Extractor created successfully: {extractor.description}")
+ print(f"📋 Version: {extractor.version}")
+ print(f"⚙️ Config: {extractor.get_config()}\n")
except Exception as e:
- print(f"❌ Extractor创建失败: {e}")
+ print(f"❌ Failed to create extractor: {e}")
-# 测试 HTML
+# Test HTML
test_html = """
-
Python编程教程
-
这是一个Python基础教程,展示如何定义函数。
+
Python Programming Tutorial
+
This is a basic Python tutorial demonstrating how to define functions.
def greet(name):
- ""问候函数""
+    """Greeting function"""
return f"Hello, {name}!"
-# 使用示例
+# Usage example
result = greet("World")
print(result)