diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..97015be
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,8 @@
+# LLM Configuration
+LLM_BASE_URL=https://api.openai.com/v1
+LLM_API_KEY=your-api-key-here
+LLM_MODEL=gpt-4o
+USE_LLM=True
+
+# Debug
+METRICS_DEBUG=False
diff --git a/.gitignore b/.gitignore
index 27dc5a1..1badaeb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,6 +47,9 @@ coverage.xml
webmainbench.egg-info/*
+# cache files
+webmainbench/.cache/
+
# PyPI packaging
build/
dist/
diff --git a/README.md b/README.md
index ca863b6..56e0bf7 100644
--- a/README.md
+++ b/README.md
@@ -148,31 +148,28 @@ hf_hub_download(
)
```
+### Configure LLM (Optional)
+
+LLM-enhanced content splitting improves formula/table/code extraction accuracy. To enable it, copy `.env.example` to `.env` and fill in your API credentials:
+
+```bash
+cp .env.example .env
+# Edit .env and set LLM_BASE_URL, LLM_API_KEY, LLM_MODEL
+```
+
### Run an Evaluation
```python
from webmainbench import DataLoader, Evaluator, ExtractorFactory
dataset = DataLoader.load_jsonl("data/WebMainBench_545.jsonl")
-extractor = ExtractorFactory.create("trafilatura")
+result = Evaluator().evaluate(dataset, ExtractorFactory.create("trafilatura"))
-evaluator = Evaluator(llm_config={
- "use_llm": True,
- "llm_base_url": "https://api.openai.com/v1",
- "llm_api_key": "sk-xxxxxxxxxxxx",
- "llm_model": "gpt-4o",
-})
-result = evaluator.evaluate(dataset, extractor)
+# result.overall_metrics also contains per-category scores (text_edit, table_TEDS, ...)
print(f"Overall Score: {result.overall_metrics['overall']:.4f}")
```
-If you don't need LLM-enhanced content splitting (for formula/table/code extraction), disable it explicitly:
-
-```python
-evaluator = Evaluator(llm_config={"use_llm": False})
-```
-
### Compare Multiple Extractors
```python
diff --git a/README_zh.md b/README_zh.md
index 3694d3b..19e225c 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -148,31 +148,28 @@ hf_hub_download(
)
```
+### 配置 LLM(可选)
+
+LLM 增强内容拆分可提升公式/表格/代码的抽取精度。如需启用,将 `.env.example` 复制为 `.env` 并填写 API 信息:
+
+```bash
+cp .env.example .env
+# 编辑 .env,设置 LLM_BASE_URL、LLM_API_KEY、LLM_MODEL
+```
+
### 运行评测
```python
from webmainbench import DataLoader, Evaluator, ExtractorFactory
dataset = DataLoader.load_jsonl("data/WebMainBench_545.jsonl")
-extractor = ExtractorFactory.create("trafilatura")
+result = Evaluator().evaluate(dataset, ExtractorFactory.create("trafilatura"))
-evaluator = Evaluator(llm_config={
- "use_llm": True,
- "llm_base_url": "https://api.openai.com/v1",
- "llm_api_key": "sk-xxxxxxxxxxxx",
- "llm_model": "gpt-4o",
-})
-result = evaluator.evaluate(dataset, extractor)
+# result.overall_metrics 中还包含各分项指标(text_edit、table_TEDS 等)
print(f"Overall Score: {result.overall_metrics['overall']:.4f}")
```
-如不需要 LLM 增强内容拆分(用于公式/表格/代码抽取),可显式关闭:
-
-```python
-evaluator = Evaluator(llm_config={"use_llm": False})
-```
-
### 多抽取器对比
```python
diff --git a/examples/basic_usage.py b/examples/basic_usage.py
index 9e5c786..93af060 100755
--- a/examples/basic_usage.py
+++ b/examples/basic_usage.py
@@ -1,12 +1,12 @@
#!/usr/bin/env python3
"""
-WebMainBench 基本使用示例
+WebMainBench Basic Usage Example
"""
import json
from pathlib import Path
-# 导入 WebMainBench 模块
+# Import WebMainBench modules
from webmainbench import (
DataLoader, DataSaver, BenchmarkDataset, DataSample,
ExtractorFactory, Evaluator,
@@ -15,9 +15,9 @@
def create_sample_dataset():
- """创建示例数据集"""
-
- # 创建示例数据 - 包含多种内容类型(代码、公式、表格等)
+ """Create a sample dataset"""
+
+ # Create sample data - includes multiple content types (code, formulas, tables, etc.)
samples = [
{
"track_id": "sample-001-programming-tutorial",
@@ -280,8 +280,8 @@ def quicksort(arr):
}
]
- # 创建数据集
- dataset = BenchmarkDataset(name="sample_dataset", description="示例评测数据集")
+ # Create dataset
+ dataset = BenchmarkDataset(name="sample_dataset", description="Sample evaluation dataset")
for sample_data in samples:
sample = DataSample.from_dict(sample_data)
@@ -291,121 +291,121 @@ def quicksort(arr):
def demo_basic_mock_evaluation():
- """演示基本评测流程"""
-
- print("=== WebMainBench 基本使用示例 ===\n")
-
- # 设置日志
+ """Demonstrate the basic evaluation workflow"""
+
+ print("=== WebMainBench Basic Usage Example ===\n")
+
+ # Set up logging
setup_logging(level="INFO")
-
- # 1. 创建或加载数据集
- print("1. 创建示例数据集...")
+
+ # 1. Create or load dataset
+ print("1. Creating sample dataset...")
dataset = create_sample_dataset()
- print(f"数据集包含 {len(dataset)} 个样本")
- print(f"数据集统计: {dataset.get_statistics()}\n")
-
- # 2. 保存数据集到文件
+ print(f"Dataset contains {len(dataset)} samples")
+ print(f"Dataset statistics: {dataset.get_statistics()}\n")
+
+ # 2. Save dataset to file
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
-
+
dataset_path = data_dir / "sample_dataset.jsonl"
DataSaver.save_jsonl(dataset, dataset_path, include_results=False)
- print(f"数据集已保存到: {dataset_path}\n")
-
- # 3. 重新加载数据集
- print("2. 重新加载数据集...")
+ print(f"Dataset saved to: {dataset_path}\n")
+
+ # 3. Reload dataset
+ print("2. Reloading dataset...")
loaded_dataset = DataLoader.load_jsonl(dataset_path)
- print(f"加载的数据集包含 {len(loaded_dataset)} 个样本\n")
-
- # 4. 列出可用的抽取器
- print("3. 可用的抽取器:")
+ print(f"Loaded dataset contains {len(loaded_dataset)} samples\n")
+
+ # 4. List available extractors
+ print("3. Available extractors:")
available_extractors = ExtractorFactory.list_available()
for extractor_name in available_extractors:
print(f" - {extractor_name}")
print()
-
- # 5. 创建评测器
- print("4. 创建评测器...")
+
+ # 5. Create evaluator
+ print("4. Creating evaluator...")
evaluator = Evaluator()
- print(f"可用的评测指标: {evaluator.metric_calculator.list_available_metrics()}\n")
-
- # 6. 创建一个模拟抽取器进行演示
- print("5. 创建模拟抽取器...")
-
+ print(f"Available evaluation metrics: {evaluator.metric_calculator.list_available_metrics()}\n")
+
+ # 6. Create a mock extractor for demonstration
+ print("5. Creating mock extractor...")
+
from webmainbench.extractors import BaseExtractor, ExtractionResult
-
+
class MockExtractor(BaseExtractor):
- """模拟抽取器,用于演示"""
-
+ """Mock extractor for demonstration"""
+
def _setup(self):
pass
-
+
def _extract_content(self, html, url=None):
- # 简单的模拟抽取逻辑
- if "标题" in html:
- content = "# 提取的标题\n\n提取的正文内容。"
+ # Simple mock extraction logic
+ if "heading" in html.lower() or "title" in html.lower():
+ content = "# Extracted Title\n\nExtracted body content."
content_list = [
- {"type": "heading", "content": "提取的标题", "level": 1},
- {"type": "paragraph", "content": "提取的正文内容。"}
+ {"type": "heading", "content": "Extracted Title", "level": 1},
+ {"type": "paragraph", "content": "Extracted body content."}
]
else:
- content = "提取的内容"
- content_list = [{"type": "paragraph", "content": "提取的内容"}]
-
+ content = "Extracted content"
+ content_list = [{"type": "paragraph", "content": "Extracted content"}]
+
return ExtractionResult(
content=content,
content_list=content_list,
success=True,
confidence_score=0.85
)
-
- # 注册模拟抽取器
+
+ # Register mock extractor
ExtractorFactory.register("mock", MockExtractor)
mock_extractor = ExtractorFactory.create("mock")
- print("模拟抽取器已创建\n")
-
- # 7. 运行评测
- print("6. 运行评测...")
+ print("Mock extractor created\n")
+
+ # 7. Run evaluation
+ print("6. Running evaluation...")
result = evaluator.evaluate(
dataset=loaded_dataset,
extractor=mock_extractor,
- max_samples=2 # 限制样本数量用于演示
+ max_samples=2 # Limit sample count for demonstration
)
-
- # 8. 显示结果
- print("\n7. 评测结果:")
+
+ # 8. Display results
+ print("\n7. Evaluation results:")
print("=" * 50)
formatted_results = format_results(result.to_dict())
print(formatted_results)
-
- # 9. 保存结果
+
+ # 9. Save results
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
-
+
results_path = results_dir / "mock_evaluation_results.json"
DataSaver.save_evaluation_results(result, results_path)
- print(f"\n结果已保存到: {results_path}")
-
- # 10. 生成报告
+ print(f"\nResults saved to: {results_path}")
+
+ # 10. Generate report
report_path = results_dir / "mock_evaluation_report.csv"
DataSaver.save_summary_report(result, report_path)
- print(f"报告已保存到: {report_path}")
+ print(f"Report saved to: {report_path}")
def demo_llm_webkit_evaluation():
- """演示LLM-WebKit抽取器的6项指标评测"""
-
- print("=== LLM-WebKit Extractor 6项指标评测示例 ===\n")
-
- # 设置日志
+ """Demonstrate 6-metric evaluation with LLM-WebKit extractor"""
+
+ print("=== LLM-WebKit Extractor 6-Metric Evaluation Example ===\n")
+
+ # Set up logging
setup_logging(level="INFO")
-
- # 1. 创建包含各种内容类型的测试数据集
- print("1. 创建包含多种内容类型的测试数据集...")
-
+
+ # 1. Create test dataset with various content types
+ print("1. Creating test dataset with multiple content types...")
+
samples = []
-
- # 样本1: 包含文本和代码
+
+ # Sample 1: text and code
samples.append(DataSample(
id="text_code_sample",
html="""
@@ -440,7 +440,7 @@ def hello_world():
{"type": "text", "content": "以上代码展示了一个简单的Python函数。"}
]
))
-
+
# 样本2: 包含表格
samples.append(DataSample(
id="table_sample",
@@ -483,7 +483,7 @@ def hello_world():
{"type": "table", "content": "| 产品 | 销量 | 收入 |\n|------|------|------|\n| 产品A | 100 | 1000 |\n| 产品B | 200 | 3000 |"}
]
))
-
+
# 样本3: 包含公式
samples.append(DataSample(
id="formula_sample",
@@ -511,238 +511,237 @@ def hello_world():
{"type": "formula", "content": "\\int_{-\\infty}^{\\infty} e^{-x^2} dx = \\sqrt{\\pi}"}
]
))
-
- # 创建数据集并添加样本
- dataset = BenchmarkDataset(name="llm_webkit_test", description="LLM-WebKit 6项指标测试数据集")
+
+ # Create dataset and add samples
+ dataset = BenchmarkDataset(name="llm_webkit_test", description="LLM-WebKit 6-metric test dataset")
for sample in samples:
dataset.add_sample(sample)
-
- print(f"测试数据集包含 {len(dataset)} 个样本")
- print(f"样本类型: 文本+代码, 表格, 公式\n")
-
- # 2. 创建LLM-WebKit抽取器
- print("2. 创建LLM-WebKit抽取器...")
-
- # 显示所有可用的抽取器
+
+ print(f"Test dataset contains {len(dataset)} samples")
+    print("Sample types: text+code, table, formula\n")
+
+ # 2. Create LLM-WebKit extractor
+ print("2. Creating LLM-WebKit extractor...")
+
+ # Show all available extractors
available_extractors = ExtractorFactory.list_available()
- print(f"可用的抽取器: {available_extractors}")
-
- # 直接创建LLM-WebKit抽取器,设置模型路径
+ print(f"Available extractors: {available_extractors}")
+
+ # Create LLM-WebKit extractor directly with model path
config = {
"model_path": "/Users/chupei/model/checkpoint-3296"
}
extractor = ExtractorFactory.create("llm-webkit", config=config)
- print(f"✅ LLM-WebKit抽取器创建成功,模型路径: {config['model_path']}")
-
+ print(f"LLM-WebKit extractor created successfully, model path: {config['model_path']}")
+
print()
-
- # 3. 创建评测器并显示所有可用指标
- print("3. 创建评测器...")
+
+ # 3. Create evaluator and show all available metrics
+ print("3. Creating evaluator...")
evaluator = Evaluator()
available_metrics = evaluator.metric_calculator.list_available_metrics()
- print(f"✅ 可用的评测指标 ({len(available_metrics)}项):")
-
- # 按照6项指标分类显示
+ print(f"Available evaluation metrics ({len(available_metrics)} total):")
+
+ # Display by the 6 metric categories
target_metrics = ["overall", "text_edit", "code_edit", "table_edit", "table_TEDS", "formula_edit"]
-
+
for metric in target_metrics:
if metric in available_metrics:
- print(f" ✅ {metric}")
+ print(f" {metric}")
else:
- print(f" ❌ {metric} (未注册)")
-
+ print(f" {metric} (not registered)")
+
print()
-
- # 4. 运行评测
- print("4. 开始评测...")
+
+ # 4. Run evaluation
+ print("4. Starting evaluation...")
print("=" * 60)
-
+
result = evaluator.evaluate(
dataset=dataset,
extractor=extractor,
- max_samples=None # 评测所有样本
+ max_samples=None # Evaluate all samples
)
-
- # 5. 显示详细的6项指标结果
- print("\n5. 📊 6项指标详细评测结果:")
+
+ # 5. Display detailed 6-metric results
+ print("\n5. 6-metric detailed evaluation results:")
print("=" * 60)
-
+
results_dict = result.to_dict()
-
- # 从overall_metrics中提取指标结果
+
+ # Extract metric results from overall_metrics
metrics = results_dict.get('overall_metrics', {})
-
- # 按照指标分类显示
- print(f"\n🏆 综合指标:")
+
+ # Display by metric category
+    print("\nOverall metrics:")
if 'overall' in metrics:
- print(f" overall (综合得分): {metrics['overall']:.4f}")
+ print(f" overall (combined score): {metrics['overall']:.4f}")
else:
- print(" overall: 未计算")
-
- print(f"\n📝 文本相关指标:")
+ print(" overall: not calculated")
+
+    print("\nText-related metrics:")
if 'text_edit' in metrics:
- print(f" text_edit (文本编辑距离): {metrics['text_edit']:.4f}")
+ print(f" text_edit (text edit distance): {metrics['text_edit']:.4f}")
else:
- print(" text_edit: 未计算")
+ print(" text_edit: not calculated")
if 'code_edit' in metrics:
- print(f" code_edit (代码编辑距离): {metrics['code_edit']:.4f}")
+ print(f" code_edit (code edit distance): {metrics['code_edit']:.4f}")
else:
- print(" code_edit: 未计算")
-
- print(f"\n📊 表格相关指标:")
+ print(" code_edit: not calculated")
+
+    print("\nTable-related metrics:")
if 'table_edit' in metrics:
- print(f" table_edit (表格编辑距离): {metrics['table_edit']:.4f}")
+ print(f" table_edit (table edit distance): {metrics['table_edit']:.4f}")
else:
- print(" table_edit: 未计算")
+ print(" table_edit: not calculated")
if 'table_TEDS' in metrics:
- print(f" table_TEDS (表格结构相似度): {metrics['table_TEDS']:.4f}")
+ print(f" table_TEDS (table structure similarity): {metrics['table_TEDS']:.4f}")
else:
- print(" table_TEDS: 未计算")
-
- print(f"\n🧮 公式相关指标:")
+ print(" table_TEDS: not calculated")
+
+    print("\nFormula-related metrics:")
if 'formula_edit' in metrics:
- print(f" formula_edit (公式编辑距离): {metrics['formula_edit']:.4f}")
+ print(f" formula_edit (formula edit distance): {metrics['formula_edit']:.4f}")
else:
- print(" formula_edit: 未计算")
-
- print(f"\n📈 详细统计:")
- print(f" 总样本数: {len(dataset)}")
+ print(" formula_edit: not calculated")
+
+    print("\nDetailed statistics:")
+ print(f" Total samples: {len(dataset)}")
success_count = len([s for s in results_dict.get('sample_results', []) if s.get('extraction_success', False)])
failure_count = len(dataset) - success_count
- print(f" 成功样本数: {success_count}")
- print(f" 失败样本数: {failure_count}")
-
- # 6. 保存结果到文件
+ print(f" Successful samples: {success_count}")
+ print(f" Failed samples: {failure_count}")
+
+ # 6. Save results to file
print("\n" + "=" * 60)
- print("6. 保存评测结果...")
-
+ print("6. Saving evaluation results...")
+
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
-
- # 保存详细结果
+
+ # Save detailed results
results_path = results_dir / "llm_webkit_evaluation_results.json"
- DataSaver.save_evaluation_results(result, results_path) # 直接传递result对象
- print(f"✅ 详细结果已保存到: {results_path}")
-
- # 生成CSV报告
+ DataSaver.save_evaluation_results(result, results_path) # Pass result object directly
+ print(f"Detailed results saved to: {results_path}")
+
+ # Generate CSV report
report_path = results_dir / "llm_webkit_evaluation_report.csv"
- DataSaver.save_summary_report(result, report_path) # 直接传递result对象
- print(f"✅ CSV报告已保存到: {report_path}")
-
+ DataSaver.save_summary_report(result, report_path) # Pass result object directly
+ print(f"CSV report saved to: {report_path}")
+
print("\n" + "=" * 60)
- print("✅ LLM-WebKit 6项指标评测完成!")
+ print("LLM-WebKit 6-metric evaluation complete!")
def demo_dataset_with_extraction():
- """演示保存带有抽取内容的数据集"""
- print("=== 演示:保存带有抽取内容的数据集 ===")
-
+ """Demonstrate saving a dataset with extracted content"""
+ print("=== Demo: Saving a Dataset with Extracted Content ===")
+
from webmainbench import DataLoader, DataSaver, Evaluator, ExtractorFactory
from pathlib import Path
-
- # 配置文件路径
+
+ # Configure file paths
data_dir = Path("data")
dataset_path = data_dir / "sample_dataset.jsonl"
# dataset_path = "/Users/chupei/Downloads/WebMainBench_dataset_merge_2549.jsonl"
-
- print(f"📂 数据集文件: {dataset_path}")
-
- # 🔧 创建llm-webkit抽取器(统一使用)
+
+ print(f"Dataset file: {dataset_path}")
+
+ # Create llm-webkit extractor (used uniformly)
extractor_config = {"model_path": "/Users/chupei/model/checkpoint-3296"}
extractor = ExtractorFactory.create("llm-webkit", config=extractor_config)
- print(f"🤖 使用抽取器: {extractor.name}")
-
- # 创建评测器
+ print(f"Using extractor: {extractor.name}")
+
+ # Create evaluator
evaluator = Evaluator()
-
- # 🔧 选择评测模式:内存模式 vs 批处理模式
- USE_BATCHED_MODE = True # 设置为True使用批处理模式(适用于大数据集)
-
+
+ # Choose evaluation mode: in-memory mode vs batched mode
+ USE_BATCHED_MODE = True # Set to True to use batched mode (suitable for large datasets)
+
if USE_BATCHED_MODE:
- print("🔄 使用批处理模式(内存优化)")
-
- # 🚀 批处理评测(适用于大数据集)
+ print("Using batched mode (memory-optimized)")
+
+ # Batched evaluation (suitable for large datasets)
result = evaluator.evaluate_batched(
jsonl_file_path=dataset_path,
- extractor=extractor, # 直接传递extractor对象
- batch_size=10, # 小批次
- max_samples=20 # 演示用
+ extractor=extractor, # Pass extractor object directly
+ batch_size=10, # Small batch size
+ max_samples=20 # For demonstration
)
- print(f"✅ 批处理评测完成,总体得分: {result.overall_metrics.get('overall', 0):.4f}")
-
- # 为了保存带有抽取内容的数据集,需要重新加载原始数据集
- # 注:这里只是短暂加载用于保存,不影响前面的内存优化评测
+ print(f"Batched evaluation complete, overall score: {result.overall_metrics.get('overall', 0):.4f}")
+
+ # To save the dataset with extraction content, reload the original dataset temporarily
+ # Note: this is only a brief load for saving and does not affect the memory-optimized evaluation above
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
dataset.name = result.dataset_name
-
+
else:
- print("🔄 使用传统内存模式")
-
- # 从文件加载数据集
- print(f"📂 从文件加载数据集: {dataset_path}")
+ print("Using traditional in-memory mode")
+
+ # Load dataset from file
+ print(f"Loading dataset from file: {dataset_path}")
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
dataset.name = "WebMainBench_with_extraction"
- dataset.description = "演示抽取内容保存的测试数据集"
-
- print(f"📊 加载数据集完成,包含 {len(dataset.samples)} 个样本")
-
- # 运行评测
+ dataset.description = "Test dataset demonstrating extraction content saving"
+
+ print(f"Dataset loaded, contains {len(dataset.samples)} samples")
+
+ # Run evaluation
result = evaluator.evaluate(dataset, extractor)
-
- print(f"✅ 评测完成,总体得分: {result.overall_metrics.get('overall', 0):.4f}")
-
- # 保存带有抽取内容的数据集
+
+ print(f"Evaluation complete, overall score: {result.overall_metrics.get('overall', 0):.4f}")
+
+ # Save dataset with extracted content
results_dir = Path("results")
enriched_dataset_path = results_dir / f"{dataset.name}_with_{extractor.name}_extraction.jsonl"
-
+
DataSaver.save_dataset_with_extraction(
results=result,
- dataset=dataset,
+ dataset=dataset,
file_path=enriched_dataset_path,
extractor_name=extractor.name
)
-
- print(f"💾 已保存带有抽取内容的数据集到: {enriched_dataset_path}")
-
- # 保存评测结果和摘要报告
+
+ print(f"Dataset with extracted content saved to: {enriched_dataset_path}")
+
+ # Save evaluation results and summary report
evaluation_results_path = results_dir / f"{dataset.name}_{extractor.name}_evaluation_results.json"
summary_report_path = results_dir / f"{dataset.name}_{extractor.name}_evaluation_report.csv"
-
+
DataSaver.save_evaluation_results(result, evaluation_results_path)
DataSaver.save_summary_report(result, summary_report_path)
-
- print(f"📊 已保存评测结果到: {evaluation_results_path}")
- print(f"📈 已保存摘要报告到: {summary_report_path}")
-
- # 显示保存的字段信息
- print("\n📋 保存的新字段包括:")
- print(f" - {extractor.name}_content: 抽取的内容")
- print(f" - {extractor.name}_content_list: 抽取的结构化内容列表")
- print(f" - {extractor.name}_success: 抽取是否成功")
- print(f" - {extractor.name}_time: 抽取耗时")
- print(f" - {extractor.name}_*_score: 各项指标分数")
+
+ print(f"Evaluation results saved to: {evaluation_results_path}")
+ print(f"Summary report saved to: {summary_report_path}")
+
+ # Display saved field info
+ print("\nNewly saved fields include:")
+ print(f" - {extractor.name}_content: extracted content")
+ print(f" - {extractor.name}_content_list: extracted structured content list")
+ print(f" - {extractor.name}_success: whether extraction succeeded")
+ print(f" - {extractor.name}_time: extraction time")
+ print(f" - {extractor.name}_*_score: metric scores")
def demo_multi_extraction():
- """演示保存带有多个抽取器抽取内容的数据集(支持批处理模式)"""
- print("=== 演示:保存带有多个抽取器抽取内容的数据集 ===")
+ """Demonstrate saving a dataset with content from multiple extractors (supports batched mode)"""
+ print("=== Demo: Saving a Dataset with Multiple Extractor Results ===")
from webmainbench import DataLoader, DataSaver, Evaluator, ExtractorFactory
from pathlib import Path
import time
-
- # 设置日志
+ # Set up logging
setup_logging(level="INFO")
- # 配置文件路径
+ # Configure file paths
data_dir = Path("../data")
# dataset_path = data_dir / "sample_dataset.jsonl"
dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_1904_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
- print(f"📂 数据集文件: {dataset_path}")
+ print(f"Dataset file: {dataset_path}")
- # 🔧 定义要使用的抽取器列表及配置
+ # Define list of extractors and their configurations
extractors_info = [
{"name": "resiliparse", "config": {
"main_content": True,
@@ -755,68 +754,68 @@ def demo_multi_extraction():
{"name": "magic-html", "config": {}},
]
- # 🔧 选择评测模式:内存模式 vs 批处理模式
- USE_BATCHED_MODE = True # 大数据集建议设为True
- BATCH_SIZE = 10 # 批处理大小
- MAX_SAMPLES = None # 演示用(全量评测可设为None)
+ # Choose evaluation mode: in-memory mode vs batched mode
+ USE_BATCHED_MODE = True # Recommended True for large datasets
+ BATCH_SIZE = 10 # Batch size
+ MAX_SAMPLES = None # For demonstration (set None for full evaluation)
- # 创建结果目录
+ # Create results directory
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
- # 存储所有抽取器的评测结果和性能数据
+ # Store evaluation results and performance data for all extractors
all_results = []
extractor_performance = []
- # 为每个抽取器运行评测
+ # Run evaluation for each extractor
for info in extractors_info:
extractor_name = info["name"]
config = info["config"]
try:
- # 创建抽取器实例
+ # Create extractor instance
extractor = ExtractorFactory.create(extractor_name, config=config)
- print(f"\n🤖 使用抽取器: {extractor.name}")
+ print(f"\nUsing extractor: {extractor.name}")
except Exception as e:
- print(f"⚠️ {extractor_name} 抽取器创建失败: {e}")
+ print(f"Failed to create extractor {extractor_name}: {e}")
continue
- # 记录总耗时
+ # Record total elapsed time
start_time = time.time()
- # 初始化评测器
+ # Initialize evaluator
evaluator = Evaluator()
- # 选择批处理模式或传统模式
+ # Choose batched or traditional mode
if USE_BATCHED_MODE:
- print(f"🔄 使用批处理模式(批大小: {BATCH_SIZE},最大样本: {MAX_SAMPLES or '全部'})")
- # 批处理评测(内存优化)
+ print(f"Using batched mode (batch size: {BATCH_SIZE}, max samples: {MAX_SAMPLES or 'all'})")
+ # Batched evaluation (memory-optimized)
result = evaluator.evaluate_batched(
jsonl_file_path=dataset_path,
extractor=extractor,
batch_size=BATCH_SIZE,
max_samples=MAX_SAMPLES
)
- # 为保存数据集,临时加载原始数据(不影响内存优化)
+ # Temporarily load original data for saving (does not affect memory-optimized evaluation)
dataset = DataLoader.load_jsonl(dataset_path, include_results=False, max_samples=MAX_SAMPLES)
dataset.name = result.dataset_name
else:
- print("🔄 使用传统内存模式")
- # 加载完整数据集到内存
+ print("Using traditional in-memory mode")
+ # Load full dataset into memory
dataset = DataLoader.load_jsonl(dataset_path, include_results=False, max_samples=MAX_SAMPLES)
dataset.name = "WebMainBench_with_multi_extraction"
- dataset.description = "多抽取器内容保存演示数据集"
- print(f"📊 加载数据集完成,包含 {len(dataset.samples)} 个样本")
+ dataset.description = "Multi-extractor content saving demo dataset"
+ print(f"Dataset loaded, contains {len(dataset.samples)} samples")
- # 传统模式评测
+ # Traditional mode evaluation
result = evaluator.evaluate(dataset, extractor)
- # 计算耗时指标
+ # Calculate elapsed time metrics
total_time = time.time() - start_time
total_samples = len(dataset.samples)
avg_time_per_sample = total_time / total_samples if total_samples else 0
- # 保存性能数据
+ # Save performance data
extractor_performance.append({
"name": extractor_name,
"total_samples": total_samples,
@@ -824,19 +823,19 @@ def demo_multi_extraction():
"avg_time_per_sample": avg_time_per_sample
})
- # 输出评测结果
- print(f"⏱️ 总耗时: {total_time:.4f}秒(单样本平均: {avg_time_per_sample:.4f}秒)")
- print(f"📊 核心指标:")
+ # Output evaluation results
+ print(f"Total time: {total_time:.4f}s (avg per sample: {avg_time_per_sample:.4f}s)")
+    print("Core metrics:")
print(f" code_edit: {result.overall_metrics.get('code_edit', 0):.4f}")
print(f" formula_edit: {result.overall_metrics.get('formula_edit', 0):.4f}")
print(f" table_TEDS: {result.overall_metrics.get('table_TEDS', 0):.4f}")
print(f" table_edit: {result.overall_metrics.get('table_edit', 0):.4f}")
print(f" text_edit: {result.overall_metrics.get('text_edit', 0):.4f}")
- print(f"✅ 总体得分: {result.overall_metrics.get('overall', 0):.4f}")
+ print(f"Overall score: {result.overall_metrics.get('overall', 0):.4f}")
all_results.append(result)
- # 保存带有当前抽取器内容的数据集
+ # Save dataset with current extractor's content
enriched_dataset_path = results_dir / f"{dataset.name}_{extractor.name}_extraction_infer.jsonl"
DataSaver.save_dataset_with_extraction(
results=result,
@@ -844,153 +843,153 @@ def demo_multi_extraction():
file_path=enriched_dataset_path,
extractor_name=extractor.name
)
- print(f"💾 已保存抽取内容到: {enriched_dataset_path}")
+ print(f"Extracted content saved to: {enriched_dataset_path}")
- # 保存单个抽取器的评测结果
+ # Save individual extractor evaluation results
eval_results_path = results_dir / f"{dataset.name}_{extractor.name}_evaluation_results.json"
DataSaver.save_evaluation_results(result, eval_results_path)
- print(f"📋 已保存评测结果到: {eval_results_path}")
+ print(f"Evaluation results saved to: {eval_results_path}")
- # 保存所有抽取器的汇总报告
+ # Save summary report for all extractors
if all_results:
summary_path = results_dir / f"{dataset.name}_multi_extractors_summary_report.csv"
DataSaver.save_summary_report(all_results, summary_path)
- print(f"\n📈 已保存汇总报告到: {summary_path}")
+ print(f"\nSummary report saved to: {summary_path}")
- # 展示性能对比
+ # Display performance comparison
if extractor_performance:
- print("\n⚡ 抽取器性能对比:")
+ print("\nExtractor performance comparison:")
for perf in extractor_performance:
print(f" {perf['name']}:")
- print(f" 样本数: {perf['total_samples']}")
- print(f" 总耗时: {perf['total_time']:.4f}秒")
- print(f" 单样本耗时: {perf['avg_time_per_sample']:.4f}秒")
- print(f" 效率: {1 / perf['avg_time_per_sample']:.2f}样本/秒")
+ print(f" Samples: {perf['total_samples']}")
+ print(f" Total time: {perf['total_time']:.4f}s")
+ print(f" Time per sample: {perf['avg_time_per_sample']:.4f}s")
+ print(f" Throughput: {1 / perf['avg_time_per_sample']:.2f} samples/s")
- # 展示保存的字段信息
- print("\n📋 保存的新字段说明:")
+ # Display saved field information
+ print("\nSaved new field descriptions:")
for info in extractors_info:
name = info["name"]
- print(f" {name}相关字段:")
- print(f" - {name}_content: 抽取的原始内容")
- print(f" - {name}_content_list: 结构化内容列表(含type字段)")
- print(f" - {name}_success: 抽取是否成功(布尔值)")
- print(f" - {name}_time: 单样本抽取耗时(秒)")
- print(f" - {name}_*_score: 各指标得分(如{name}_text_edit)")
+ print(f" {name} related fields:")
+ print(f" - {name}_content: extracted raw content")
+ print(f" - {name}_content_list: structured content list (with type field)")
+ print(f" - {name}_success: whether extraction succeeded (boolean)")
+ print(f" - {name}_time: per-sample extraction time (seconds)")
+ print(f" - {name}_*_score: metric scores (e.g. {name}_text_edit)")
def demo_llm_webkit_with_preprocessed_html_evaluation():
- """演示LLM-WebKit预处理HTML功能的评测"""
-
- print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n")
-
- # 设置日志
+ """Demonstrate evaluation of LLM-WebKit preprocessed HTML feature"""
+
+ print("\n=== LLM-WebKit Preprocessed HTML Feature Demo ===\n")
+
+ # Set up logging
setup_logging(level="INFO")
-
- # 1. 从真实数据集加载包含预处理HTML的数据
- print("1. 从真实数据集加载预处理HTML数据...")
+
+ # 1. Load preprocessed HTML data from the real dataset
+ print("1. Loading preprocessed HTML data from the real dataset...")
dataset_path = Path("data/track_id_diff_result_56.jsonl")
- print(f"📂 数据集文件: {dataset_path}")
-
- # 加载数据集
+ print(f"Dataset file: {dataset_path}")
+
+ # Load dataset
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
dataset.name = "real_preprocessed_html_test"
- dataset.description = "基于真实数据的预处理HTML功能测试"
-
- print(f"✅ 真实数据集加载成功,包含 {len(dataset)} 个样本")
- print("📋 真实数据样本包含:")
- print(" - html: 原始网页HTML")
- print(" - llm_webkit_html: LLM预处理后的简化HTML(包含_item_id标记)")
- print(" - groundtruth_content: 人工标注的标准答案")
- print(" - llm_webkit_md: LLM提取的markdown内容")
-
-
- # 2. 创建预处理HTML模式的LLM-WebKit抽取器
- print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")
-
+ dataset.description = "Preprocessed HTML feature test based on real data"
+
+ print(f"Real dataset loaded successfully, contains {len(dataset)} samples")
+ print("Real data samples include:")
+ print(" - html: raw web page HTML")
+ print(" - llm_webkit_html: LLM-preprocessed simplified HTML (with _item_id markers)")
+ print(" - groundtruth_content: manually annotated ground truth")
+ print(" - llm_webkit_md: LLM-extracted markdown content")
+
+
+ # 2. Create LLM-WebKit extractor in preprocessed HTML mode
+ print("2. Creating LLM-WebKit extractor in preprocessed HTML mode...")
+
config = {
- "use_preprocessed_html": True, # 🔑 关键配置:启用预处理HTML模式
- "preprocessed_html_field": "llm_webkit_html" # 指定预处理HTML字段名
+ "use_preprocessed_html": True, # Key config: enable preprocessed HTML mode
+ "preprocessed_html_field": "llm_webkit_html" # Specify preprocessed HTML field name
}
-
+
extractor = ExtractorFactory.create("llm-webkit", config=config)
-
- # 4. 运行评测
- print("4. 开始评测...")
+
+ # 4. Run evaluation
+ print("4. Starting evaluation...")
print("=" * 50)
-
+
evaluator = Evaluator()
result = evaluator.evaluate(
dataset=dataset,
extractor=extractor,
max_samples=None
)
-
- # 5. 显示评测结果
- print("\n5. 📊 预处理HTML模式评测结果:")
+
+ # 5. Display evaluation results
+ print("\n5. Preprocessed HTML mode evaluation results:")
print("=" * 50)
-
+
results_dict = result.to_dict()
metrics = results_dict.get('overall_metrics', {})
-
- # 显示关键指标
- print(f"\n🏆 综合指标:")
+
+ # Display key metrics
+    print("\nOverall metrics:")
print(f" overall: {metrics.get('overall', 0):.4f}")
-
- print(f"\n📝 内容提取质量:")
+
+    print("\nContent extraction quality:")
print(f" text_edit: {metrics.get('text_edit', 0):.4f}")
print(f" code_edit: {metrics.get('code_edit', 0):.4f}")
print(f" table_edit: {metrics.get('table_edit', 0):.4f}")
print(f" table_TEDS: {metrics.get('table_TEDS', 0):.4f}")
-
- print(f"\n⚡ 性能统计:")
+
+    print("\nPerformance statistics:")
sample_results = results_dict.get('sample_results', [])
if sample_results:
extraction_times = [s.get('extraction_time', 0) for s in sample_results if s.get('extraction_success')]
if extraction_times:
avg_time = sum(extraction_times) / len(extraction_times)
- print(f" 平均提取时间: {avg_time:.3f}秒")
- print(f" 处理速度: {1/avg_time:.1f}样本/秒")
-
+ print(f" Average extraction time: {avg_time:.3f}s")
+ print(f" Processing speed: {1/avg_time:.1f} samples/s")
+
success_count = len([s for s in sample_results if s.get('extraction_success', False)])
- print(f" 成功样本数: {success_count}/{len(dataset)}")
-
- # 7. 保存结果
- print(f"\n7. 💾 保存评测结果...")
-
+ print(f" Successful samples: {success_count}/{len(dataset)}")
+
+ # 7. Save results
+    print("\n7. Saving evaluation results...")
+
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
- # 新增:保存带抽取结果的增强数据集(JSONL格式)
+ # Save enhanced dataset with extraction results (JSONL format)
jsonl_dataset_path = results_dir / f"{extractor.name}_preprocessed_html_dataset_with_results.jsonl"
DataSaver.save_dataset_with_extraction(
results=result,
- dataset=dataset, # 原始数据集对象
+ dataset=dataset, # Original dataset object
file_path=jsonl_dataset_path,
- extractor_name="llm-webkit" # 抽取器名称前缀
+ extractor_name="llm-webkit" # Extractor name prefix
)
- print(f"✅ 带抽取结果的JSONL数据集已保存到: {jsonl_dataset_path}")
+ print(f"JSONL dataset with extraction results saved to: {jsonl_dataset_path}")
results_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_results.json"
report_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_report.csv"
-
+
DataSaver.save_evaluation_results(result, results_path)
DataSaver.save_summary_report(result, report_path)
-
- print(f"✅ 详细结果已保存到: {results_path}")
- print(f"✅ CSV报告已保存到: {report_path}")
+
+ print(f"Detailed results saved to: {results_path}")
+ print(f"CSV report saved to: {report_path}")
if __name__ == "__main__":
try:
# demo_basic_mock_evaluation()
- # demo_llm_webkit_evaluation() # 使用LLM-WebKit评测示例
+ # demo_llm_webkit_evaluation() # LLM-WebKit evaluation example
demo_llm_webkit_with_preprocessed_html_evaluation()
# demo_extractor_comparison()
- # demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集
- # demo_multi_extraction() # 演示多个抽取器同时评测
- print("\n✅ 示例运行完成!")
-
+ # demo_dataset_with_extraction() # Demo saving dataset with extracted content
+ # demo_multi_extraction() # Demo evaluating with multiple extractors simultaneously
+ print("\nExample completed!")
+
except Exception as e:
- print(f"\n❌ 运行出错: {e}")
+ print(f"\nRuntime error: {e}")
import traceback
traceback.print_exc()
\ No newline at end of file
diff --git a/examples/demo.py b/examples/demo.py
index b460ad5..0c016bd 100644
--- a/examples/demo.py
+++ b/examples/demo.py
@@ -1,15 +1,15 @@
from webmainbench import DataLoader, Evaluator, ExtractorFactory
from pathlib import Path
-# 1. 加载评测数据集
+# 1. Load evaluation dataset
dataset = DataLoader.load_jsonl(Path("data/sample_dataset.jsonl"))
-# 2. 创建抽取器
+# 2. Create extractor
extractor = ExtractorFactory.create("llm-webkit")
-# 3. 运行评测
+# 3. Run evaluation
evaluator = Evaluator()
result = evaluator.evaluate(dataset, extractor)
-# 4. 查看结果
+# 4. View results
print(f"Overall Score: {result}")
diff --git a/examples/llm_webkit_usage.py b/examples/llm_webkit_usage.py
index 4300f55..229cde0 100644
--- a/examples/llm_webkit_usage.py
+++ b/examples/llm_webkit_usage.py
@@ -1,8 +1,8 @@
#!/usr/bin/env python3
"""
-LLM-WebKit Extractor使用示例
+LLM-WebKit Extractor Usage Example
-本示例展示如何使用集成了VLLM推理能力的LLM-WebKit extractor。
+This example demonstrates how to use the LLM-WebKit extractor integrated with VLLM inference capabilities.
"""
import time
@@ -10,138 +10,138 @@
def main():
- print("🚀 LLM-WebKit Extractor 使用示例\n")
-
- # 1. 创建带有自定义配置的extractor
+ print("LLM-WebKit Extractor Usage Example\n")
+
+ # 1. Create extractor with custom configuration
config = {
- "model_path": "/Users/chupei/model/checkpoint-3296", # 替换为您的模型路径
- "use_logits_processor": True, # 启用JSON格式约束
- "temperature": 0.0, # 确定性输出
- "max_item_count": 500, # 处理的最大item数量
- "max_output_tokens": 4096, # 最大输出token数
- "dtype": "bfloat16", # 模型精度
- "tensor_parallel_size": 1 # 张量并行大小
+ "model_path": "/Users/chupei/model/checkpoint-3296", # Replace with your model path
+ "use_logits_processor": True, # Enable JSON format constraint
+ "temperature": 0.0, # Deterministic output
+ "max_item_count": 500, # Maximum number of items to process
+ "max_output_tokens": 4096, # Maximum output tokens
+ "dtype": "bfloat16", # Model precision
+ "tensor_parallel_size": 1 # Tensor parallel size
}
-
+
try:
extractor = ExtractorFactory.create("llm-webkit", config=config)
- print(f"✅ Extractor创建成功: {extractor.description}")
- print(f"📋 版本: {extractor.version}")
- print(f"⚙️ 配置: {extractor.inference_config.__dict__}\n")
-
+ print(f"Extractor created successfully: {extractor.description}")
+ print(f"Version: {extractor.version}")
+ print(f"Config: {extractor.inference_config.__dict__}\n")
+
except Exception as e:
- print(f"❌ Extractor创建失败: {e}")
- print("💡 请确保已安装所需依赖:")
+ print(f"Extractor creation failed: {e}")
+ print("Please ensure the required dependencies are installed:")
print(" pip install vllm transformers torch llm_web_kit")
return
-
- # 2. 准备测试HTML(包含_item_id属性的结构化HTML)
+
+ # 2. Prepare test HTML (structured HTML with _item_id attributes)
test_html = """
- 测试文章 - 人工智能的发展趋势
+ Test Article - AI Development Trends
-
+
- 人工智能的发展趋势
- 作者:张三 | 发布时间:2024-01-15 | 阅读量:1,234
+ AI Development Trends
+ Author: John Doe | Published: 2024-01-15 | Views: 1,234
-
+
- 人工智能(AI)技术正在快速发展,对各行各业产生深远影响。本文将探讨AI的主要发展趋势和未来展望。
-
- 1. 机器学习的进步
- 深度学习和大语言模型的突破使得AI系统能够理解和生成更自然的语言,在对话、翻译、创作等领域表现出色。
-
- 2. 自动化应用
- 从制造业的机器人到软件开发的代码生成,AI正在各个领域实现流程自动化,提高效率并降低成本。
-
- 3. 个性化服务
- 基于用户数据的个性化推荐和服务正变得越来越精准,为用户提供更好的体验。
+ Artificial Intelligence (AI) technology is rapidly advancing, with far-reaching impacts across all industries. This article explores the major development trends and future prospects of AI.
+
+ 1. Advances in Machine Learning
+ Breakthroughs in deep learning and large language models have enabled AI systems to understand and generate more natural language, excelling in dialogue, translation, and creative tasks.
+
+ 2. Automation Applications
+ From robots in manufacturing to code generation in software development, AI is automating processes across domains, improving efficiency and reducing costs.
+
+ 3. Personalized Services
+ Personalized recommendations and services based on user data are becoming increasingly precise, providing better user experiences.
-
+
- 相关文章
+ Related Articles
-
+
"""
-
- # 3. 执行内容提取
- print("🔍 开始内容提取...")
+
+ # 3. Execute content extraction
+ print("Starting content extraction...")
start_time = time.time()
try:
result = extractor.extract(test_html)
end_time = time.time()
- print(f"⏱️ 提取耗时: {end_time - start_time:.2f}秒\n")
-
- # 4. 显示提取结果
+ print(f"⏱️ Extraction time: {end_time - start_time:.2f}s\n")
+
+ # 4. Display extraction results
if result.success:
- print("✅ 内容提取成功!\n")
-
- print("📄 提取的主要内容:")
+ print("✅ Content extracted successfully!\n")
+
+ print("📄 Extracted main content:")
print("=" * 50)
print(result.content[:500] + "..." if len(result.content) > 500 else result.content)
print("=" * 50)
-
- print(f"\n📊 提取统计:")
- print(f" • 内容长度: {len(result.content)} 字符")
- print(f" • 置信度: {result.confidence_score:.3f}")
- print(f" • 标题: {result.title}")
- print(f" • 语言: {result.language}")
- print(f" • 提取时间: {result.extraction_time:.3f}秒")
-
+
+ print(f"\n📊 Extraction statistics:")
+ print(f" • Content length: {len(result.content)} characters")
+ print(f" • Confidence: {result.confidence_score:.3f}")
+ print(f" • Title: {result.title}")
+ print(f" • Language: {result.language}")
+ print(f" • Extraction time: {result.extraction_time:.3f}s")
+
if result.content_list:
- print(f" • 结构化内容块: {len(result.content_list)}个")
- for i, item in enumerate(result.content_list[:3]): # 显示前3个
+ print(f" • Structured content blocks: {len(result.content_list)}")
+ for i, item in enumerate(result.content_list[:3]): # Show first 3
print(f" [{i+1}] {item.get('type', 'unknown')}: {item.get('content', '')[:50]}...")
-
+
else:
- print("❌ 内容提取失败")
- print(f"错误信息: {result.error_message}")
+ print("❌ Content extraction failed")
+ print(f"Error message: {result.error_message}")
if result.error_traceback:
- print(f"错误详情:\n{result.error_traceback}")
-
+ print(f"Error details:\n{result.error_traceback}")
+
except Exception as e:
- print(f"❌ 提取过程中发生异常: {e}")
-
- print("\n🎯 高级功能说明:")
- print("• 智能分类: 使用LLM理解HTML元素语义,准确区分主要内容和辅助内容")
- print("• 格式约束: 通过logits processor确保LLM输出有效的JSON格式")
- print("• 性能优化: 自动跳过过于复杂的HTML,支持延迟加载模型")
- print("• 详细反馈: 提供分类结果、置信度和性能指标")
+ print(f"❌ Exception during extraction: {e}")
+
+ print("\n🎯 Advanced feature notes:")
+ print("• Smart classification: Uses LLM to understand HTML element semantics, accurately distinguishing main content from auxiliary content")
+ print("• Format constraint: Uses logits processor to ensure valid JSON output from the LLM")
+ print("• Performance optimization: Automatically skips overly complex HTML, supports lazy model loading")
+ print("• Detailed feedback: Provides classification results, confidence scores, and performance metrics")
if __name__ == "__main__":
main()
-
- print("\n💡 使用提示:")
- print("1. 确保已安装所需依赖: vllm, transformers, torch, llm_web_kit")
- print("2. 设置正确的模型路径")
- print("3. 根据硬件资源调整tensor_parallel_size和dtype")
- print("4. 对于大规模HTML,适当调整max_item_count限制")
- print("5. 使用use_logits_processor=True确保输出格式可靠性")
\ No newline at end of file
+
+ print("\n💡 Usage tips:")
+ print("1. Ensure required dependencies are installed: vllm, transformers, torch, llm_web_kit")
+ print("2. Set the correct model path")
+ print("3. Adjust tensor_parallel_size and dtype based on hardware resources")
+ print("4. For large-scale HTML, adjust max_item_count accordingly")
+ print("5. Use use_logits_processor=True to ensure reliable output format")
\ No newline at end of file
diff --git a/examples/magic_html_extract_demo.py b/examples/magic_html_extract_demo.py
index 726c054..ef90532 100644
--- a/examples/magic_html_extract_demo.py
+++ b/examples/magic_html_extract_demo.py
@@ -1,68 +1,68 @@
import time
from webmainbench.extractors import ExtractorFactory
-# 配置 MagicHTML 抽取器(这里可根据需要添加更多配置)
+# Configure MagicHTML extractor (add more configuration as needed)
config = {}
try:
- # 创建 MagicHTML 抽取器实例
+ # Create MagicHTML extractor instance
extractor = ExtractorFactory.create("magic-html", config=config)
- print(f"✅ Extractor创建成功: {extractor.description}")
- print(f"📋 版本: {extractor.version}")
- print(f"⚙️ 配置: {extractor.get_config()}\n")
+ print(f"✅ Extractor created successfully: {extractor.description}")
+ print(f"📋 Version: {extractor.version}")
+ print(f"⚙️ Config: {extractor.get_config()}\n")
except Exception as e:
- print(f"❌ Extractor创建失败: {e}")
+ print(f"❌ Failed to create extractor: {e}")
-# 测试 HTML
+# Test HTML
test_html = """
- Python编程教程
- 这是一个Python基础教程,展示如何定义函数。
+ Python Programming Tutorial
+ This is a basic Python tutorial demonstrating how to define functions.
def greet(name):
- ""问候函数""
+ ""Greeting function""
return f"Hello, {name}!"
-# 使用示例
+# Usage example
result = greet("World")
print(result)
- 这个函数可以用来问候任何人。
+ This function can be used to greet anyone.
"""
-print("🔍 开始内容提取...")
+print("🔍 Starting content extraction...")
start_time = time.time()
try:
result = extractor.extract(test_html)
end_time = time.time()
- print(f"⏱️ 提取耗时: {end_time - start_time:.2f}秒\n")
+ print(f"⏱️ Extraction time: {end_time - start_time:.2f}s\n")
- # 显示提取结果
+ # Display extraction results
if result.success:
- print("✅ 内容提取成功!\n")
+ print("✅ Content extracted successfully!\n")
- print("📄 提取的主要内容:")
+ print("📄 Extracted main content:")
print("=" * 50)
print(result.content[:500] + "..." if len(result.content) > 500 else result.content)
print("=" * 50)
- print(f"\n📊 提取统计:")
- print(f" • 内容长度: {len(result.content)} 字符")
- print(f" • 标题: {result.title}")
- print(f" • 语言: {result.language}")
- print(f" • 提取时间: {result.extraction_time:.3f}秒")
+ print(f"\n📊 Extraction statistics:")
+ print(f" • Content length: {len(result.content)} characters")
+ print(f" • Title: {result.title}")
+ print(f" • Language: {result.language}")
+ print(f" • Extraction time: {result.extraction_time:.3f}s")
if result.content_list:
- print(f" • 结构化内容块: {len(result.content_list)}个")
- for i, item in enumerate(result.content_list[:3]): # 显示前3个
+ print(f" • Structured content blocks: {len(result.content_list)}")
+ for i, item in enumerate(result.content_list[:3]): # Show first 3
print(f" [{i + 1}] {item.get('type', 'unknown')}: {item.get('content', '')[:50]}...")
else:
- print("❌ 内容提取失败")
- print(f"错误信息: {result.error_message}")
+ print("❌ Content extraction failed")
+ print(f"Error message: {result.error_message}")
if result.error_traceback:
- print(f"错误详情:\n{result.error_traceback}")
+ print(f"Error details:\n{result.error_traceback}")
except Exception as e:
- print(f"❌ 提取过程中发生异常: {e}")
\ No newline at end of file
+ print(f"❌ Exception during extraction: {e}")
diff --git a/examples/main_html_eval.py b/examples/main_html_eval.py
index cdeee0c..b29d395 100755
--- a/examples/main_html_eval.py
+++ b/examples/main_html_eval.py
@@ -1,12 +1,12 @@
#!/usr/bin/env python3
"""
-WebMainBench 基本使用示例
+WebMainBench Basic Usage Example
"""
import json
from pathlib import Path
-# 导入 WebMainBench 模块
+# Import WebMainBench modules
from webmainbench import (
DataLoader, DataSaver, BenchmarkDataset, DataSample,
ExtractorFactory, MainHTMLEvaluator,
@@ -16,17 +16,17 @@
def load_benchdata(dataset_path: str) -> BenchmarkDataset:
dataset_path = Path(dataset_path)
- print(f"📂 数据集文件: {dataset_path}")
-
+ print(f"📂 Dataset file: {dataset_path}")
+
if not dataset_path.exists():
- print(f"❌ 数据文件不存在: {dataset_path}")
- print("请确保已运行数据提取命令创建样本数据集")
+ print(f"❌ Data file does not exist: {dataset_path}")
+ print("Please ensure the data extraction command has been run to create the sample dataset")
return
-
- # 加载数据集
+
+ # Load dataset
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
dataset.name = "real_preprocessed_html_test"
- dataset.description = "基于真实数据的预处理HTML功能测试"
+ dataset.description = "Preprocessed HTML feature test based on real data"
return dataset
@@ -39,104 +39,103 @@ def save_results(result_file: Path, results: list[dict]):
with result_file.open("w", encoding="utf-8") as f:
for res in results:
f.write(json.dumps(res, ensure_ascii=False) + "\n")
-
-
+
+
def demo_llm_webkit_with_preprocessed_html_evaluation(model_path: str):
- """演示LLM-WebKit预处理HTML功能的评测"""
-
- print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n")
-
- # 设置日志
+ """Demonstrate evaluation of the LLM-WebKit preprocessed HTML feature"""
+
+ print("\n=== LLM-WebKit Preprocessed HTML Feature Demo ===\n")
+
+ # Set up logging
setup_logging(level="INFO")
-
- # 1. 从真实数据集加载包含预处理HTML的数据
- print("1. 从真实数据集加载预处理HTML数据...")
-
- # 使用DataLoader加载真实的样本数据
-
+
+ # 1. Load preprocessed HTML data from the real dataset
+ print("1. Loading preprocessed HTML data from the real dataset...")
+
+ # Load real sample data using DataLoader
dataset = load_benchdata("data/WebMainBench_llm-webkit_v1_WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
- print(f"✅ 真实数据集加载成功,包含 {len(dataset)} 个样本")
-
+ print(f"✅ Real dataset loaded successfully, contains {len(dataset)} samples")
+
+
+
+ # 2. Create LLM-WebKit extractor in preprocessed HTML mode
+ print("2. Creating LLM-WebKit extractor in preprocessed HTML mode...")
-
- # 2. 创建预处理HTML模式的LLM-WebKit抽取器
- print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")
-
extractor = load_extractor(model_path)
- print(f"✅ 抽取器创建成功")
- print(f"📋 配置信息:")
- print(f" - 跳过LLM推理: 是(直接处理预处理HTML)")
+ print(f"✅ Extractor created successfully")
+ print(f"📋 Configuration:")
+ print(f" - Skip LLM inference: Yes (process preprocessed HTML directly)")
print()
-
- # 4. 运行评测
- print("4. 开始评测...")
+
+ # 4. Run evaluation
+ print("4. Starting evaluation...")
print("=" * 50)
-
+
evaluator = MainHTMLEvaluator()
result = evaluator.evaluate(
dataset=dataset,
extractor=extractor,
max_samples=None
)
-
- # 5. 显示评测结果
- print("\n5. 📊 预处理HTML模式评测结果:")
+
+ # 5. Display evaluation results
+ print("\n5. 📊 Preprocessed HTML mode evaluation results:")
print("=" * 50)
-
+
results_dict = result.to_dict()
metrics = results_dict.get('overall_metrics', {})
-
- # 显示关键指标
- print(f"\n🏆 综合指标:")
+
+ # Display key metrics
+ print(f"\n🏆 Overall metrics:")
for key in metrics.keys():
print(f" {key}: {metrics[key]:.4f}")
-
- print(f"\n⚡ 性能统计:")
+
+ print(f"\n⚡ Performance statistics:")
sample_results = results_dict.get('sample_results', [])
if sample_results:
extraction_times = [s.get('extraction_time', 0) for s in sample_results if s.get('extraction_success')]
if extraction_times:
avg_time = sum(extraction_times) / len(extraction_times)
- print(f" 平均提取时间: {avg_time:.3f}秒")
- print(f" 处理速度: {1/avg_time:.1f}样本/秒")
-
+ print(f" Average extraction time: {avg_time:.3f}s")
+ print(f" Processing speed: {1/avg_time:.1f} samples/s")
+
success_count = len([s for s in sample_results if s.get('extraction_success', False)])
- print(f" 成功样本数: {success_count}/{len(dataset)}")
-
- # 7. 保存结果
- print(f"\n6. 💾 保存评测结果...")
-
+ print(f" Successful samples: {success_count}/{len(dataset)}")
+
+    # 6. Save results
+ print(f"\n6. 💾 Saving evaluation results...")
+
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
- # 新增:保存带抽取结果的增强数据集(JSONL格式)
+ # Save enhanced dataset with extraction results (JSONL format)
jsonl_dataset_path = results_dir / f"{extractor.name}_preprocessed_html_dataset_with_results.jsonl"
save_results(jsonl_dataset_path, result.sample_results)
- print(f"✅ 结果已保存到: {jsonl_dataset_path}")
-
-
- print(f"✅ 带抽取结果的JSONL数据集已保存到: {jsonl_dataset_path}")
+ print(f"✅ Results saved to: {jsonl_dataset_path}")
+
+
+ print(f"✅ JSONL dataset with extraction results saved to: {jsonl_dataset_path}")
results_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_results.json"
report_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_report.csv"
-
+
DataSaver.save_evaluation_results(result, results_path)
DataSaver.save_summary_report(result, report_path)
-
- print(f"✅ 详细结果已保存到: {results_path}")
- print(f"✅ CSV报告已保存到: {report_path}")
-
+
+ print(f"✅ Detailed results saved to: {results_path}")
+ print(f"✅ CSV report saved to: {report_path}")
+
if __name__ == "__main__":
import argparse
- parser = argparse.ArgumentParser(description="WebMainBench 基本使用示例")
- parser.add_argument("--model_path", required=True, help="LLM model路径")
+ parser = argparse.ArgumentParser(description="WebMainBench Basic Usage Example")
+ parser.add_argument("--model_path", required=True, help="LLM model path")
args = parser.parse_args()
try:
demo_llm_webkit_with_preprocessed_html_evaluation(args.model_path)
- print("\n✅ 示例运行完成!")
-
+ print("\n✅ Example completed!")
+
except Exception as e:
- print(f"\n❌ 运行出错: {e}")
+ print(f"\n❌ Runtime error: {e}")
import traceback
- traceback.print_exc()
\ No newline at end of file
+ traceback.print_exc()
diff --git a/examples/multi_extractor_compare.py b/examples/multi_extractor_compare.py
index 9b3a56f..6b3390d 100644
--- a/examples/multi_extractor_compare.py
+++ b/examples/multi_extractor_compare.py
@@ -1,56 +1,56 @@
from webmainbench import DataLoader, Evaluator, ExtractorFactory, DataSaver
from pathlib import Path
-# 如需调用LLM修正抽取结果,在 webmainbench/config.py 中配置 LLM api
+# To use LLM to correct extraction results, configure the LLM API in webmainbench/config.py
def all_extractor_comparison():
- """演示多抽取器对比"""
-
- print("\n=== 多抽取器对比演示 ===\n")
-
- # 创建数据集
+ """Demonstrate multi-extractor comparison"""
+
+ print("\n=== Multi-Extractor Comparison Demo ===\n")
+
+ # Create dataset
dataset_path = Path("../data/WebMainBench_llm-webkit_v1_WebMainBench_7887_within_formula.jsonl")
dataset = DataLoader.load_jsonl(dataset_path)
- # 创建webkit抽取器
+ # Create webkit extractor
config = {
- "use_preprocessed_html": True, # 🔑 关键配置:启用预处理HTML模式
- "preprocessed_html_field": "llm_webkit_html" # 指定预处理HTML字段名
+ "use_preprocessed_html": True, # Key config: enable preprocessed HTML mode
+ "preprocessed_html_field": "llm_webkit_html" # Specify the preprocessed HTML field name
}
webkit_extractor = ExtractorFactory.create("llm-webkit", config=config)
- # 创建magic-extractor抽取器
+    # Create magic-html extractor
magic_extractor = ExtractorFactory.create("magic-html")
- # 创建trafilatura抽取器,抽取成markdown
+    # Create trafilatura extractor that outputs markdown
trafilatura_extractor = ExtractorFactory.create("trafilatura")
- # 创建trafilatura抽取器,抽取成txt
+    # Create trafilatura extractor that outputs plain text
trafilatura_txt_extractor = ExtractorFactory.create("trafilatura_txt")
- # 创建resiliparse抽取器
+ # Create resiliparse extractor
resiliparse_extractor = ExtractorFactory.create("resiliparse")
-
- # 运行对比
+
+ # Run comparison
evaluator = Evaluator()
extractors = [webkit_extractor, magic_extractor, trafilatura_extractor,trafilatura_txt_extractor, resiliparse_extractor]
# extractors = [webkit_extractor]
-
+
results = evaluator.compare_extractors(
dataset=dataset,
extractors=extractors
)
-
- # 显示对比结果
- print("对比结果:")
+
+ # Display comparison results
+ print("Comparison results:")
print("-" * 40)
for extractor_name, result in results.items():
overall_score = result.overall_metrics.get('overall', 0)
print(f"{extractor_name}: {overall_score:.4f}")
-
- # 保存多抽取器对比榜单
+
+ # Save multi-extractor comparison leaderboard
all_results = []
for extractor_name, result in results.items():
all_results.append(result.to_dict())
-
+
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
leaderboard_path = results_dir / "leaderboard.csv"
@@ -60,10 +60,10 @@ def all_extractor_comparison():
DataSaver.save_evaluation_results(all_results, evaluation_results_path)
DataSaver.save_dataset_with_extraction(
results=all_results,
- dataset=dataset, # 原始数据集对象
+ dataset=dataset, # Original dataset object
file_path=jsonl_dataset_path
)
- print(f"\n📊 榜单已保存到: {leaderboard_path}")
+ print(f"\nLeaderboard saved to: {leaderboard_path}")
if __name__ == "__main__":
diff --git a/examples/resiliparse_extract_demo.py b/examples/resiliparse_extract_demo.py
index ba33a14..17c941f 100644
--- a/examples/resiliparse_extract_demo.py
+++ b/examples/resiliparse_extract_demo.py
@@ -1,7 +1,7 @@
import time
from webmainbench.extractors import ExtractorFactory
-# 配置 Resiliparse 抽取器
+# Configure Resiliparse extractor
config = {
"main_content": True,
"alt_texts": True,
@@ -14,66 +14,66 @@
}
try:
- # 创建 Resiliparse 抽取器实例
+ # Create Resiliparse extractor instance
extractor = ExtractorFactory.create("resiliparse", config=config)
- print(f"✅ Extractor创建成功: {extractor.description}")
- print(f"📋 版本: {extractor.version}")
- print(f"⚙️ 配置: {extractor.get_config()}\n")
+ print(f"✅ Extractor created successfully: {extractor.description}")
+ print(f"📋 Version: {extractor.version}")
+ print(f"⚙️ Config: {extractor.get_config()}\n")
except Exception as e:
- print(f"❌ Extractor创建失败: {e}")
+ print(f"❌ Failed to create extractor: {e}")
-# 测试 HTML
+# Test HTML
test_html = """
- Python编程教程
- 这是一个Python基础教程,展示如何定义函数。
+ Python Programming Tutorial
+ This is a basic Python tutorial demonstrating how to define functions.
def greet(name):
- ""问候函数""
+ ""Greeting function""
return f"Hello, {name}!"
-# 使用示例
+# Usage example
result = greet("World")
print(result)
- 这个函数可以用来问候任何人。
+ This function can be used to greet anyone.
"""
-print("🔍 开始内容提取...")
+print("🔍 Starting content extraction...")
start_time = time.time()
try:
result = extractor.extract(test_html)
end_time = time.time()
- print(f"⏱️ 提取耗时: {end_time - start_time:.2f}秒\n")
+ print(f"⏱️ Extraction time: {end_time - start_time:.2f}s\n")
- # 显示提取结果
+ # Display extraction results
if result.success:
- print("✅ 内容提取成功!\n")
+ print("✅ Content extracted successfully!\n")
- print("📄 提取的主要内容:")
+ print("📄 Extracted main content:")
print("=" * 50)
print(result.content[:500] + "..." if len(result.content) > 500 else result.content)
print("=" * 50)
- print(f"\n📊 提取统计:")
- print(f" • 内容长度: {len(result.content)} 字符")
- print(f" • 标题: {result.title}")
- print(f" • 语言: {result.language}")
- print(f" • 提取时间: {result.extraction_time:.3f}秒")
+ print(f"\n📊 Extraction statistics:")
+ print(f" • Content length: {len(result.content)} characters")
+ print(f" • Title: {result.title}")
+ print(f" • Language: {result.language}")
+ print(f" • Extraction time: {result.extraction_time:.3f}s")
if result.content_list:
- print(f" • 结构化内容块: {len(result.content_list)}个")
- for i, item in enumerate(result.content_list[:3]): # 显示前3个
+ print(f" • Structured content blocks: {len(result.content_list)}")
+ for i, item in enumerate(result.content_list[:3]): # Show first 3
print(f" [{i + 1}] {item.get('type', 'unknown')}: {item.get('content', '')[:50]}...")
else:
- print("❌ 内容提取失败")
- print(f"错误信息: {result.error_message}")
+ print("❌ Content extraction failed")
+ print(f"Error message: {result.error_message}")
if result.error_traceback:
- print(f"错误详情:\n{result.error_traceback}")
+ print(f"Error details:\n{result.error_traceback}")
except Exception as e:
- print(f"❌ 提取过程中发生异常: {e}")
+ print(f"❌ Exception during extraction: {e}")
diff --git a/examples/teds_usage.py b/examples/teds_usage.py
index 000f288..70d4e51 100644
--- a/examples/teds_usage.py
+++ b/examples/teds_usage.py
@@ -1,8 +1,9 @@
#!/usr/bin/env python3
"""
-WebMainBench TEDS 算法使用示例
+WebMainBench TEDS Algorithm Usage Example
-展示如何在评估中使用 TEDS (Tree-Edit Distance based Similarity) 算法进行表格评估
+Demonstrates how to use the TEDS (Tree-Edit Distance based Similarity) algorithm
+for table evaluation.
"""
import sys
@@ -18,268 +19,268 @@
def demo_teds_configuration():
- """演示如何配置 TEDS 算法"""
- print("=== 🔧 TEDS 配置示例 ===\n")
-
- # 方法1: 使用 TableTEDSMetric 指标
- print("**方法1: 使用专用的 TableTEDSMetric 指标**")
+ """Demonstrate how to configure the TEDS algorithm"""
+ print("=== TEDS Configuration Example ===\n")
+
+ # Method 1: Use the TableTEDSMetric metric
+ print("**Method 1: Use the dedicated TableTEDSMetric metric**")
evaluation_config = {
"metrics": {
"table_extraction": {
- "use_teds": True, # 启用 TEDS 算法
- "structure_only": False # 同时考虑结构和内容
+ "use_teds": True, # Enable TEDS algorithm
+ "structure_only": False # Consider both structure and content
}
}
}
- print("配置:", evaluation_config)
+ print("Config:", evaluation_config)
print()
-
- # 方法2: 直接使用 TEDS 指标
- print("**方法2: 直接使用独立的 TEDS 指标**")
+
+ # Method 2: Use TEDS metric directly
+ print("**Method 2: Use the standalone TEDS metric directly**")
teds_config = {
"metrics": {
"teds": {
"structure_only": False,
"ignore_nodes": ["tbody", "thead", "tfoot"]
},
- "s_teds": { # 结构化 TEDS
+ "s_teds": { # Structural TEDS
"structure_only": True
}
}
}
- print("配置:", teds_config)
+ print("Config:", teds_config)
print()
def demo_teds_comparison():
- """演示 TEDS 与简单算法的对比"""
- print("=== ⚖️ TEDS vs 简单算法对比 ===\n")
-
- # 准备测试数据
+ """Demonstrate comparison of TEDS vs simple algorithm"""
+ print("=== TEDS vs Simple Algorithm Comparison ===\n")
+
+ # Prepare test data
test_cases = [
{
- "name": "完全匹配的表格",
+ "name": "Perfectly matching table",
"extracted": """
- 产品 价格
- 苹果 5元
- 橙子 3元
+ Product Price
+ Apple $5
+ Orange $3
""",
"groundtruth": """
- 产品 价格
- 苹果 5元
- 橙子 3元
+ Product Price
+ Apple $5
+ Orange $3
"""
},
{
- "name": "缺少行的表格",
+ "name": "Table with missing row",
"extracted": """
- 产品 价格
- 苹果 5元
+ Product Price
+ Apple $5
""",
"groundtruth": """
- 产品 价格
- 苹果 5元
- 橙子 3元
- 香蕉 4元
+ Product Price
+ Apple $5
+ Orange $3
+ Banana $4
"""
},
{
- "name": "结构不同的表格",
+ "name": "Table with different structure",
"extracted": """
- 产品 价格
- 苹果 5元
+ Product Price
+ Apple $5
""",
"groundtruth": """
- 产品 价格 库存
- 苹果 5元 100
+ Product Price Stock
+ Apple $5 100
"""
}
]
-
- print("| 测试用例 | 简单算法 | TEDS算法 | S-TEDS | 差异 |")
+
+ print("| Test case | Simple | TEDS | S-TEDS | Diff |")
print("|---------|---------|---------|--------|------|")
-
+
for case in test_cases:
- # 简单算法评估
+ # Simple algorithm evaluation
simple_evaluator = Evaluator(task_config={
"metrics": {
"table_extraction": {"use_teds": False}
}
})
-
- # TEDS 算法评估
+
+ # TEDS algorithm evaluation
teds_evaluator = Evaluator(task_config={
"metrics": {
"table_extraction": {"use_teds": True}
}
})
-
- # 创建模拟数据
+
+ # Create mock data
sample = DataSample(
id=f"test_{case['name']}",
- html="测试HTML
",
- content="测试内容",
+ html="Test HTML
",
+ content="Test content",
content_list=[{"table": case["groundtruth"]}]
)
-
+
extraction_result = ExtractionResult(
extractor_name="test",
- extracted_content="测试内容",
+ extracted_content="Test content",
extracted_content_list=[{"table": case["extracted"]}]
)
-
- # 计算得分
+
+ # Calculate scores
try:
simple_result = simple_evaluator.evaluate_single(sample, extraction_result)
teds_result = teds_evaluator.evaluate_single(sample, extraction_result)
-
+
simple_score = simple_result.overall_metrics.get("table_extraction", 0.0)
teds_score = teds_result.overall_metrics.get("table_extraction", 0.0)
-
- # S-TEDS (结构化) 评估
+
+ # S-TEDS (structure-only) evaluation
s_teds = StructureTEDSMetric("s_teds")
s_teds_result = s_teds.calculate(case["extracted"], case["groundtruth"])
s_teds_score = s_teds_result.score
-
+
diff = abs(simple_score - teds_score)
-
+
print(f"| {case['name'][:10]}... | {simple_score:.4f} | {teds_score:.4f} | {s_teds_score:.4f} | {diff:.4f} |")
-
+
except Exception as e:
- print(f"| {case['name'][:10]}... | 错误 | 错误 | 错误 | - |")
- print(f" 错误信息: {e}")
-
+ print(f"| {case['name'][:10]}... | Error | Error | Error | - |")
+ print(f" Error message: {e}")
+
print()
def demo_advanced_teds_features():
- """演示 TEDS 的高级功能"""
- print("=== 🚀 TEDS 高级功能演示 ===\n")
-
- # 1. 处理 Markdown 表格
- print("**1. Markdown 表格支持**")
+ """Demonstrate advanced TEDS features"""
+ print("=== TEDS Advanced Feature Demo ===\n")
+
+ # 1. Handle Markdown tables
+ print("**1. Markdown Table Support**")
teds = TEDSMetric("teds")
-
+
markdown_table = """
- | 姓名 | 年龄 | 职业 |
+ | Name | Age | Occupation |
|------|------|------|
- | 张三 | 25 | 工程师 |
- | 李四 | 30 | 设计师 |
+ | Alice | 25 | Engineer |
+ | Bob | 30 | Designer |
"""
-
+
html_table = """
- 姓名 年龄 职业
- 张三 25 工程师
- 李四 30 设计师
+ Name Age Occupation
+ Alice 25 Engineer
+ Bob 30 Designer
"""
-
+
result = teds.calculate(markdown_table, html_table)
- print(f"Markdown vs HTML 表格 TEDS 得分: {result.score:.4f}")
- print(f"详细信息: {result.details}")
+ print(f"Markdown vs HTML table TEDS score: {result.score:.4f}")
+ print(f"Details: {result.details}")
print()
-
- # 2. 复杂表格结构
- print("**2. 复杂表格结构支持 (colspan, rowspan)**")
+
+ # 2. Complex table structure
+ print("**2. Complex Table Structure Support (colspan, rowspan)**")
complex_table1 = """
- 学生信息
- 姓名 成绩
- 张三 95
- 李四 87
+ Student Info
+ Name Score
+ Alice 95
+ Bob 87
"""
-
+
complex_table2 = """
- 类别 详情
- 姓名 成绩
- 张三 95
- 李四 87
+ Category Details
+ Name Score
+ Alice 95
+ Bob 87
"""
-
+
result = teds.calculate(complex_table1, complex_table2)
- print(f"复杂表格结构 TEDS 得分: {result.score:.4f}")
- print(f"编辑距离: {result.details.get('edit_distance')}")
- print(f"节点数量: 预测={result.details.get('predicted_nodes')}, 真实={result.details.get('groundtruth_nodes')}")
+ print(f"Complex table structure TEDS score: {result.score:.4f}")
+ print(f"Edit distance: {result.details.get('edit_distance')}")
+ print(f"Node count: predicted={result.details.get('predicted_nodes')}, groundtruth={result.details.get('groundtruth_nodes')}")
print()
-
- # 3. 结构化 vs 内容敏感评估
- print("**3. 结构化 vs 内容敏感评估对比**")
+
+ # 3. Structure-only vs content-sensitive evaluation
+ print("**3. Structure-only vs Content-sensitive Evaluation Comparison**")
content_teds = TEDSMetric("content_teds", {"structure_only": False})
structure_teds = StructureTEDSMetric("structure_teds")
-
+
table_diff_content = """
"""
-
+
table_same_structure = """
"""
-
+
content_result = content_teds.calculate(table_diff_content, table_same_structure)
structure_result = structure_teds.calculate(table_diff_content, table_same_structure)
-
- print(f"内容敏感 TEDS 得分: {content_result.score:.4f}")
- print(f"仅结构 S-TEDS 得分: {structure_result.score:.4f}")
- print(f"说明: S-TEDS 忽略文本内容差异,只关注表格结构")
+
+ print(f"Content-sensitive TEDS score: {content_result.score:.4f}")
+ print(f"Structure-only S-TEDS score: {structure_result.score:.4f}")
+ print(f"Note: S-TEDS ignores text content differences and only focuses on table structure")
print()
def demo_evaluation_workflow():
- """演示完整的评估工作流程"""
- print("=== 📋 完整评估工作流程 ===\n")
-
- print("**步骤 1: 准备数据**")
- # 模拟评估数据
+ """Demonstrate the complete evaluation workflow"""
+ print("=== Complete Evaluation Workflow ===\n")
+
+ print("**Step 1: Prepare data**")
+ # Simulated evaluation data
sample_data = DataSample(
id="sample_001",
html="""
-
产品价格表
+
Product Price List
- 产品 价格 库存
- iPhone 5999元 50
- iPad 3999元 30
- MacBook 12999元 10
+ Product Price Stock
+ iPhone $599 50
+ iPad $399 30
+ MacBook $1299 10
""",
- content="产品价格表\n\n| 产品 | 价格 | 库存 |\n|------|------|------|\n| iPhone | 5999元 | 50 |\n| iPad | 3999元 | 30 |\n| MacBook | 12999元 | 10 |",
+ content="Product Price List\n\n| Product | Price | Stock |\n|------|------|------|\n| iPhone | $599 | 50 |\n| iPad | $399 | 30 |\n| MacBook | $1299 | 10 |",
content_list=[
{
"type": "title",
- "content": "产品价格表"
+ "content": "Product Price List"
},
{
"type": "table",
- "content": "| 产品 | 价格 | 库存 |\n|------|------|------|\n| iPhone | 5999元 | 50 |\n| iPad | 3999元 | 30 |\n| MacBook | 12999元 | 10 |"
+ "content": "| Product | Price | Stock |\n|------|------|------|\n| iPhone | $599 | 50 |\n| iPad | $399 | 30 |\n| MacBook | $1299 | 10 |"
}
]
)
- print("✅ 数据准备完成")
-
- print("\n**步骤 2: 配置 TEDS 评估器**")
+ print("Data preparation complete")
+
+ print("\n**Step 2: Configure TEDS evaluator**")
evaluation_config = {
"metrics": {
"overall": "edit_distance",
@@ -289,74 +290,74 @@ def demo_evaluation_workflow():
}
}
}
-
+
evaluator = Evaluator(task_config=evaluation_config)
- print("✅ 评估器配置完成")
-
- print("\n**步骤 3: 模拟抽取结果**")
- # 模拟一个有轻微错误的抽取结果
+ print("Evaluator configuration complete")
+
+ print("\n**Step 3: Simulate extraction results**")
+ # Simulate extraction result with minor errors
extraction_result = ExtractionResult(
extractor_name="TestExtractor",
- extracted_content="产品价格表\n\n| 产品 | 价格 |\n|------|------|\n| iPhone | 5999元 |\n| iPad | 3999元 |", # 缺少库存列和MacBook行
+ extracted_content="Product Price List\n\n| Product | Price |\n|------|------|\n| iPhone | $599 |\n| iPad | $399 |", # Missing stock column and MacBook row
extracted_content_list=[
{
- "type": "title",
- "content": "产品价格表"
+ "type": "title",
+ "content": "Product Price List"
},
{
"type": "table",
- "content": "| 产品 | 价格 |\n|------|------|\n| iPhone | 5999元 |\n| iPad | 3999元 |"
+ "content": "| Product | Price |\n|------|------|\n| iPhone | $599 |\n| iPad | $399 |"
}
]
)
- print("✅ 模拟抽取结果生成")
-
- print("\n**步骤 4: 执行评估**")
+ print("Simulated extraction result generated")
+
+ print("\n**Step 4: Run evaluation**")
evaluation_result = evaluator.evaluate_single(sample_data, extraction_result)
-
- print(f"📊 评估结果:")
- print(f" - 整体得分: {evaluation_result.overall_metrics.get('overall', 'N/A'):.4f}")
- print(f" - 表格抽取 (TEDS): {evaluation_result.overall_metrics.get('table_extraction', 'N/A'):.4f}")
- print(f" - 成功率: {evaluation_result.metadata.get('success_rate', 'N/A'):.2%}")
-
- # 显示详细的 TEDS 信息
+
+ print(f"Evaluation results:")
+ print(f" - Overall score: {evaluation_result.overall_metrics.get('overall', 'N/A'):.4f}")
+ print(f" - Table extraction (TEDS): {evaluation_result.overall_metrics.get('table_extraction', 'N/A'):.4f}")
+ print(f" - Success rate: {evaluation_result.metadata.get('success_rate', 'N/A'):.2%}")
+
+ # Display detailed TEDS information
if evaluation_result.detailed_metrics:
for metric_name, metric_result in evaluation_result.detailed_metrics.items():
if 'teds' in metric_name.lower():
- print(f"\n🔍 {metric_name} 详细信息:")
+ print(f"\n{metric_name} details:")
details = metric_result.details
- print(f" - 算法: {details.get('algorithm', 'N/A')}")
- print(f" - 编辑距离: {details.get('edit_distance', 'N/A')}")
- print(f" - 节点数量 (预测/真实): {details.get('predicted_nodes', 'N/A')}/{details.get('groundtruth_nodes', 'N/A')}")
-
- print("\n✅ 评估完成")
+ print(f" - Algorithm: {details.get('algorithm', 'N/A')}")
+ print(f" - Edit distance: {details.get('edit_distance', 'N/A')}")
+ print(f" - Node count (predicted/groundtruth): {details.get('predicted_nodes', 'N/A')}/{details.get('groundtruth_nodes', 'N/A')}")
+
+ print("\nEvaluation complete")
if __name__ == "__main__":
- print("🚀 WebMainBench TEDS 算法使用示例\n")
+ print("WebMainBench TEDS Algorithm Usage Example\n")
print("=" * 60)
-
+
try:
demo_teds_configuration()
print("=" * 60)
-
+
demo_teds_comparison()
print("=" * 60)
-
+
demo_advanced_teds_features()
print("=" * 60)
-
+
demo_evaluation_workflow()
-
- print("\n🎉 所有演示完成!")
- print("\n💡 要点总结:")
- print(" 1. TEDS 算法提供更学术严谨的表格评估")
- print(" 2. 支持 HTML、Markdown 等多种表格格式")
- print(" 3. 可配置结构化评估 (S-TEDS) 或内容敏感评估")
- print(" 4. 能够准确识别表格结构差异和内容差异")
- print(" 5. 与现有评估流程完全兼容")
-
+
+ print("\nAll demos complete!")
+ print("\nKey takeaways:")
+ print(" 1. TEDS algorithm provides more academically rigorous table evaluation")
+ print(" 2. Supports multiple table formats including HTML and Markdown")
+ print(" 3. Configurable structure-only evaluation (S-TEDS) or content-sensitive evaluation")
+ print(" 4. Accurately identifies table structure differences and content differences")
+ print(" 5. Fully compatible with existing evaluation workflows")
+
except Exception as e:
- print(f"\n❌ 演示过程中发生错误: {e}")
+ print(f"\nError during demo: {e}")
import traceback
- traceback.print_exc()
\ No newline at end of file
+ traceback.print_exc()
\ No newline at end of file
diff --git a/examples/test_model.py b/examples/test_model.py
index 59b88aa..c7bc994 100644
--- a/examples/test_model.py
+++ b/examples/test_model.py
@@ -1,16 +1,16 @@
from webmainbench import DataLoader, Evaluator, ExtractorFactory
-# 1. 加载评测数据集
+# 1. Load evaluation dataset
dataset = DataLoader.load_jsonl("WebMainBench/data/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_2549_llm_webkit.jsonl")
-# 2. 创建抽取器
+# 2. Create extractor
extractor = ExtractorFactory.create("test-model")
-# 3. 运行评测
+# 3. Run evaluation
evaluator = Evaluator()
result = evaluator.evaluate(dataset, extractor)
-# 4. 查看结果
+# 4. View results
print(f"Overall Score: {result.overall_metrics}")
print(f"Category Metrics: {result.category_metrics}")
print(f"Error Analysis: {result.error_analysis}")
diff --git a/examples/test_table_extract.py b/examples/test_table_extract.py
index 00b978c..102409d 100644
--- a/examples/test_table_extract.py
+++ b/examples/test_table_extract.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
-脚本:仅提取 WebMainBench 数据集中的表格内容到 table.md
+Script: Extract only table content from the WebMainBench dataset into table.md
"""
import json
@@ -8,52 +8,52 @@
import os
from pathlib import Path
-# 添加父目录到 sys.path 以便导入 webmainbench
+# Add parent directory to sys.path for importing webmainbench
sys.path.append(str(Path(__file__).parent.parent))
from webmainbench.metrics.base import BaseMetric
def extract_only_tables_from_dataset():
- """只提取 WebMainBench 数据集中的表格内容并输出到 table.md(table为空的不记录)"""
+ """Extract only table content from the WebMainBench dataset and output to table.md (items with empty tables are not recorded)"""
- # 路径配置
+ # Path configuration
dataset_path = "/home/zhangshuo/Desktop/vscodeworkspace/WebMainBench/data/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
output_path = "table.md"
- # 检查数据集文件是否存在
+ # Check if the dataset file exists
if not os.path.exists(dataset_path):
- print(f"错误:未找到数据集文件 {dataset_path}")
+ print(f"Error: dataset file not found: {dataset_path}")
return
extracted_tables = []
line_ids = []
- # 按行读取 JSONL 文件
+ # Read JSONL file line by line
with open(dataset_path, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
try:
data = json.loads(line.strip())
- # 提取ID和内容
+ # Extract ID and content
item_id = data.get('track_id', f'line_{line_num}')
content = data.get('llm_webkit_md', '')
- # 使用 _extract_from_markdown 提取
+ # Use _extract_from_markdown to extract
if content:
extracted = BaseMetric._extract_from_markdown(content)
table_content = extracted.get("table", "")
- # 只记录table不为空的项
+ # Only record items with non-empty table
if table_content and table_content.strip():
extracted_tables.append(table_content)
line_ids.append((item_id, line_num))
except json.JSONDecodeError as e:
- print(f"解析JSON出错,行{line_num}: {e}")
+ print(f"JSON parse error at line {line_num}: {e}")
continue
except Exception as e:
- print(f"处理第{line_num}行时出错: {e}")
+ print(f"Error processing line {line_num}: {e}")
continue
- # 写入 table.md 文件,只输出 table 字段
+ # Write to table.md, output only the table field
with open(output_path, 'w', encoding='utf-8') as f:
f.write("# Extracted Table Content from WebMainBench Dataset\n\n")
f.write(f"Total items processed: {len(extracted_tables)}\n\n")
@@ -68,8 +68,8 @@ def extract_only_tables_from_dataset():
f.write("\n```\n\n")
f.write("---\n\n")
- print(f"表格提取完成!共处理 {len(extracted_tables)} 条数据。")
- print(f"表格内容已保存到: {output_path}")
+ print(f"Table extraction complete! Processed {len(extracted_tables)} items.")
+ print(f"Table content saved to: {output_path}")
if __name__ == "__main__":
extract_only_tables_from_dataset()
diff --git a/examples/trafilatura_extract_demo.py b/examples/trafilatura_extract_demo.py
index 1ee9f3c..f031306 100644
--- a/examples/trafilatura_extract_demo.py
+++ b/examples/trafilatura_extract_demo.py
@@ -1,20 +1,20 @@
import time
from webmainbench.extractors import ExtractorFactory
-# 配置 Trafilatura 抽取器(这里可根据需要添加更多配置)
+# Configure Trafilatura extractor (add more configuration as needed)
config = {}
try:
- # 创建 Trafilatura 抽取器实例
+ # Create Trafilatura extractor instance
extractor = ExtractorFactory.create("trafilatura", config=config)
- print(f"✅ Extractor创建成功: {extractor.description}")
- print(f"📋 版本: {extractor.version}")
- print(f"⚙️ 配置: {extractor.get_config()}\n")
+ print(f"Extractor created successfully: {extractor.description}")
+ print(f"Version: {extractor.version}")
+ print(f"Config: {extractor.get_config()}\n")
except Exception as e:
- print(f"❌ Extractor创建失败: {e}")
+ print(f"Failed to create extractor: {e}")
-# 测试 HTML
+# Test HTML
test_html = """