Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,12 @@ results/
.coverage*
coverage.xml

webmainbench.egg-info/*
webmainbench.egg-info/*

# PyPI packaging
build/
dist/
*.egg-info/
.eggs/
*.egg
.pypirc
15 changes: 7 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,6 @@ WebMainBench is a specialized benchmark tool for end-to-end evaluation of web ma
```bash
# Basic installation
pip install webmainbench

# Install with all optional dependencies
pip install webmainbench[all]

# Development environment installation
pip install webmainbench[dev]
```

### Basic Usage
Expand All @@ -55,13 +49,18 @@ pip install webmainbench[dev]
from webmainbench import DataLoader, Evaluator, ExtractorFactory

# 1. Load evaluation dataset
dataset = DataLoader.load_jsonl("your_dataset.jsonl")
dataset = DataLoader.load_jsonl("data/WebMainBench_dataset_sample2.jsonl")

# 2. Create extractor
extractor = ExtractorFactory.create("trafilatura")

# 3. Run evaluation
evaluator = Evaluator()
evaluator = Evaluator(llm_config={
"use_llm": True,
"llm_base_url": "",
"llm_api_key": "",
"llm_model": "gpt-5-chat-latest",
})
result = evaluator.evaluate(dataset, extractor)

# 4. View results
Expand Down
4 changes: 2 additions & 2 deletions data/WebMainBench_dataset_sample2.jsonl

Large diffs are not rendered by default.

12 changes: 8 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
name="webmainbench",
version="0.1.0",
author="WebMainBench Team",
author_email="webmainbench@example.com",
author_email="chupei@pjlab.org.cn",
description="A comprehensive benchmark for web main content extraction",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/example/webmainbench",
url="https://github.com/opendatalab/WebMainBench",
packages=find_packages(),
classifiers=[
"Development Status :: 3 - Alpha",
Expand All @@ -32,11 +32,15 @@
],
python_requires=">=3.8",
install_requires=[
"lxml==5.3.0",
"lxml>=5.3.0",
"jsonlines>=3.1.0",
"requests>=2.28.0",
"beautifulsoup4==4.12.0",
"beautifulsoup4>=4.12.0",
"numpy>=1.21.0,<2.0.0", # 避免NumPy 2.x兼容性问题
"rapidfuzz>=3.0.0", # 用于文本编辑距离计算
"apted>=1.0.3", # 用于树编辑距离计算(TEDS)
"jieba>=0.42.0", # 用于中文分词
"rouge>=1.0.0", # 用于 ROUGE 指标
],
extras_require={
"all": [
Expand Down
11 changes: 6 additions & 5 deletions webmainbench/config.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""
全局配置文件
"""
"""Package-wide configuration."""

# LLM配置,用于修正抽取工具的抽取结果
# LLM settings for refinement of extractor outputs
LLM_CONFIG = {
'llm_base_url': '',
'llm_api_key': '',
'llm_model': 'deepseek-chat',
'use_llm': True
'use_llm': True,
}

# When True, print LLM enhancement / cache diagnostics (very noisy).
METRICS_DEBUG = False
112 changes: 66 additions & 46 deletions webmainbench/evaluator/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,81 +78,101 @@ def from_dict(cls, data: Dict[str, Any]) -> "EvaluationResult":
class Evaluator:
"""Main evaluator for web content extraction benchmarks."""

def __init__(self, metric_config: Dict[str, Any] = None):
def __init__(self, metric_config: Dict[str, Any] = None,
llm_config: Dict[str, Any] = None):
"""
Initialize the evaluator.

Args:
metric_config: Configuration for metrics
llm_config: Optional LLM configuration dict to override webmainbench/config.py.
Supported keys:
- use_llm (bool): whether to enable LLM enhancement
- llm_base_url (str): API base URL
- llm_api_key (str): API key
- llm_model (str): model name (default: 'deepseek-chat')
Example:
Evaluator(llm_config={
'use_llm': True,
'llm_base_url': 'https://api.deepseek.com',
'llm_api_key': 'sk-xxxxxxxxxxxx',
'llm_model': 'deepseek-chat',
})
"""

self._validate_llm_config()
self._validate_llm_config(llm_config)

self.metric_calculator = MetricCalculator(metric_config)
self.metric_config = metric_config or {}

def _validate_llm_config(self):
"""验证LLM配置的完整性和有效性"""
def _validate_llm_config(self, llm_config: Dict[str, Any] = None):
"""Validate LLM configuration completeness and API connectivity."""
import time
from ..config import LLM_CONFIG

if LLM_CONFIG.get('use_llm', False):
# 检查配置完整性
if not LLM_CONFIG.get('llm_base_url') or not LLM_CONFIG.get('llm_api_key'):
# External llm_config takes priority over config.py
config = {**LLM_CONFIG, **(llm_config or {})}

if config.get('use_llm', False):
if not config.get('llm_base_url') or not config.get('llm_api_key'):
print("\n" + "=" * 60)
print("❌ 错误:LLM配置不完整!")
print("❌ Error: Incomplete LLM configuration!")
print("-" * 60)
print("当前 use_llm = True,但缺少必要的API配置。")
print("\n请在 webmainbench/config.py 中完成以下配置:")
print(" 1. llm_base_url (例如: 'https://api.deepseek.com')")
print(" 2. llm_api_key (例如: 'sk-xxxxxxxxxxxx')")
print("\n或者设置 use_llm = False 来禁用LLM功能。")
print("'use_llm' is set to True, but required API settings are missing.")
print("\nOption 1 - Pass config directly to Evaluator:")
print(" Evaluator(llm_config={")
print(" 'use_llm': True,")
print(" 'llm_base_url': 'https://api.deepseek.com',")
print(" 'llm_api_key': 'sk-xxxxxxxxxxxx',")
print(" })")
print("\nOption 2 - Edit webmainbench/config.py:")
print(" 1. llm_base_url (e.g. 'https://api.deepseek.com')")
print(" 2. llm_api_key (e.g. 'sk-xxxxxxxxxxxx')")
print("\nOption 3 - Disable LLM: set use_llm = False")
print("=" * 60 + "\n")
sys.exit(1)

# 验证API有效性
try:
from openai import OpenAI

print("正在验证LLM API配置...")
print("Validating LLM API configuration...")
client = OpenAI(
base_url=LLM_CONFIG.get('llm_base_url'),
api_key=LLM_CONFIG.get('llm_api_key')
base_url=config.get('llm_base_url'),
api_key=config.get('llm_api_key')
)

# 发送测试请求
response = client.chat.completions.create(
model=LLM_CONFIG.get('llm_model', 'deepseek-chat'),
client.chat.completions.create(
model=config.get('llm_model', 'deepseek-chat'),
messages=[{"role": "user", "content": "test"}],
max_tokens=5,
temperature=0
)

print("✅ LLM API配置验证成功!\n使用 基础方案➕LLM增强提取效果 进行评测。")
print("✅ LLM API validated. Running evaluation with LLM enhancement.\n")

except Exception as e:
print("\n" + "=" * 60)
print("❌ 错误:LLM API配置无效!")
print("❌ Error: LLM API validation failed!")
print("-" * 60)
print(f"验证失败原因: {str(e)}")
print("\n请检查 webmainbench/config.py 中的配置:")
print(" 1. llm_base_url 是否正确")
print(" 2. llm_api_key 是否有效")
print(" 3. llm_model 是否支持")
print(" 4. 网络连接是否正常")
print("\n或者设置 use_llm = False 来禁用LLM功能。")
print(f"Reason: {str(e)}")
print("\nPlease check:")
print(" 1. llm_base_url is correct")
print(" 2. llm_api_key is valid")
print(" 3. llm_model is supported")
print(" 4. Network connectivity")
print("\nAlternatively, set use_llm = False to disable LLM functionality.")
print("=" * 60 + "\n")
sys.exit(1)
else:
# 未启用LLM的提示
print("\n" + "=" * 60)
print("⚠️ 注意:当前未启用LLM增强提取效果功能")
print(" 如需启用LLM增强提取效果,请在 webmainbench/config.py 中配置:")
print(" - 设置 use_llm = True")
print(" - 填写 llm_base_url")
print(" - 填写 llm_api_key")
print("ℹ️ LLM enhancement is disabled. Running in baseline mode.")
print(" To enable LLM enhancement, pass llm_config to Evaluator:")
print(" Evaluator(llm_config={")
print(" 'use_llm': True,")
print(" 'llm_base_url': '...',")
print(" 'llm_api_key': '...',")
print(" })")
print("=" * 60)
print(" (5秒后使用基础方案进行对比...)")
print(" Continuing in 5 seconds...")
time.sleep(5)
print()

Expand Down Expand Up @@ -289,10 +309,10 @@ def evaluate_batched(self,
all_sample_results = []
all_extraction_errors = []

print(f"🔄 开始批处理评测")
print(f" 数据集: {jsonl_file_path}")
print(f" 批大小: {batch_size}")
print(f" 最大样本数: {max_samples or '无限制'}")
print(f"🔄 Starting batched evaluation")
print(f" Dataset: {jsonl_file_path}")
print(f" Batch size: {batch_size}")
print(f" Max samples: {max_samples if max_samples is not None else 'unlimited'}")

start_time = time.time()

Expand All @@ -311,17 +331,17 @@ def evaluate_batched(self,
processed_samples += len(batch_samples)
total_samples += len(batch_samples)

print(f" 已处理: {processed_samples} 样本")
print(f" Processed: {processed_samples} samples")

# 如果有输出文件,可以立即写入避免内存累积
if output_file and len(all_sample_results) > 1000:
DataSaver.append_intermediate_results(all_sample_results, output_file)
all_sample_results = [] # 清空已保存的结果

end_time = time.time()
print(f"✅ 批处理评测完成")
print(f" 总耗时: {end_time - start_time:.2f}")
print(f" 处理样本: {processed_samples}")
print(f"✅ Batched evaluation finished")
print(f" Elapsed: {end_time - start_time:.2f}s")
print(f" Samples processed: {processed_samples}")

# 聚合结果
overall_metrics = self._aggregate_metrics(all_sample_results)
Expand Down Expand Up @@ -363,7 +383,7 @@ def _process_batch(self, batch_samples: List[DataSample], extractor: BaseExtract
})

except Exception as e:
print(f"⚠️ 样本 {sample.id} 评测失败: {e}")
print(f"⚠️ Sample {sample.id} evaluation failed: {e}")
batch_errors.append({
'sample_id': sample.id,
'error': str(e),
Expand Down
22 changes: 16 additions & 6 deletions webmainbench/metrics/base_content_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,16 @@
from openai import OpenAI


def _metrics_debug(message: str) -> None:
"""Print diagnostics only when METRICS_DEBUG is True (see webmainbench/config.py)."""
try:
from ..config import METRICS_DEBUG
except ImportError:
METRICS_DEBUG = False
if METRICS_DEBUG:
print(f"[DEBUG] {message}")


class BaseContentSplitter(ABC):
"""抽象基类,用于从文本中提取特定类型的内容"""

Expand Down Expand Up @@ -58,7 +68,7 @@ def should_use_llm(self, field_name: str) -> bool:
def enhance_with_llm(self, basic_results: List[str], cache_key: str = None) -> List[str]:
"""使用LLM增强基本提取结果"""
if not basic_results:
print(f"[DEBUG] 输入内容为空,跳过LLM增强")
_metrics_debug("Empty input; skipping LLM enhancement")
return []

# 生成缓存键
Expand All @@ -73,10 +83,10 @@ def enhance_with_llm(self, basic_results: List[str], cache_key: str = None) -> L
try:
with open(cache_file, 'r', encoding='utf-8') as f:
cached_result = json.load(f)
print(f"[DEBUG] 从缓存加载LLM增强结果: {len(cached_result)} ")
_metrics_debug(f"Loaded LLM-enhanced result from cache: {len(cached_result)} items")
return cached_result
except Exception as e:
print(f"[DEBUG] 缓存读取失败: {e}")
_metrics_debug(f"Cache read failed: {e}")

# 实际的LLM增强逻辑
try:
Expand All @@ -86,13 +96,13 @@ def enhance_with_llm(self, basic_results: List[str], cache_key: str = None) -> L
try:
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump(enhanced_results, f, ensure_ascii=False, indent=2)
print(f"[DEBUG] LLM增强结果已缓存到: {cache_file}")
_metrics_debug(f"LLM-enhanced result cached at: {cache_file}")
except Exception as e:
print(f"[DEBUG] 缓存保存失败: {e}")
_metrics_debug(f"Cache write failed: {e}")

return enhanced_results
except Exception as e:
print(f"[DEBUG] LLM增强失败: {type(e).__name__}: {e}")
_metrics_debug(f"LLM enhancement failed: {type(e).__name__}: {e}")
return basic_results

@abstractmethod
Expand Down
4 changes: 2 additions & 2 deletions webmainbench/metrics/code_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re
from typing import List, Dict, Any

from .base_content_splitter import BaseContentSplitter
from .base_content_splitter import BaseContentSplitter, _metrics_debug


class CodeSplitter(BaseContentSplitter):
Expand Down Expand Up @@ -87,5 +87,5 @@ def extract_basic(self, text: str) -> List[str]:

def _llm_enhance(self, basic_results: List[str]) -> List[str]:
    """Enhance code-extraction results with an LLM (not implemented).

    Currently a pass-through: it logs a debug message via the module-level
    ``_metrics_debug`` helper and returns the regex-based results unchanged.

    Args:
        basic_results: Code snippets produced by the basic extraction pass.

    Returns:
        The input list, unmodified.
    """
    _metrics_debug("Code LLM enhancement not implemented; returning raw results")
    return basic_results
return basic_results
10 changes: 5 additions & 5 deletions webmainbench/metrics/formula_extractor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from typing import List
from .base_content_splitter import BaseContentSplitter
from .base_content_splitter import BaseContentSplitter, _metrics_debug


class FormulaSplitter(BaseContentSplitter):
Expand Down Expand Up @@ -50,13 +50,13 @@ def extract(self, text: str, field_name: str = None) -> str:
"""提取数学公式"""
regex_formulas = self.extract_basic(text)
if self.should_use_llm(field_name):
print(f"[DEBUG] 使用LLM增强公式提取")
_metrics_debug("Using LLM-enhanced formula extraction")
formula_parts = self.enhance_with_llm(regex_formulas)
if not formula_parts:
print("[DEBUG] LLM增强后无有效公式")
_metrics_debug("No valid formulas after LLM enhancement")
else:
formula_parts = regex_formulas
print("[DEBUG] 跳过LLM增强,使用基础正则结果")
_metrics_debug("Skipping LLM enhancement; using regex-only results")
return '\n'.join(formula_parts)

def extract_basic(self, text: str) -> List[str]:
Expand Down Expand Up @@ -89,7 +89,7 @@ def extract_basic(self, text: str) -> List[str]:
def _llm_enhance(self, basic_results: List[str]) -> List[str]:
"""使用LLM增强公式提取结果"""
if not self.client:
print("[DEBUG] OpenAI客户端未初始化,返回基础提取结果")
_metrics_debug("OpenAI client not initialized; returning basic extraction results")
return basic_results

formulas_text = '\n'.join(basic_results)
Expand Down
Loading
Loading