Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,12 @@ results/
.coverage*
coverage.xml

webmainbench.egg-info/*
webmainbench.egg-info/*

# PyPI packaging
build/
dist/
*.egg-info/
.eggs/
*.egg
.pypirc
15 changes: 7 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,6 @@ WebMainBench is a specialized benchmark tool for end-to-end evaluation of web ma
```bash
# Basic installation
pip install webmainbench

# Install with all optional dependencies
pip install webmainbench[all]

# Development environment installation
pip install webmainbench[dev]
```

### Basic Usage
Expand All @@ -55,13 +49,18 @@ pip install webmainbench[dev]
from webmainbench import DataLoader, Evaluator, ExtractorFactory

# 1. Load evaluation dataset
dataset = DataLoader.load_jsonl("your_dataset.jsonl")
dataset = DataLoader.load_jsonl("data/WebMainBench_dataset_sample2.jsonl")

# 2. Create extractor
extractor = ExtractorFactory.create("trafilatura")

# 3. Run evaluation
evaluator = Evaluator()
evaluator = Evaluator(llm_config={
"use_llm": True,
"llm_base_url": "",
"llm_api_key": "",
"llm_model": "gpt-5-chat-latest",
})
result = evaluator.evaluate(dataset, extractor)

# 4. View results
Expand Down
4 changes: 2 additions & 2 deletions data/WebMainBench_dataset_sample2.jsonl

Large diffs are not rendered by default.

12 changes: 8 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
name="webmainbench",
version="0.1.0",
author="WebMainBench Team",
author_email="webmainbench@example.com",
author_email="chupei@pjlab.org.cn",
description="A comprehensive benchmark for web main content extraction",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/example/webmainbench",
url="https://github.com/opendatalab/WebMainBench",
packages=find_packages(),
classifiers=[
"Development Status :: 3 - Alpha",
Expand All @@ -32,11 +32,15 @@
],
python_requires=">=3.8",
install_requires=[
"lxml==5.3.0",
"lxml>=5.3.0",
"jsonlines>=3.1.0",
"requests>=2.28.0",
"beautifulsoup4==4.12.0",
"beautifulsoup4>=4.12.0",
"numpy>=1.21.0,<2.0.0", # 避免NumPy 2.x兼容性问题
"rapidfuzz>=3.0.0", # 用于文本编辑距离计算
"apted>=1.0.3", # 用于树编辑距离计算(TEDS)
"jieba>=0.42.0", # 用于中文分词
"rouge>=1.0.0", # 用于 ROUGE 指标
],
extras_require={
"all": [
Expand Down
11 changes: 6 additions & 5 deletions webmainbench/config.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""
全局配置文件
"""
"""Package-wide configuration."""

# LLM配置,用于修正抽取工具的抽取结果
# LLM settings for refinement of extractor outputs
LLM_CONFIG = {
'llm_base_url': '',
'llm_api_key': '',
'llm_model': 'deepseek-chat',
'use_llm': True
'use_llm': True,
}

# When True, print LLM enhancement / cache diagnostics (very noisy).
METRICS_DEBUG = False
112 changes: 66 additions & 46 deletions webmainbench/evaluator/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,81 +78,101 @@ def from_dict(cls, data: Dict[str, Any]) -> "EvaluationResult":
class Evaluator:
"""Main evaluator for web content extraction benchmarks."""

def __init__(self, metric_config: Dict[str, Any] = None):
def __init__(self, metric_config: Dict[str, Any] = None,
llm_config: Dict[str, Any] = None):
"""
Initialize the evaluator.

Args:
metric_config: Configuration for metrics
llm_config: Optional LLM configuration dict to override webmainbench/config.py.
Supported keys:
- use_llm (bool): whether to enable LLM enhancement
- llm_base_url (str): API base URL
- llm_api_key (str): API key
- llm_model (str): model name (default: 'deepseek-chat')
Example:
Evaluator(llm_config={
'use_llm': True,
'llm_base_url': 'https://api.deepseek.com',
'llm_api_key': 'sk-xxxxxxxxxxxx',
'llm_model': 'deepseek-chat',
})
"""

self._validate_llm_config()
self._validate_llm_config(llm_config)

self.metric_calculator = MetricCalculator(metric_config)
self.metric_config = metric_config or {}

def _validate_llm_config(self):
"""验证LLM配置的完整性和有效性"""
def _validate_llm_config(self, llm_config: Dict[str, Any] = None):
"""Validate LLM configuration completeness and API connectivity."""
import time
from ..config import LLM_CONFIG

if LLM_CONFIG.get('use_llm', False):
# 检查配置完整性
if not LLM_CONFIG.get('llm_base_url') or not LLM_CONFIG.get('llm_api_key'):
# External llm_config takes priority over config.py
config = {**LLM_CONFIG, **(llm_config or {})}

if config.get('use_llm', False):
if not config.get('llm_base_url') or not config.get('llm_api_key'):
print("\n" + "=" * 60)
print("❌ 错误:LLM配置不完整!")
print("❌ Error: Incomplete LLM configuration!")
print("-" * 60)
print("当前 use_llm = True,但缺少必要的API配置。")
print("\n请在 webmainbench/config.py 中完成以下配置:")
print(" 1. llm_base_url (例如: 'https://api.deepseek.com')")
print(" 2. llm_api_key (例如: 'sk-xxxxxxxxxxxx')")
print("\n或者设置 use_llm = False 来禁用LLM功能。")
print("'use_llm' is set to True, but required API settings are missing.")
print("\nOption 1 - Pass config directly to Evaluator:")
print(" Evaluator(llm_config={")
print(" 'use_llm': True,")
print(" 'llm_base_url': 'https://api.deepseek.com',")
print(" 'llm_api_key': 'sk-xxxxxxxxxxxx',")
print(" })")
print("\nOption 2 - Edit webmainbench/config.py:")
print(" 1. llm_base_url (e.g. 'https://api.deepseek.com')")
print(" 2. llm_api_key (e.g. 'sk-xxxxxxxxxxxx')")
print("\nOption 3 - Disable LLM: set use_llm = False")
print("=" * 60 + "\n")
sys.exit(1)

# 验证API有效性
try:
from openai import OpenAI

print("正在验证LLM API配置...")
print("Validating LLM API configuration...")
client = OpenAI(
base_url=LLM_CONFIG.get('llm_base_url'),
api_key=LLM_CONFIG.get('llm_api_key')
base_url=config.get('llm_base_url'),
api_key=config.get('llm_api_key')
)

# 发送测试请求
response = client.chat.completions.create(
model=LLM_CONFIG.get('llm_model', 'deepseek-chat'),
client.chat.completions.create(
model=config.get('llm_model', 'deepseek-chat'),
messages=[{"role": "user", "content": "test"}],
max_tokens=5,
temperature=0
)

print("✅ LLM API配置验证成功!\n使用 基础方案➕LLM增强提取效果 进行评测。")
print("✅ LLM API validated. Running evaluation with LLM enhancement.\n")

except Exception as e:
print("\n" + "=" * 60)
print("❌ 错误:LLM API配置无效!")
print("❌ Error: LLM API validation failed!")
print("-" * 60)
print(f"验证失败原因: {str(e)}")
print("\n请检查 webmainbench/config.py 中的配置:")
print(" 1. llm_base_url 是否正确")
print(" 2. llm_api_key 是否有效")
print(" 3. llm_model 是否支持")
print(" 4. 网络连接是否正常")
print("\n或者设置 use_llm = False 来禁用LLM功能。")
print(f"Reason: {str(e)}")
print("\nPlease check:")
print(" 1. llm_base_url is correct")
print(" 2. llm_api_key is valid")
print(" 3. llm_model is supported")
print(" 4. Network connectivity")
print("\nAlternatively, set use_llm = False to disable LLM functionality.")
print("=" * 60 + "\n")
sys.exit(1)
else:
# 未启用LLM的提示
print("\n" + "=" * 60)
print("⚠️ 注意:当前未启用LLM增强提取效果功能")
print(" 如需启用LLM增强提取效果,请在 webmainbench/config.py 中配置:")
print(" - 设置 use_llm = True")
print(" - 填写 llm_base_url")
print(" - 填写 llm_api_key")
print("ℹ️ LLM enhancement is disabled. Running in baseline mode.")
print(" To enable LLM enhancement, pass llm_config to Evaluator:")
print(" Evaluator(llm_config={")
print(" 'use_llm': True,")
print(" 'llm_base_url': '...',")
print(" 'llm_api_key': '...',")
print(" })")
print("=" * 60)
print(" (5秒后使用基础方案进行对比...)")
print(" Continuing in 5 seconds...")
time.sleep(5)
print()

Expand Down Expand Up @@ -289,10 +309,10 @@ def evaluate_batched(self,
all_sample_results = []
all_extraction_errors = []

print(f"🔄 开始批处理评测")
print(f" 数据集: {jsonl_file_path}")
print(f" 批大小: {batch_size}")
print(f" 最大样本数: {max_samples or '无限制'}")
print(f"🔄 Starting batched evaluation")
print(f" Dataset: {jsonl_file_path}")
print(f" Batch size: {batch_size}")
print(f" Max samples: {max_samples if max_samples is not None else 'unlimited'}")

start_time = time.time()

Expand All @@ -311,17 +331,17 @@ def evaluate_batched(self,
processed_samples += len(batch_samples)
total_samples += len(batch_samples)

print(f" 已处理: {processed_samples} 样本")
print(f" Processed: {processed_samples} samples")

# 如果有输出文件,可以立即写入避免内存累积
if output_file and len(all_sample_results) > 1000:
DataSaver.append_intermediate_results(all_sample_results, output_file)
all_sample_results = [] # 清空已保存的结果

end_time = time.time()
print(f"✅ 批处理评测完成")
print(f" 总耗时: {end_time - start_time:.2f}")
print(f" 处理样本: {processed_samples}")
print(f"✅ Batched evaluation finished")
print(f" Elapsed: {end_time - start_time:.2f}s")
print(f" Samples processed: {processed_samples}")

# 聚合结果
overall_metrics = self._aggregate_metrics(all_sample_results)
Expand Down Expand Up @@ -363,7 +383,7 @@ def _process_batch(self, batch_samples: List[DataSample], extractor: BaseExtract
})

except Exception as e:
print(f"⚠️ 样本 {sample.id} 评测失败: {e}")
print(f"⚠️ Sample {sample.id} evaluation failed: {e}")
batch_errors.append({
'sample_id': sample.id,
'error': str(e),
Expand Down
22 changes: 16 additions & 6 deletions webmainbench/metrics/base_content_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,16 @@
from openai import OpenAI


def _metrics_debug(message: str) -> None:
"""Print diagnostics only when METRICS_DEBUG is True (see webmainbench/config.py)."""
try:
from ..config import METRICS_DEBUG
except ImportError:
METRICS_DEBUG = False
if METRICS_DEBUG:
print(f"[DEBUG] {message}")


class BaseContentSplitter(ABC):
"""抽象基类,用于从文本中提取特定类型的内容"""

Expand Down Expand Up @@ -58,7 +68,7 @@ def should_use_llm(self, field_name: str) -> bool:
def enhance_with_llm(self, basic_results: List[str], cache_key: str = None) -> List[str]:
"""使用LLM增强基本提取结果"""
if not basic_results:
print(f"[DEBUG] 输入内容为空,跳过LLM增强")
_metrics_debug("Empty input; skipping LLM enhancement")
return []

# 生成缓存键
Expand All @@ -73,10 +83,10 @@ def enhance_with_llm(self, basic_results: List[str], cache_key: str = None) -> L
try:
with open(cache_file, 'r', encoding='utf-8') as f:
cached_result = json.load(f)
print(f"[DEBUG] 从缓存加载LLM增强结果: {len(cached_result)} ")
_metrics_debug(f"Loaded LLM-enhanced result from cache: {len(cached_result)} items")
return cached_result
except Exception as e:
print(f"[DEBUG] 缓存读取失败: {e}")
_metrics_debug(f"Cache read failed: {e}")

# 实际的LLM增强逻辑
try:
Expand All @@ -86,13 +96,13 @@ def enhance_with_llm(self, basic_results: List[str], cache_key: str = None) -> L
try:
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump(enhanced_results, f, ensure_ascii=False, indent=2)
print(f"[DEBUG] LLM增强结果已缓存到: {cache_file}")
_metrics_debug(f"LLM-enhanced result cached at: {cache_file}")
except Exception as e:
print(f"[DEBUG] 缓存保存失败: {e}")
_metrics_debug(f"Cache write failed: {e}")

return enhanced_results
except Exception as e:
print(f"[DEBUG] LLM增强失败: {type(e).__name__}: {e}")
_metrics_debug(f"LLM enhancement failed: {type(e).__name__}: {e}")
return basic_results

@abstractmethod
Expand Down
4 changes: 2 additions & 2 deletions webmainbench/metrics/code_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re
from typing import List, Dict, Any

from .base_content_splitter import BaseContentSplitter
from .base_content_splitter import BaseContentSplitter, _metrics_debug


class CodeSplitter(BaseContentSplitter):
Expand Down Expand Up @@ -87,5 +87,5 @@ def extract_basic(self, text: str) -> List[str]:

def _llm_enhance(self, basic_results: List[str]) -> List[str]:
    """Enhance code-extraction results with an LLM (not implemented).

    Currently a pass-through: it logs a debug message via the module-level
    ``_metrics_debug`` helper and returns the regex-based results unchanged.

    Args:
        basic_results: Code snippets produced by the basic extraction pass.

    Returns:
        The input list, unmodified.
    """
    _metrics_debug("Code LLM enhancement not implemented; returning raw results")
    return basic_results
return basic_results
10 changes: 5 additions & 5 deletions webmainbench/metrics/formula_extractor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from typing import List
from .base_content_splitter import BaseContentSplitter
from .base_content_splitter import BaseContentSplitter, _metrics_debug


class FormulaSplitter(BaseContentSplitter):
Expand Down Expand Up @@ -50,13 +50,13 @@ def extract(self, text: str, field_name: str = None) -> str:
"""提取数学公式"""
regex_formulas = self.extract_basic(text)
if self.should_use_llm(field_name):
print(f"[DEBUG] 使用LLM增强公式提取")
_metrics_debug("Using LLM-enhanced formula extraction")
formula_parts = self.enhance_with_llm(regex_formulas)
if not formula_parts:
print("[DEBUG] LLM增强后无有效公式")
_metrics_debug("No valid formulas after LLM enhancement")
else:
formula_parts = regex_formulas
print("[DEBUG] 跳过LLM增强,使用基础正则结果")
_metrics_debug("Skipping LLM enhancement; using regex-only results")
return '\n'.join(formula_parts)

def extract_basic(self, text: str) -> List[str]:
Expand Down Expand Up @@ -89,7 +89,7 @@ def extract_basic(self, text: str) -> List[str]:
def _llm_enhance(self, basic_results: List[str]) -> List[str]:
"""使用LLM增强公式提取结果"""
if not self.client:
print("[DEBUG] OpenAI客户端未初始化,返回基础提取结果")
_metrics_debug("OpenAI client not initialized; returning basic extraction results")
return basic_results

formulas_text = '\n'.join(basic_results)
Expand Down
Loading
Loading