Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
917 changes: 458 additions & 459 deletions examples/basic_usage.py

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions examples/demo.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
from webmainbench import DataLoader, Evaluator, ExtractorFactory
from pathlib import Path

# Step 1: read the evaluation samples from the JSONL dataset file.
dataset_path = Path("data/sample_dataset.jsonl")
dataset = DataLoader.load_jsonl(dataset_path)

# Step 2: ask the factory for the extractor named "llm-webkit".
extractor = ExtractorFactory.create("llm-webkit")

# Step 3: evaluate the extractor on the loaded dataset.
evaluator = Evaluator()
result = evaluator.evaluate(dataset, extractor)

# Step 4: print whatever the evaluator returned.
print(f"Overall Score: {result}")
170 changes: 85 additions & 85 deletions examples/llm_webkit_usage.py
Original file line number Diff line number Diff line change
@@ -1,147 +1,147 @@
#!/usr/bin/env python3
"""
LLM-WebKit Extractor使用示例
LLM-WebKit Extractor Usage Example

本示例展示如何使用集成了VLLM推理能力的LLM-WebKit extractor
This example demonstrates how to use the LLM-WebKit extractor integrated with VLLM inference capabilities.
"""

import time
from webmainbench.extractors import ExtractorFactory


def main():
    """Run the LLM-WebKit extractor demo end to end.

    Creates an "llm-webkit" extractor from a hand-written configuration,
    runs it on an inline sample HTML page and prints the extraction result
    (or the reported error) to stdout. Returns early, after printing an
    installation hint, when the extractor cannot be created.
    """
    print("LLM-WebKit Extractor Usage Example\n")

    # 1. Create extractor with custom configuration
    config = {
        "model_path": "/Users/chupei/model/checkpoint-3296",  # Replace with your model path
        "use_logits_processor": True,  # Enable JSON format constraint
        "temperature": 0.0,  # Deterministic output
        "max_item_count": 500,  # Maximum number of items to process
        "max_output_tokens": 4096,  # Maximum output tokens
        "dtype": "bfloat16",  # Model precision
        "tensor_parallel_size": 1,  # Tensor parallel size
    }

    try:
        extractor = ExtractorFactory.create("llm-webkit", config=config)
        print(f"Extractor created successfully: {extractor.description}")
        print(f"Version: {extractor.version}")
        print(f"Config: {extractor.inference_config.__dict__}\n")

    except Exception as e:
        # Top-level demo boundary: report the failure and stop, since nothing
        # below can run without an extractor.
        print(f"Extractor creation failed: {e}")
        print("Please ensure the required dependencies are installed:")
        print(" pip install vllm transformers torch llm_web_kit")
        return

    # 2. Prepare test HTML (structured HTML with _item_id attributes)
    test_html = """
    <html>
    <head>
        <title>Test Article - AI Development Trends</title>
    </head>
    <body>
        <nav _item_id="1">
            <ul>
                <li><a href="/">Home</a></li>
                <li><a href="/news">News</a></li>
                <li><a href="/tech">Tech</a></li>
            </ul>
        </nav>

        <header _item_id="2">
            <h1>AI Development Trends</h1>
            <p class="meta">Author: John Doe | Published: 2024-01-15 | Views: 1,234</p>
        </header>

        <main _item_id="3">
            <article>
                <p>Artificial Intelligence (AI) technology is rapidly advancing, with far-reaching impacts across all industries. This article explores the major development trends and future prospects of AI.</p>

                <h2>1. Advances in Machine Learning</h2>
                <p>Breakthroughs in deep learning and large language models have enabled AI systems to understand and generate more natural language, excelling in dialogue, translation, and creative tasks.</p>

                <h2>2. Automation Applications</h2>
                <p>From robots in manufacturing to code generation in software development, AI is automating processes across domains, improving efficiency and reducing costs.</p>

                <h2>3. Personalized Services</h2>
                <p>Personalized recommendations and services based on user data are becoming increasingly precise, providing better user experiences.</p>
            </article>
        </main>

        <aside _item_id="4">
            <h3>Related Articles</h3>
            <ul>
                <li><a href="/article1">Introduction to Machine Learning</a></li>
                <li><a href="/article2">Deep Learning Application Cases</a></li>
                <li><a href="/article3">AI Ethics and Safety</a></li>
            </ul>
        </aside>

        <footer _item_id="5">
            <p>&copy; 2024 Tech News. All rights reserved.</p>
            <div class="social-links">
                <a href="#">Twitter</a> | <a href="#">LinkedIn</a> | <a href="#">GitHub</a>
            </div>
        </footer>
    </body>
    </html>
    """

    # 3. Execute content extraction
    print("Starting content extraction...")
    start_time = time.time()

    try:
        result = extractor.extract(test_html)
        end_time = time.time()

        print(f"⏱️ Extraction time: {end_time - start_time:.2f}s\n")

        # 4. Display extraction results
        if result.success:
            print("✅ Content extracted successfully!\n")

            print("📄 Extracted main content:")
            print("=" * 50)
            # Only the first 500 characters are shown for long content.
            print(result.content[:500] + "..." if len(result.content) > 500 else result.content)
            print("=" * 50)

            print("\n📊 Extraction statistics:")
            print(f" • Content length: {len(result.content)} characters")
            print(f" • Confidence: {result.confidence_score:.3f}")
            print(f" • Title: {result.title}")
            print(f" • Language: {result.language}")
            print(f" • Extraction time: {result.extraction_time:.3f}s")

            if result.content_list:
                print(f" • Structured content blocks: {len(result.content_list)}")
                # Show first 3 blocks, numbered from 1
                for i, item in enumerate(result.content_list[:3], start=1):
                    print(f" [{i}] {item.get('type', 'unknown')}: {item.get('content', '')[:50]}...")

        else:
            print("❌ Content extraction failed")
            print(f"Error message: {result.error_message}")
            if result.error_traceback:
                print(f"Error details:\n{result.error_traceback}")

    except Exception as e:
        # Top-level demo boundary: report any failure instead of a traceback.
        print(f"❌ Exception during extraction: {e}")

    print("\n🎯 Advanced feature notes:")
    print("• Smart classification: Uses LLM to understand HTML element semantics, accurately distinguishing main content from auxiliary content")
    print("• Format constraint: Uses logits processor to ensure valid JSON output from the LLM")
    print("• Performance optimization: Automatically skips overly complex HTML, supports lazy model loading")
    print("• Detailed feedback: Provides classification results, confidence scores, and performance metrics")


if __name__ == "__main__":
    main()

    # Closing hints for readers adapting this example to their own setup.
    print("\n💡 Usage tips:")
    print("1. Ensure required dependencies are installed: vllm, transformers, torch, llm_web_kit")
    print("2. Set the correct model path")
    print("3. Adjust tensor_parallel_size and dtype based on hardware resources")
    print("4. For large-scale HTML, adjust max_item_count accordingly")
    print("5. Use use_logits_processor=True to ensure reliable output format")
56 changes: 28 additions & 28 deletions examples/magic_html_extract_demo.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,68 @@
import time
from webmainbench.extractors import ExtractorFactory

# Demo script: create a "magic-html" extractor, run it on an inline HTML
# sample and print the extraction result (or the reported error) to stdout.

# Configure MagicHTML extractor (add more configuration as needed)
config = {}
try:
    # Create MagicHTML extractor instance
    extractor = ExtractorFactory.create("magic-html", config=config)
    print(f"✅ Extractor created successfully: {extractor.description}")
    print(f"📋 Version: {extractor.version}")
    print(f"⚙️ Config: {extractor.get_config()}\n")
except Exception as e:
    print(f"❌ Failed to create extractor: {e}")
    # Stop here: without an extractor the extraction step below would only
    # fail with a misleading NameError on `extractor`.
    raise SystemExit(1)

# Test HTML
test_html = """
<html>
<body>
    <h1 cc-select="true">Python Programming Tutorial</h1>
    <p cc-select="true">This is a basic Python tutorial demonstrating how to define functions.</p>
    <pre cc-select="true"><code>def greet(name):
    ""Greeting function""
    return f"Hello, {name}!"

# Usage example
result = greet("World")
print(result)</code></pre>
    <p cc-select="true">This function can be used to greet anyone.</p>
</body>
</html>
"""

print("🔍 Starting content extraction...")
start_time = time.time()

try:
    result = extractor.extract(test_html)
    end_time = time.time()

    print(f"⏱️ Extraction time: {end_time - start_time:.2f}s\n")

    # Display extraction results
    if result.success:
        print("✅ Content extracted successfully!\n")

        print("📄 Extracted main content:")
        print("=" * 50)
        # Only the first 500 characters are shown for long content.
        print(result.content[:500] + "..." if len(result.content) > 500 else result.content)
        print("=" * 50)

        print("\n📊 Extraction statistics:")
        print(f" • Content length: {len(result.content)} characters")
        print(f" • Title: {result.title}")
        print(f" • Language: {result.language}")
        print(f" • Extraction time: {result.extraction_time:.3f}s")

        if result.content_list:
            print(f" • Structured content blocks: {len(result.content_list)}")
            # Show first 3 blocks, numbered from 1
            for i, item in enumerate(result.content_list[:3], start=1):
                print(f" [{i}] {item.get('type', 'unknown')}: {item.get('content', '')[:50]}...")
    else:
        print("❌ Content extraction failed")
        print(f"Error message: {result.error_message}")
        if result.error_traceback:
            print(f"Error details:\n{result.error_traceback}")

except Exception as e:
    # Top-level demo boundary: report any failure instead of a traceback.
    print(f"❌ Exception during extraction: {e}")
Loading
Loading