Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def extract(url:str, html:str) -> str:
try:
nlp_md = extract_content_from_html_with_magic_html(url, html)
# or mm_nlp_md = extract_content_from_html_with_magic_html(url, html, 'mm_md')
# or nlp_md = extract_content_from_html_with_magic_html(url, html, language='zh')
return nlp_md
except Exception as e:
logger.exception(e)
Expand Down Expand Up @@ -113,6 +114,7 @@ def extract(url:str, html:str) -> str:
try:
nlp_md = extract_content_from_main_html(url, html)
# or mm_nlp_md = extract_content_from_main_html(url, html, 'mm_md')
# or nlp_md = extract_content_from_main_html(url, html, language='zh')
return nlp_md
except Exception as e:
logger.exception(e)
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/main_html_parser/parser/layout_batch_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
length_tail = len(element.tail.strip())
idd = element.get('id')
tag = element.tag
layer_nodes = element_dict[depth]
layer_nodes = element_dict.get(depth, {})
class_tag = element.get('class')
ori_keyy = (tag, class_tag, idd)
if idd and idd.strip():
Expand Down
29 changes: 18 additions & 11 deletions llm_web_kit/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def get_extractor(pipe_tpl_name: str):
return ExtractorFactory._extractors[pipe_tpl_name]


def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson:
def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'en') -> DataJson:
"""内部使用的统一HTML提取方法,返回处理后的DataJson对象.

Args:
Expand All @@ -70,6 +70,7 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson:
- PipeTpl.MAGIC_HTML_NOCLIP: magic_html + markdown转换
- PipeTpl.LLM_NOCLIP: LLM + markdown转换
- PipeTpl.LAYOUT_BATCH_NOCLIP: layout_batch + markdown转换
language: 语言,可选:'en' 或 'zh'

Returns:
DataJson: 处理后的DataJson对象,包含main_html和content_list等信息
Expand All @@ -83,6 +84,7 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson:
'dataset_name': f'llm-web-kit-{pipe_tpl}',
'data_source_category': 'HTML',
'file_bytes': len(html_content),
'language': language,
'meta_info': {'input_datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
}

Expand All @@ -94,33 +96,35 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson:
# SDK方法(三种使用场景)
# ========================================

def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML) -> str:
def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML, language: str = 'en') -> str:
"""场景1: 只执行第一阶段,抽取main_html.

Args:
url: 网页URL
html_content: 原始HTML内容
parser_type: 解析器类型,可选:PipeTpl.MAGIC_HTML, PipeTpl.LLM, PipeTpl.LAYOUT_BATCH
language: 语言,可选:'en' 或 'zh'

Returns:
str: 提取的主要HTML内容
"""
result = _extract_html(url, html_content, parser_type)
result = _extract_html(url, html_content, parser_type, language)
return result.get('main_html', '')


def extract_content_from_main_html(url: str, main_html: str, output_format: str = 'md') -> str:
def extract_content_from_main_html(url: str, main_html: str, output_format: str = 'md', language: str = 'en') -> str:
"""场景2: 只执行第二阶段,从main_html抽取结构化内容.

Args:
url: 网页URL
main_html: 已经抽取的主要HTML内容
output_format: 输出格式,'md' 或 'mm_md'
language: 语言,可选:'en' 或 'zh'

Returns:
str: 结构化的内容(markdown格式)
"""
result = _extract_html(url, main_html, PipeTpl.NOCLIP)
result = _extract_html(url, main_html, PipeTpl.NOCLIP, language)
content_list = result.get_content_list()

if output_format == 'md':
Expand All @@ -133,18 +137,19 @@ def extract_content_from_main_html(url: str, main_html: str, output_format: str
raise InvalidOutputFormatException(f'Invalid output format: {output_format}')


def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md') -> str:
def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
"""场景3: 执行两个阶段,从magic_html抽取main_html,再从main_html抽取结构化内容.

Args:
url: 网页URL
html_content: 原始HTML内容
output_format: 输出格式,'md' 或 'mm_md'
language: 语言,可选:'en' 或 'zh'

Returns:
str: 结构化的内容(markdown格式)
"""
result = _extract_html(url, html_content, PipeTpl.MAGIC_HTML_NOCLIP)
result = _extract_html(url, html_content, PipeTpl.MAGIC_HTML_NOCLIP, language)
content_list = result.get_content_list()

if output_format == 'md':
Expand All @@ -157,18 +162,19 @@ def extract_content_from_html_with_magic_html(url: str, html_content: str, outpu
raise InvalidOutputFormatException(f'Invalid output format: {output_format}')


def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md') -> str:
def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
"""场景3: 执行两个阶段,从llm抽取main_html,再从main_html抽取结构化内容.

Args:
url: 网页URL
html_content: 原始HTML内容
output_format: 输出格式,'md' 或 'mm_md'
language: 语言,可选:'en' 或 'zh'

Returns:
str: 结构化的内容(markdown格式)
"""
result = _extract_html(url, html_content, PipeTpl.LLM_NOCLIP)
result = _extract_html(url, html_content, PipeTpl.LLM_NOCLIP, language)
content_list = result.get_content_list()

if output_format == 'md':
Expand All @@ -181,18 +187,19 @@ def extract_content_from_html_with_llm(url: str, html_content: str, output_forma
raise InvalidOutputFormatException(f'Invalid output format: {output_format}')


def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md') -> str:
def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
"""场景3: 执行两个阶段,从layout_batch抽取main_html,再从main_html抽取结构化内容.

Args:
url: 网页URL
html_content: 原始HTML内容
output_format: 输出格式,'md' 或 'mm_md'
language: 语言,可选:'en' 或 'zh'

Returns:
str: 结构化的内容(markdown格式)
"""
result = _extract_html(url, html_content, PipeTpl.LAYOUT_BATCH_NOCLIP)
result = _extract_html(url, html_content, PipeTpl.LAYOUT_BATCH_NOCLIP, language)
content_list = result.get_content_list()

if output_format == 'md':
Expand Down
29 changes: 21 additions & 8 deletions tests/llm_web_kit/simple/test_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,17 +168,17 @@ def test_extract_noclip_mm_md(self):

def test_extract_main_html_only_default(self):
"""测试对外方法:extract_main_html_only默认使用MAGIC_HTML."""
result = extract_main_html_only(self.url, self.html_content)
result = extract_main_html_only(self.url, self.html_content, language='en')
self.assertEqual(result, self.main_html)

def test_extract_content_from_main_html_default(self):
"""测试对外方法:extract_content_from_main_html默认md格式."""
result = extract_content_from_main_html(self.url, self.main_html)
result = extract_content_from_main_html(self.url, self.main_html, language='en')
self.assertEqual(result, self.expected_md)

def test_extract_content_from_main_html_mm_md(self):
"""测试对外方法:extract_content_from_main_html输出mm_md格式."""
result = extract_content_from_main_html(self.url, self.main_html, 'mm_md')
result = extract_content_from_main_html(self.url, self.main_html, 'mm_md', language='en')
self.assertEqual(result, self.expected_mm_md)

def test_extract_content_from_html_with_llm(self):
Expand Down Expand Up @@ -215,12 +215,12 @@ def test_invalid_output_format_exception(self):

def test_output_format_md(self):
"""测试md输出格式."""
result = extract_content_from_main_html(self.url, self.main_html, 'md')
result = extract_content_from_main_html(self.url, self.main_html, 'md', language='en')
self.assertEqual(result, self.expected_md)

def test_output_format_mm_md(self):
"""测试mm_md输出格式."""
result = extract_content_from_main_html(self.url, self.main_html, 'mm_md')
result = extract_content_from_main_html(self.url, self.main_html, 'mm_md', language='en')
self.assertEqual(result, self.expected_mm_md)

def test_output_format_json(self):
Expand All @@ -230,6 +230,19 @@ def test_output_format_json(self):
# JSON输出应该包含JSON结构
self.assertTrue(result.startswith('{') or result.startswith('['))

def test_language_option_affects_spacing_en(self):
    """Check that language='en' joins adjacent English fragments with a space."""
    # Two sibling <span> elements with no whitespace between them in the markup.
    snippet = '<div><body><p><span>Hello</span><span>World</span></p></body></div>'
    rendered = extract_content_from_main_html(self.url, snippet, language='en')
    # The merged text must contain the spaced form and never the fused form.
    self.assertIn('Hello World', rendered)
    self.assertNotIn('HelloWorld', rendered)

def test_language_option_affects_spacing_zh(self):
    """Check that language='zh' joins adjacent English fragments without a space.

    This mirrors the policy applied to languages written without word
    separators, such as Chinese.
    """
    # Same markup as the 'en' case: two sibling <span> elements, no whitespace.
    snippet = '<div><body><p><span>Hello</span><span>World</span></p></body></div>'
    rendered = extract_content_from_main_html(self.url, snippet, language='zh')
    self.assertIn('HelloWorld', rendered)

# ========================================
# 测试ExtractorFactory缓存机制和线程安全
# ========================================
Expand Down Expand Up @@ -524,17 +537,17 @@ def test_extract_word_press(self):
def test_full_pipeline_integration(self):
"""测试完整的两阶段流水线."""
# 第一阶段:提取main_html
main_html = extract_main_html_only(self.url, self.real_html_content)
main_html = extract_main_html_only(self.url, self.real_html_content, language='en')
self.assertIsInstance(main_html, str)
self.assertTrue(len(main_html) > 0)

# 第二阶段:从main_html提取内容
markdown = extract_content_from_main_html(self.url, main_html)
markdown = extract_content_from_main_html(self.url, main_html, language='en')
self.assertIsInstance(markdown, str)
self.assertTrue(len(markdown) > 0)

# 对比直接两阶段调用的结果
direct_result = extract_content_from_html_with_magic_html(self.url, self.real_html_content)
direct_result = extract_content_from_html_with_magic_html(self.url, self.real_html_content, language='en')
self.assertEqual(markdown, direct_result, "分步骤处理和直接处理的结果应该一致")

def test_multiple_calls_consistency(self):
Expand Down