diff --git a/README.md b/README.md index 9fbeaaec..4269b211 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ def extract(url:str, html:str) -> str: try: nlp_md = extract_content_from_html_with_magic_html(url, html) # or mm_nlp_md = extract_content_from_html_with_magic_html(url, html, 'mm_md') + # or nlp_md = extract_content_from_html_with_magic_html(url, html, language='zh') return nlp_md except Exception as e: logger.exception(e) @@ -113,6 +114,7 @@ def extract(url:str, html:str) -> str: try: nlp_md = extract_content_from_main_html(url, html) # or mm_nlp_md = extract_content_from_main_html(url, html, 'mm_md') + # or nlp_md = extract_content_from_main_html(url, html, language='zh') return nlp_md except Exception as e: logger.exception(e) diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index 616d72e9..5259cea3 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py +++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -151,7 +151,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab length_tail = len(element.tail.strip()) idd = element.get('id') tag = element.tag - layer_nodes = element_dict[depth] + layer_nodes = element_dict.get(depth, {}) class_tag = element.get('class') ori_keyy = (tag, class_tag, idd) if idd and idd.strip(): diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py index 263f9a41..e8444636 100644 --- a/llm_web_kit/simple.py +++ b/llm_web_kit/simple.py @@ -53,7 +53,7 @@ def get_extractor(pipe_tpl_name: str): return ExtractorFactory._extractors[pipe_tpl_name] -def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson: +def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'en') -> DataJson: """内部使用的统一HTML提取方法,返回处理后的DataJson对象. Args: @@ -70,6 +70,7 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson: - PipeTpl.MAGIC_HTML_NOCLIP: magic_html + markdown转换 - PipeTpl.LLM_NOCLIP: LLM + markdown转换 - PipeTpl.LAYOUT_BATCH_NOCLIP: layout_batch + markdown转换 + language: 语言,可选:'en' 或 'zh' Returns: DataJson: 处理后的DataJson对象,包含main_html和content_list等信息 @@ -83,6 +84,7 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson: 'dataset_name': f'llm-web-kit-{pipe_tpl}', 'data_source_category': 'HTML', 'file_bytes': len(html_content), + 'language': language, 'meta_info': {'input_datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')} } @@ -94,33 +96,35 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson: # SDK方法(三种使用场景) # ======================================== -def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML) -> str: +def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML, language: str = 'en') -> str: """场景1: 只执行第一阶段,抽取main_html. Args: url: 网页URL html_content: 原始HTML内容 parser_type: 解析器类型,可选:PipeTpl.MAGIC_HTML, PipeTpl.LLM, PipeTpl.LAYOUT_BATCH + language: 语言,可选:'en' 或 'zh' Returns: str: 提取的主要HTML内容 """ - result = _extract_html(url, html_content, parser_type) + result = _extract_html(url, html_content, parser_type, language) return result.get('main_html', '') -def extract_content_from_main_html(url: str, main_html: str, output_format: str = 'md') -> str: +def extract_content_from_main_html(url: str, main_html: str, output_format: str = 'md', language: str = 'en') -> str: """场景2: 只执行第二阶段,从main_html抽取结构化内容. Args: url: 网页URL main_html: 已经抽取的主要HTML内容 output_format: 输出格式,'md' 或 'mm_md' + language: 语言,可选:'en' 或 'zh' Returns: str: 结构化的内容(markdown格式) """ - result = _extract_html(url, main_html, PipeTpl.NOCLIP) + result = _extract_html(url, main_html, PipeTpl.NOCLIP, language) content_list = result.get_content_list() if output_format == 'md': @@ -133,18 +137,19 @@ def extract_content_from_main_html(url: str, main_html: str, output_format: str raise InvalidOutputFormatException(f'Invalid output format: {output_format}') -def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md') -> str: +def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str: """场景3: 执行两个阶段,从magic_html抽取main_html,再从main_html抽取结构化内容. Args: url: 网页URL html_content: 原始HTML内容 output_format: 输出格式,'md' 或 'mm_md' + language: 语言,可选:'en' 或 'zh' Returns: str: 结构化的内容(markdown格式) """ - result = _extract_html(url, html_content, PipeTpl.MAGIC_HTML_NOCLIP) + result = _extract_html(url, html_content, PipeTpl.MAGIC_HTML_NOCLIP, language) content_list = result.get_content_list() if output_format == 'md': @@ -157,18 +162,19 @@ def extract_content_from_html_with_magic_html(url: str, html_content: str, outpu raise InvalidOutputFormatException(f'Invalid output format: {output_format}') -def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md') -> str: +def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str: """场景3: 执行两个阶段,从llm抽取main_html,再从main_html抽取结构化内容. Args: url: 网页URL html_content: 原始HTML内容 output_format: 输出格式,'md' 或 'mm_md' + language: 语言,可选:'en' 或 'zh' Returns: str: 结构化的内容(markdown格式) """ - result = _extract_html(url, html_content, PipeTpl.LLM_NOCLIP) + result = _extract_html(url, html_content, PipeTpl.LLM_NOCLIP, language) content_list = result.get_content_list() if output_format == 'md': @@ -181,18 +187,19 @@ def extract_content_from_html_with_llm(url: str, html_content: str, output_forma raise InvalidOutputFormatException(f'Invalid output format: {output_format}') -def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md') -> str: +def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str: """场景3: 执行两个阶段,从layout_batch抽取main_html,再从main_html抽取结构化内容. Args: url: 网页URL html_content: 原始HTML内容 output_format: 输出格式,'md' 或 'mm_md' + language: 语言,可选:'en' 或 'zh' Returns: str: 结构化的内容(markdown格式) """ - result = _extract_html(url, html_content, PipeTpl.LAYOUT_BATCH_NOCLIP) + result = _extract_html(url, html_content, PipeTpl.LAYOUT_BATCH_NOCLIP, language) content_list = result.get_content_list() if output_format == 'md': diff --git a/tests/llm_web_kit/simple/test_simple.py b/tests/llm_web_kit/simple/test_simple.py index 3d57fc16..f16b17a9 100644 --- a/tests/llm_web_kit/simple/test_simple.py +++ b/tests/llm_web_kit/simple/test_simple.py @@ -168,17 +168,17 @@ def test_extract_noclip_mm_md(self): def test_extract_main_html_only_default(self): """测试对外方法:extract_main_html_only默认使用MAGIC_HTML.""" - result = extract_main_html_only(self.url, self.html_content) + result = extract_main_html_only(self.url, self.html_content, language='en') self.assertEqual(result, self.main_html) def test_extract_content_from_main_html_default(self): """测试对外方法:extract_content_from_main_html默认md格式.""" - result = extract_content_from_main_html(self.url, self.main_html) + result = extract_content_from_main_html(self.url, self.main_html, language='en') self.assertEqual(result, self.expected_md) def test_extract_content_from_main_html_mm_md(self): """测试对外方法:extract_content_from_main_html输出mm_md格式.""" - result = extract_content_from_main_html(self.url, self.main_html, 'mm_md') + result = extract_content_from_main_html(self.url, self.main_html, 'mm_md', language='en') self.assertEqual(result, self.expected_mm_md) def test_extract_content_from_html_with_llm(self): @@ -215,12 +215,12 @@ def test_invalid_output_format_exception(self): def test_output_format_md(self): """测试md输出格式.""" - result = extract_content_from_main_html(self.url, self.main_html, 'md') + result = extract_content_from_main_html(self.url, self.main_html, 'md', language='en') self.assertEqual(result, self.expected_md) def test_output_format_mm_md(self): """测试mm_md输出格式.""" - result = extract_content_from_main_html(self.url, self.main_html, 'mm_md') + result = extract_content_from_main_html(self.url, self.main_html, 'mm_md', language='en') self.assertEqual(result, self.expected_mm_md) def test_output_format_json(self): @@ -230,6 +230,19 @@ def test_output_format_json(self): # JSON输出应该包含JSON结构 self.assertTrue(result.startswith('{') or result.startswith('[')) + def test_language_option_affects_spacing_en(self): + """测试 language='en' 时英文片段合并插入空格.""" + html_content = '

HelloWorld

' + md = extract_content_from_main_html(self.url, html_content, language='en') + self.assertIn('Hello World', md) + self.assertNotIn('HelloWorld', md) + + def test_language_option_affects_spacing_zh(self): + """测试 language='zh' 时英文片段合并不插入空格(与中文等无分词语言一致策略)。""" + html_content = '

HelloWorld

' + md = extract_content_from_main_html(self.url, html_content, language='zh') + self.assertIn('HelloWorld', md) + # ======================================== # 测试ExtractorFactory缓存机制和线程安全 # ======================================== @@ -524,17 +537,17 @@ def test_extract_word_press(self): def test_full_pipeline_integration(self): """测试完整的两阶段流水线.""" # 第一阶段:提取main_html - main_html = extract_main_html_only(self.url, self.real_html_content) + main_html = extract_main_html_only(self.url, self.real_html_content, language='en') self.assertIsInstance(main_html, str) self.assertTrue(len(main_html) > 0) # 第二阶段:从main_html提取内容 - markdown = extract_content_from_main_html(self.url, main_html) + markdown = extract_content_from_main_html(self.url, main_html, language='en') self.assertIsInstance(markdown, str) self.assertTrue(len(markdown) > 0) # 对比直接两阶段调用的结果 - direct_result = extract_content_from_html_with_magic_html(self.url, self.real_html_content) + direct_result = extract_content_from_html_with_magic_html(self.url, self.real_html_content, language='en') self.assertEqual(markdown, direct_result, "分步骤处理和直接处理的结果应该一致") def test_multiple_calls_consistency(self):