ccprocessor · e06084 · Aug 25, 2025 · Aug 25, 2025 · Aug 25, 2025
diff --git a/README.md b/README.md
@@ -85,6 +85,7 @@ def extract(url:str, html:str) -> str:
     try:
         nlp_md = extract_content_from_html_with_magic_html(url, html)
         # or mm_nlp_md = extract_content_from_html_with_magic_html(url, html, 'mm_md')
+        # or nlp_md = extract_content_from_html_with_magic_html(url, html, language='zh')
         return nlp_md
     except Exception as e:
         logger.exception(e)
@@ -113,6 +114,7 @@ def extract(url:str, html:str) -> str:
     try:
         nlp_md = extract_content_from_main_html(url, html)
         # or mm_nlp_md = extract_content_from_main_html(url, html, 'mm_md')
+        # or nlp_md = extract_content_from_main_html(url, html, language='zh')
         return nlp_md
     except Exception as e:
         logger.exception(e)

diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py
@@ -151,7 +151,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
             length_tail = len(element.tail.strip())
         idd = element.get('id')
         tag = element.tag
-        layer_nodes = element_dict[depth]
+        layer_nodes = element_dict.get(depth, {})
         class_tag = element.get('class')
         ori_keyy = (tag, class_tag, idd)
         if idd and idd.strip():

diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py
@@ -53,7 +53,7 @@ def get_extractor(pipe_tpl_name: str):
         return ExtractorFactory._extractors[pipe_tpl_name]
 
 
-def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson:
+def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'en') -> DataJson:
     """内部使用的统一HTML提取方法，返回处理后的DataJson对象.
 
     Args:
@@ -70,6 +70,7 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson:
             - PipeTpl.MAGIC_HTML_NOCLIP: magic_html + markdown转换
             - PipeTpl.LLM_NOCLIP: LLM + markdown转换
             - PipeTpl.LAYOUT_BATCH_NOCLIP: layout_batch + markdown转换
+        language: 语言，可选：'en' 或 'zh'
 
     Returns:
         DataJson: 处理后的DataJson对象，包含main_html和content_list等信息
@@ -83,6 +84,7 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson:
         'dataset_name': f'llm-web-kit-{pipe_tpl}',
         'data_source_category': 'HTML',
         'file_bytes': len(html_content),
+        'language': language,
         'meta_info': {'input_datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
     }
 
@@ -94,33 +96,35 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str) -> DataJson:
 # SDK方法（三种使用场景）
 # ========================================
 
-def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML) -> str:
+def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML, language: str = 'en') -> str:
     """场景1: 只执行第一阶段，抽取main_html.
 
     Args:
         url: 网页URL
         html_content: 原始HTML内容
         parser_type: 解析器类型，可选：PipeTpl.MAGIC_HTML, PipeTpl.LLM, PipeTpl.LAYOUT_BATCH
+        language: 语言，可选：'en' 或 'zh'
 
     Returns:
         str: 提取的主要HTML内容
     """
-    result = _extract_html(url, html_content, parser_type)
+    result = _extract_html(url, html_content, parser_type, language)
     return result.get('main_html', '')
 
 
-def extract_content_from_main_html(url: str, main_html: str, output_format: str = 'md') -> str:
+def extract_content_from_main_html(url: str, main_html: str, output_format: str = 'md', language: str = 'en') -> str:
     """场景2: 只执行第二阶段，从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
         main_html: 已经抽取的主要HTML内容
         output_format: 输出格式，'md' 或 'mm_md'
+        language: 语言，可选：'en' 或 'zh'
 
     Returns:
         str: 结构化的内容（markdown格式）
     """
-    result = _extract_html(url, main_html, PipeTpl.NOCLIP)
+    result = _extract_html(url, main_html, PipeTpl.NOCLIP, language)
     content_list = result.get_content_list()
 
     if output_format == 'md':
@@ -133,18 +137,19 @@ def extract_content_from_main_html(url: str, main_html: str, output_format: str
         raise InvalidOutputFormatException(f'Invalid output format: {output_format}')
 
 
-def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md') -> str:
+def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
     """场景3: 执行两个阶段，从magic_html抽取main_html，再从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
         html_content: 原始HTML内容
         output_format: 输出格式，'md' 或 'mm_md'
+        language: 语言，可选：'en' 或 'zh'
 
     Returns:
         str: 结构化的内容（markdown格式）
     """
-    result = _extract_html(url, html_content, PipeTpl.MAGIC_HTML_NOCLIP)
+    result = _extract_html(url, html_content, PipeTpl.MAGIC_HTML_NOCLIP, language)
     content_list = result.get_content_list()
 
     if output_format == 'md':
@@ -157,18 +162,19 @@ def extract_content_from_html_with_magic_html(url: str, html_content: str, outpu
         raise InvalidOutputFormatException(f'Invalid output format: {output_format}')
 
 
-def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md') -> str:
+def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
     """场景3: 执行两个阶段，从llm抽取main_html，再从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
         html_content: 原始HTML内容
         output_format: 输出格式，'md' 或 'mm_md'
+        language: 语言，可选：'en' 或 'zh'
 
     Returns:
         str: 结构化的内容（markdown格式）
     """
-    result = _extract_html(url, html_content, PipeTpl.LLM_NOCLIP)
+    result = _extract_html(url, html_content, PipeTpl.LLM_NOCLIP, language)
     content_list = result.get_content_list()
 
     if output_format == 'md':
@@ -181,18 +187,19 @@ def extract_content_from_html_with_llm(url: str, html_content: str, output_forma
         raise InvalidOutputFormatException(f'Invalid output format: {output_format}')
 
 
-def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md') -> str:
+def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
     """场景3: 执行两个阶段，从layout_batch抽取main_html，再从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
         html_content: 原始HTML内容
         output_format: 输出格式，'md' 或 'mm_md'
+        language: 语言，可选：'en' 或 'zh'
 
     Returns:
         str: 结构化的内容（markdown格式）
     """
-    result = _extract_html(url, html_content, PipeTpl.LAYOUT_BATCH_NOCLIP)
+    result = _extract_html(url, html_content, PipeTpl.LAYOUT_BATCH_NOCLIP, language)
     content_list = result.get_content_list()
 
     if output_format == 'md':

diff --git a/tests/llm_web_kit/simple/test_simple.py b/tests/llm_web_kit/simple/test_simple.py
@@ -168,17 +168,17 @@ def test_extract_noclip_mm_md(self):
 
     def test_extract_main_html_only_default(self):
         """测试对外方法：extract_main_html_only默认使用MAGIC_HTML."""
-        result = extract_main_html_only(self.url, self.html_content)
+        result = extract_main_html_only(self.url, self.html_content, language='en')
         self.assertEqual(result, self.main_html)
 
     def test_extract_content_from_main_html_default(self):
         """测试对外方法：extract_content_from_main_html默认md格式."""
-        result = extract_content_from_main_html(self.url, self.main_html)
+        result = extract_content_from_main_html(self.url, self.main_html, language='en')
         self.assertEqual(result, self.expected_md)
 
     def test_extract_content_from_main_html_mm_md(self):
         """测试对外方法：extract_content_from_main_html输出mm_md格式."""
-        result = extract_content_from_main_html(self.url, self.main_html, 'mm_md')
+        result = extract_content_from_main_html(self.url, self.main_html, 'mm_md', language='en')
         self.assertEqual(result, self.expected_mm_md)
 
     def test_extract_content_from_html_with_llm(self):
@@ -215,12 +215,12 @@ def test_invalid_output_format_exception(self):
 
     def test_output_format_md(self):
         """测试md输出格式."""
-        result = extract_content_from_main_html(self.url, self.main_html, 'md')
+        result = extract_content_from_main_html(self.url, self.main_html, 'md', language='en')
         self.assertEqual(result, self.expected_md)
 
     def test_output_format_mm_md(self):
         """测试mm_md输出格式."""
-        result = extract_content_from_main_html(self.url, self.main_html, 'mm_md')
+        result = extract_content_from_main_html(self.url, self.main_html, 'mm_md', language='en')
         self.assertEqual(result, self.expected_mm_md)
 
     def test_output_format_json(self):
@@ -230,6 +230,19 @@ def test_output_format_json(self):
         # JSON输出应该包含JSON结构
         self.assertTrue(result.startswith('{') or result.startswith('['))
 
+    def test_language_option_affects_spacing_en(self):
+        """测试 language='en' 时英文片段合并插入空格."""
+        html_content = '<div><body><p><span>Hello</span><span>World</span></p></body></div>'
+        md = extract_content_from_main_html(self.url, html_content, language='en')
+        self.assertIn('Hello World', md)
+        self.assertNotIn('HelloWorld', md)
+
+    def test_language_option_affects_spacing_zh(self):
+        """测试 language='zh' 时英文片段合并不插入空格（与中文等无分词语言一致策略）."""
+        html_content = '<div><body><p><span>Hello</span><span>World</span></p></body></div>'
+        md = extract_content_from_main_html(self.url, html_content, language='zh')
+        self.assertIn('HelloWorld', md)
+
     # ========================================
     # 测试ExtractorFactory缓存机制和线程安全
     # ========================================
@@ -524,17 +537,17 @@ def test_extract_word_press(self):
     def test_full_pipeline_integration(self):
         """测试完整的两阶段流水线."""
         # 第一阶段：提取main_html
-        main_html = extract_main_html_only(self.url, self.real_html_content)
+        main_html = extract_main_html_only(self.url, self.real_html_content, language='en')
         self.assertIsInstance(main_html, str)
         self.assertTrue(len(main_html) > 0)
 
         # 第二阶段：从main_html提取内容
-        markdown = extract_content_from_main_html(self.url, main_html)
+        markdown = extract_content_from_main_html(self.url, main_html, language='en')
         self.assertIsInstance(markdown, str)
         self.assertTrue(len(markdown) > 0)
 
         # 对比直接两阶段调用的结果
-        direct_result = extract_content_from_html_with_magic_html(self.url, self.real_html_content)
+        direct_result = extract_content_from_html_with_magic_html(self.url, self.real_html_content, language='en')
         self.assertEqual(markdown, direct_result, "分步骤处理和直接处理的结果应该一致")
 
     def test_multiple_calls_consistency(self):