ccprocessor · e06084 · Sep 10, 2025
diff --git a/README.md b/README.md
@@ -103,14 +103,14 @@ def extract(url:str, html:str) -> str:
 
 if __name__=="__main__":
     url = ""
-    html = '''<html><body>
+    raw_html = '''<html><body>
     <div class="options-div-0-0 option-box__items" style="display: none;">
         <span class="bedroom-rate__title">Room Only Rate</span>
         <span class="bedroom-rate__price">£1,230.00</span>
     </div>
     <p>正常内容</p>
     </body></html>'''
-    markdown = extract(url, html)
+    markdown = extract(url, raw_html)
     print(markdown)
 ```
 
@@ -120,26 +120,38 @@ if __name__=="__main__":
 from llm_web_kit.simple import extract_content_from_main_html
 from loguru import logger
 
-def extract(url:str, html:str) -> str:
+def extract(url:str, raw_html: str, main_html:str) -> str:
     try:
-        nlp_md = extract_content_from_main_html(url, html)
-        # or mm_nlp_md = extract_content_from_main_html(url, html, 'mm_md')
-        # or nlp_md = extract_content_from_main_html(url, html, language='zh')
+        nlp_md = extract_content_from_main_html(url = url, raw_html = raw_html, main_html = main_html)
+        # or mm_nlp_md = extract_content_from_main_html(url = url, raw_html = raw_html, main_html = main_html, output_format = 'mm_md')
+        # or nlp_md = extract_content_from_main_html(url = url, raw_html = raw_html, main_html = main_html, language='zh')
         return nlp_md
     except Exception as e:
         logger.exception(e)
     return None
 
 if __name__=="__main__":
     url = ""
-    html = '''<html><body>
+    raw_html = '''<html>
+    <meta charset="utf-8"><meta content="IE=edge" http-equiv="X-UA-Compatible"><meta content="width=device-width,initial-scale=1,shrink-to-fit=no" name="viewport">
+    <script>MathJax={tex:{inlineMath:[["$","$"],["\\(","\\)"]],processEscapes:!0},svg:{fontCache:"global"}}</script><script async="" id="MathJax-script" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-svg.js" type="text/javascript"></script>
+    <body>
+    <div class="options-div-0-0 option-box__items" style="display: none;">
+        <span class="bedroom-rate__title">Room Only Rate</span>
+        <span class="bedroom-rate__price">£1,230.00</span>
+    </div>
+    <p cc-select="true" class="mark-selected" data-anno-uid="anno-uid-wygjielbjln" style="">Are the filtrations after these steps. Why only these? Looking at $\mathcal{F}_1$, we can obtain probabilities for the following events:</p>
+    <p>正常内容</p>
+    </body></html>'''
+    main_html = '''<html><body>
     <div class="options-div-0-0 option-box__items" style="display: none;">
         <span class="bedroom-rate__title">Room Only Rate</span>
         <span class="bedroom-rate__price">£1,230.00</span>
     </div>
+    <p cc-select="true" class="mark-selected" data-anno-uid="anno-uid-wygjielbjln" style="">Are the filtrations after these steps. Why only these? Looking at $\mathcal{F}_1$, we can obtain probabilities for the following events:</p>
     <p>正常内容</p>
     </body></html>'''
-    markdown = extract(url, html)
+    markdown = extract(url, raw_html, main_html)
     print(markdown)
 ```
 
@@ -159,14 +171,14 @@ def extract(url:str, html:str) -> str:
 
 if __name__=="__main__":
     url = ""
-    html = '''<html><body>
+    raw_html = '''<html><body>
     <div class="options-div-0-0 option-box__items" style="display: none;">
         <span class="bedroom-rate__title">Room Only Rate</span>
         <span class="bedroom-rate__price">£1,230.00</span>
     </div>
     <p>正常内容</p>
     </body></html>'''
-    main_html = extract(url, html)
+    main_html = extract(url, raw_html)
     print(main_html)
 ```
 

diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py
@@ -53,12 +53,13 @@ def get_extractor(pipe_tpl_name: str):
         return ExtractorFactory._extractors[pipe_tpl_name]
 
 
-def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'en') -> DataJson:
+def _extract_html(url: str, raw_html: str, main_html: str, pipe_tpl: str, language: str = 'en') -> DataJson:
     """内部使用的统一HTML提取方法，返回处理后的DataJson对象.
 
     Args:
         url: 网页URL
-        html_content: 原始HTML内容（或main_html，取决于pipe_tpl）
+        raw_html: 原始HTML内容
+        main_html: 正文对应的main_html
         pipe_tpl: 处理类型，支持：
             # 只执行第一阶段：
             - PipeTpl.MAGIC_HTML: 使用magic_html提取main_html
@@ -80,10 +81,11 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'e
     input_data_dict = {
         'track_id': str(uuid.uuid4()),
         'url': url,
-        'html': html_content,
+        'html': raw_html,
+        'main_html': main_html,
         'dataset_name': f'llm-web-kit-{pipe_tpl}',
         'data_source_category': 'HTML',
-        'file_bytes': len(html_content),
+        'file_bytes': len(raw_html),
         'language': language,
         'meta_info': {'input_datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
     }
@@ -96,35 +98,36 @@ def _extract_html(url: str, html_content: str, pipe_tpl: str, language: str = 'e
 # SDK方法（三种使用场景）
 # ========================================
 
-def extract_main_html_only(url: str, html_content: str, parser_type: str = PipeTpl.MAGIC_HTML, language: str = 'en') -> str:
+def extract_main_html_only(url: str, raw_html: str, pipe_tpl: str = PipeTpl.MAGIC_HTML, language: str = 'en') -> str:
     """场景1: 只执行第一阶段，抽取main_html.
 
     Args:
         url: 网页URL
-        html_content: 原始HTML内容
-        parser_type: 解析器类型，可选：PipeTpl.MAGIC_HTML, PipeTpl.LLM, PipeTpl.LAYOUT_BATCH
+        raw_html: 原始HTML内容
+        pipe_tpl: 解析器类型，可选：PipeTpl.MAGIC_HTML, PipeTpl.LLM, PipeTpl.LAYOUT_BATCH
         language: 语言，可选：'en' 或 'zh'
 
     Returns:
         str: 提取的主要HTML内容
     """
-    result = _extract_html(url, html_content, parser_type, language)
+    result = _extract_html(url=url, raw_html=raw_html, main_html='', pipe_tpl=pipe_tpl, language=language)
     return result.get('main_html', '')
 
 
-def extract_content_from_main_html(url: str, main_html: str, output_format: str = 'md', language: str = 'en') -> str:
+def extract_content_from_main_html(url: str, raw_html: str, main_html: str, output_format: str = 'md', language: str = 'en') -> str:
     """场景2: 只执行第二阶段，从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
-        main_html: 已经抽取的主要HTML内容
+        raw_html: 原始HTML内容
+        main_html: 正文对应的main_html
         output_format: 输出格式，'md' 或 'mm_md'
         language: 语言，可选：'en' 或 'zh'
 
     Returns:
         str: 结构化的内容（markdown格式）
     """
-    result = _extract_html(url, main_html, PipeTpl.NOCLIP, language)
+    result = _extract_html(url=url, raw_html=raw_html, main_html=main_html, pipe_tpl=PipeTpl.NOCLIP, language=language)
     content_list = result.get_content_list()
 
     if output_format == 'md':
@@ -137,19 +140,19 @@ def extract_content_from_main_html(url: str, main_html: str, output_format: str
         raise InvalidOutputFormatException(f'Invalid output format: {output_format}')
 
 
-def extract_content_from_html_with_magic_html(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
+def extract_content_from_html_with_magic_html(url: str, raw_html: str, output_format: str = 'md', language: str = 'en') -> str:
     """场景3: 执行两个阶段，从magic_html抽取main_html，再从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
-        html_content: 原始HTML内容
+        raw_html: 原始HTML内容
         output_format: 输出格式，'md' 或 'mm_md'
         language: 语言，可选：'en' 或 'zh'
 
     Returns:
         str: 结构化的内容（markdown格式）
     """
-    result = _extract_html(url, html_content, PipeTpl.MAGIC_HTML_NOCLIP, language)
+    result = _extract_html(url=url, raw_html=raw_html, main_html='', pipe_tpl=PipeTpl.MAGIC_HTML_NOCLIP, language=language)
     content_list = result.get_content_list()
 
     if output_format == 'md':
@@ -162,19 +165,19 @@ def extract_content_from_html_with_magic_html(url: str, html_content: str, outpu
         raise InvalidOutputFormatException(f'Invalid output format: {output_format}')
 
 
-def extract_content_from_html_with_llm(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
+def extract_content_from_html_with_llm(url: str, raw_html: str, output_format: str = 'md', language: str = 'en') -> str:
     """场景3: 执行两个阶段，从llm抽取main_html，再从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
-        html_content: 原始HTML内容
+        raw_html: 原始HTML内容
         output_format: 输出格式，'md' 或 'mm_md'
         language: 语言，可选：'en' 或 'zh'
 
     Returns:
         str: 结构化的内容（markdown格式）
     """
-    result = _extract_html(url, html_content, PipeTpl.LLM_NOCLIP, language)
+    result = _extract_html(url=url, raw_html=raw_html, main_html='', pipe_tpl=PipeTpl.LLM_NOCLIP, language=language)
     content_list = result.get_content_list()
 
     if output_format == 'md':
@@ -187,19 +190,19 @@ def extract_content_from_html_with_llm(url: str, html_content: str, output_forma
         raise InvalidOutputFormatException(f'Invalid output format: {output_format}')
 
 
-def extract_content_from_html_with_layout_batch(url: str, html_content: str, output_format: str = 'md', language: str = 'en') -> str:
+def extract_content_from_html_with_layout_batch(url: str, raw_html: str, output_format: str = 'md', language: str = 'en') -> str:
     """场景3: 执行两个阶段，从layout_batch抽取main_html，再从main_html抽取结构化内容.
 
     Args:
         url: 网页URL
-        html_content: 原始HTML内容
+        raw_html: 原始HTML内容
         output_format: 输出格式，'md' 或 'mm_md'
         language: 语言，可选：'en' 或 'zh'
 
     Returns:
         str: 结构化的内容（markdown格式）
     """
-    result = _extract_html(url, html_content, PipeTpl.LAYOUT_BATCH_NOCLIP, language)
+    result = _extract_html(url=url, raw_html=raw_html, main_html='', pipe_tpl=PipeTpl.LAYOUT_BATCH_NOCLIP, language=language)
     content_list = result.get_content_list()
 
     if output_format == 'md':