ccprocessor · e06084 · Sep 12, 2025 · Sep 12, 2025 · Sep 12, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py
@@ -267,10 +267,10 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
             return text
 
         if final := __get_paragraph_text_recusive(root, ''):
-            para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT})
+            para_text.append({'c': final, 't': ParagraphTextType.TEXT})
 
         for item in para_text:
-            item['c'] = restore_sub_sup_from_text_regex(item['c'])
+            item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR)
         return para_text
 
     def __extract_paragraphs(self, root: HtmlElement):

diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
@@ -519,7 +519,6 @@ def __join_one_para(self, para: list, exclude_inline_types: list = []) -> str:
                 c = el['c']
                 if not c or not c.strip():
                     continue
-                c = c.strip()
                 new_c = self.__escape_md_special_chars(c)  # 转义特殊字符
                 one_para.append(new_c)
             elif el['t'] == ParagraphTextType.EQUATION_INLINE: