ccprocessor · yogacc33 · Mar 20, 2025 · Mar 20, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/constant.py b/llm_web_kit/extractor/html/recognizer/constant.py
diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py
@@ -11,8 +11,6 @@
 from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
 from llm_web_kit.libs.html_utils import element_to_html
 
-from .constant import LINE_BREAK_UNIX, LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR
-
 special_symbols = [  # TODO 从文件读取
     '®',  # 注册商标符号
     '™',  # 商标符号
@@ -114,11 +112,11 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
         text2 = text2.strip(' ') if text2 else ''
         if lang == 'zh':
             txt = text1 + text2
-            return txt.strip().replace(LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR).replace(LINE_BREAK_UNIX, PARAGRAPH_SEPARATOR)
+            return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
         else:
             words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols else ' '
             txt = text1 + words_sep + text2
-            return txt.strip().replace(LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR).replace(LINE_BREAK_UNIX, PARAGRAPH_SEPARATOR)
+            return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
 
     def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
         """
@@ -147,7 +145,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
                     text = ''
                 para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE})
             elif el.tag in ['br']:
-                text += PARAGRAPH_SEPARATOR
+                text += '\n'
             else:
                 if el.text and el.text.strip():
                     text = self.__combine_text(text, el.text.strip())