ccprocessor · yogacc33 · Mar 20, 2025 · Mar 20, 2025 · Mar 20, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/constant.py b/llm_web_kit/extractor/html/recognizer/constant.py
@@ -0,0 +1,3 @@
+LINE_BREAK_WINDOWS = '\\r\\n'  # literal 4-character text: backslash, r, backslash, n (an escaped CRLF, not real CR LF bytes)
+LINE_BREAK_UNIX = '\\n'  # literal 2-character text: backslash, n (an escaped LF, not a real newline)
+PARAGRAPH_SEPARATOR = '\n\n'  # two real newline characters
diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py
@@ -11,6 +11,8 @@
 from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
 from llm_web_kit.libs.html_utils import element_to_html
 
+from .constant import LINE_BREAK_UNIX, LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR
+
 special_symbols = [  # TODO 从文件读取
     '®',  # 注册商标符号
     '™',  # 商标符号
@@ -112,11 +114,11 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
         text2 = text2.strip(' ') if text2 else ''
         if lang == 'zh':
             txt = text1 + text2
-            return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
+            return txt.strip().replace(LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR).replace(LINE_BREAK_UNIX, PARAGRAPH_SEPARATOR)
         else:
             words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols else ' '
             txt = text1 + words_sep + text2
-            return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
+            return txt.strip().replace(LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR).replace(LINE_BREAK_UNIX, PARAGRAPH_SEPARATOR)
 
     def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
         """
@@ -145,7 +147,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
                     text = ''
                 para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE})
             elif el.tag in ['br']:
-                text += '\n'
+                text += PARAGRAPH_SEPARATOR
             else:
                 if el.text and el.text.strip():
                     text = self.__combine_text(text, el.text.strip())

diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text10.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text10.html
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text2.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text2.html
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text3.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text3.html
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text4.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text4.html
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text5.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text5.html
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text6.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text6.html
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text7.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text7.html
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text8.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text8.html
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/text.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/text.html
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
@@ -27,9 +27,9 @@ def test_text_1(self):
             html_content = file.read()
         assert self.text_recognize._TextParagraphRecognizer__combine_text('知识乱象\n',
                                                                           '中共中央政治局召开会议审议《成-2020年10月16日新闻联播',
-                                                                          'zh')[:7] == '知识乱象\n中共'
+                                                                          'zh') == '知识乱象\n中共中央政治局召开会议审议《成-2020年10月16日新闻联播'
         result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content)
-        assert result[909][0][1413:1422] == '知识乱象\\n 中共'
+        assert '知识乱象\\n\\n 中共中央政治局' in result[908][0]
 
     def test_text_2(self):
         """
@@ -51,7 +51,7 @@ def test_text_2(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert content_md[:130] == '''For Swivel Hand Rivet Squeezer or any snap Type .187 Shank Diameter Squeezer\n \n\n Instructions for Selecting Rivet Sets:\n\nTo develo'''
+        assert 'Selecting Rivet Sets:\n\n To develop maximum power' in content_md
 
     def test_text_3(self):
         """
@@ -73,7 +73,7 @@ def test_text_3(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert content_md[371:584] == '''2.\n The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the\n material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The\n attempt at a solution\n\n1. di=22.22\n\n\n\n2. Dont know'''
+        assert "1. The problem statement, all variables and given/known data\n\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n\n\n----------\n\n\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The attempt at a solution\n\n 1. di=22.22\n\n\n\n 2. Dont know" in content_md
 
     def test_text_4(self):
         """
@@ -95,7 +95,7 @@ def test_text_4(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert content_md[46:475] == '''1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n\n\nSee attachment\n\n\n\n 3. The attempt at a solution\n\nI solved the problem (on the same page as problem, written in pencil) but the direction of the acceleration that I calculated is different, I dont understand why my answer is wrong if the normal acceleration always towards the center and the tangent acceleration is suppossed to be clockwise.'''
+        assert '1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n\n\n See attachment\n\n\n\n 3. The attempt at a solution\n\n I solved the problem' in content_md
 
     def test_text_5(self):
         """
@@ -117,7 +117,7 @@ def test_text_5(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert content_md[1214:1449] == '''Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.'''
+        assert 'Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md
 
     def test_text_6(self):
         """
@@ -139,7 +139,7 @@ def test_text_6(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert content_md[255:450] == '''1813 года\n\n5864.\\t Лабиринт волшебства, или удивительные приключения восточных принцев, сочинение В. Протопоповича; Москва, 1786 г. - в 8°. \n\n\n\n\n\n 5865.\\t Лакировальщик, или ясное и подробное нас'''
+        assert '1813 года\n\n5864. Лабиринт волшебства, или удивительные приключения восточных принцев, сочинение В. Протопоповича; Москва, 1786 г. - в 8°.\n\n\n\n 5865. Лакировальщик' in content_md
 
     def test_text_7(self):
         """
@@ -151,7 +151,7 @@ def test_text_7(self):
         with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text7.html', 'r') as file:
             html_content = file.read()
         result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content)
-        assert '1) A man takes 5 hrs and 45 mins to walk to a certain place and ride back' in result[0][0] and BaseHTMLElementRecognizer.is_cc_html(result[0][0])
+        assert '1) A man takes 5 hrs and 45 mins to walk to a certain place and ride back' in result[51][0] and BaseHTMLElementRecognizer.is_cc_html(result[51][0])
 
     def test_text_8(self):
         """
@@ -163,7 +163,7 @@ def test_text_8(self):
         with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html', 'r') as file:
             html_content = file.read()
         result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content)
-        assert "40xy' -ln(x^8) = 0\\n\\n\\nInitial Condition: y(1)=31" in result[0][0] and BaseHTMLElementRecognizer.is_cc_html(result[0][0])
+        assert "40xy\' -ln(x^8) = 0\\n\\n\\n\\n Initial Condition: y(1)=31\\n\\n\\n\\n Work:" in result[54][0] and BaseHTMLElementRecognizer.is_cc_html(result[54][0])
 
     def test_text_9(self):
         """
@@ -175,7 +175,7 @@ def test_text_9(self):
         with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html', 'r') as file:
             html_content = file.read()
         result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content)
-        assert '1) Consider the formula f(x)=lim(n--&gt;infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D--&gt;R is continuous.\\n\\n 2) Let f: D--&gt;R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)--&gt;R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in result[50][0] and BaseHTMLElementRecognizer.is_cc_html(result[50][0])
+        assert '1) Consider the formula f(x)=lim(n--&gt;infinity)((x^n)/(1+x^n)).\\n\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D--&gt;R is continuous.\\n\\n\\n\\n 2) Let f: D--&gt;R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)--&gt;R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in result[50][0] and BaseHTMLElementRecognizer.is_cc_html(result[50][0])
 
     def test_text_10(self):
         """
@@ -197,4 +197,4 @@ def test_text_10(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert content_md[306:450] == '''So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![\\":smile:\\"]( "\\"Smile")\n\n)\n\n\n\n1)\n\nIn the book, Michio Kaku says the '''
+        assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:]( "Smile    :smile:")\n\n)\n\n\n\n 1)\n\n In the book' in content_md