Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llm_web_kit/extractor/html/recognizer/constant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Escaped Windows line break: the four literal characters backslash, 'r',
# backslash, 'n' -- not an actual carriage-return/line-feed pair.
LINE_BREAK_WINDOWS = '\\r\\n'
# Escaped Unix line break: the two literal characters backslash and 'n',
# not an actual line-feed character.
LINE_BREAK_UNIX = '\\n'
# Two real newline characters (unlike the escaped sequences above).
PARAGRAPH_SEPARATOR = '\n\n'
8 changes: 5 additions & 3 deletions llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
from llm_web_kit.libs.html_utils import element_to_html

from .constant import LINE_BREAK_UNIX, LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR

special_symbols = [ # TODO 从文件读取
'®', # 注册商标符号
'™', # 商标符号
Expand Down Expand Up @@ -112,11 +114,11 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
text2 = text2.strip(' ') if text2 else ''
if lang == 'zh':
txt = text1 + text2
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
return txt.strip().replace(LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR).replace(LINE_BREAK_UNIX, PARAGRAPH_SEPARATOR)
else:
words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols else ' '
txt = text1 + words_sep + text2
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
return txt.strip().replace(LINE_BREAK_WINDOWS, PARAGRAPH_SEPARATOR).replace(LINE_BREAK_UNIX, PARAGRAPH_SEPARATOR)

def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
"""
Expand Down Expand Up @@ -145,7 +147,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
text = ''
para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE})
elif el.tag in ['br']:
text += '\n'
text += PARAGRAPH_SEPARATOR
else:
if el.text and el.text.strip():
text = self.__combine_text(text, el.text.strip())
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

3,913 changes: 2,667 additions & 1,246 deletions tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text3.html

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

7,559 changes: 172 additions & 7,387 deletions tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/text.html

Large diffs are not rendered by default.

22 changes: 11 additions & 11 deletions tests/llm_web_kit/extractor/html/recognizer/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ def test_text_1(self):
html_content = file.read()
assert self.text_recognize._TextParagraphRecognizer__combine_text('知识乱象\n',
'中共中央政治局召开会议审议《成-2020年10月16日新闻联播',
'zh')[:7] == '知识乱象\n中共'
'zh') == '知识乱象\n中共中央政治局召开会议审议《成-2020年10月16日新闻联播'
result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content)
assert result[909][0][1413:1422] == '知识乱象\\n 中共'
assert '知识乱象\\n\\n 中共中央政治局' in result[908][0]

def test_text_2(self):
"""
Expand All @@ -51,7 +51,7 @@ def test_text_2(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert content_md[:130] == '''For Swivel Hand Rivet Squeezer or any snap Type .187 Shank Diameter Squeezer\n \n\n Instructions for Selecting Rivet Sets:\n\nTo develo'''
assert 'Selecting Rivet Sets:\n\n To develop maximum power' in content_md

def test_text_3(self):
"""
Expand All @@ -73,7 +73,7 @@ def test_text_3(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert content_md[371:584] == '''2.\n The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the\n material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The\n attempt at a solution\n\n1. di=22.22\n\n\n\n2. Dont know'''
assert "1. The problem statement, all variables and given/known data\n\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n\n\n----------\n\n\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The attempt at a solution\n\n 1. di=22.22\n\n\n\n 2. Dont know" in content_md

def test_text_4(self):
"""
Expand All @@ -95,7 +95,7 @@ def test_text_4(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert content_md[46:475] == '''1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n\n\nSee attachment\n\n\n\n 3. The attempt at a solution\n\nI solved the problem (on the same page as problem, written in pencil) but the direction of the acceleration that I calculated is different, I dont understand why my answer is wrong if the normal acceleration always towards the center and the tangent acceleration is suppossed to be clockwise.'''
assert '1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n\n\n See attachment\n\n\n\n 3. The attempt at a solution\n\n I solved the problem' in content_md

def test_text_5(self):
"""
Expand All @@ -117,7 +117,7 @@ def test_text_5(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert content_md[1214:1449] == '''Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.'''
assert 'Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md

def test_text_6(self):
"""
Expand All @@ -139,7 +139,7 @@ def test_text_6(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert content_md[255:450] == '''1813 года\n\n5864.\\t Лабиринт волшебства, или удивительные приключения восточных принцев, сочинение В. Протопоповича; Москва, 1786 г. - в 8°. \n\n\n\n\n\n 5865.\\t Лакировальщик, или ясное и подробное нас'''
assert '1813 года\n\n5864. Лабиринт волшебства, или удивительные приключения восточных принцев, сочинение В. Протопоповича; Москва, 1786 г. - в 8°.\n\n\n\n 5865. Лакировальщик' in content_md

def test_text_7(self):
"""
Expand All @@ -151,7 +151,7 @@ def test_text_7(self):
with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text7.html', 'r') as file:
html_content = file.read()
result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content)
assert '1) A man takes 5 hrs and 45 mins to walk to a certain place and ride back' in result[0][0] and BaseHTMLElementRecognizer.is_cc_html(result[0][0])
assert '1) A man takes 5 hrs and 45 mins to walk to a certain place and ride back' in result[51][0] and BaseHTMLElementRecognizer.is_cc_html(result[51][0])

def test_text_8(self):
"""
Expand All @@ -163,7 +163,7 @@ def test_text_8(self):
with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html', 'r') as file:
html_content = file.read()
result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content)
assert "40xy' -ln(x^8) = 0\\n\\n\\nInitial Condition: y(1)=31" in result[0][0] and BaseHTMLElementRecognizer.is_cc_html(result[0][0])
assert "40xy\' -ln(x^8) = 0\\n\\n\\n\\n Initial Condition: y(1)=31\\n\\n\\n\\n Work:" in result[54][0] and BaseHTMLElementRecognizer.is_cc_html(result[54][0])

def test_text_9(self):
"""
Expand All @@ -175,7 +175,7 @@ def test_text_9(self):
with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html', 'r') as file:
html_content = file.read()
result = self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content)
assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in result[50][0] and BaseHTMLElementRecognizer.is_cc_html(result[50][0])
assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in result[50][0] and BaseHTMLElementRecognizer.is_cc_html(result[50][0])

def test_text_10(self):
"""
Expand All @@ -197,4 +197,4 @@ def test_text_10(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert content_md[306:450] == '''So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![\\":smile:\\"]( "\\"Smile")\n\n)\n\n\n\n1)\n\nIn the book, Michio Kaku says the '''
assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:]( "Smile :smile:")\n\n)\n\n\n\n 1)\n\n In the book' in content_md