Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,15 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
text2: str: 第二段文本
lang: str: 语言 TODO 实现根据语言连接文本的不同方式, 还有就是一些特殊符号开头的连接不加空格。
"""
text1 = text1.strip() if text1 else ''
text2 = text2.strip() if text2 else ''
text1 = text1.strip(' ') if text1 else ''
text2 = text2.strip(' ') if text2 else ''
if lang == 'zh':
return text1.strip() + text2.strip()
txt = text1 + text2
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
else:
words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols else ' '
txt = text1 + words_sep + text2
return txt.strip()
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')

def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
"""
Expand All @@ -140,13 +141,15 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
para_text.append({'c':el.text, 't':ParagraphTextType.EQUATION_INLINE})
elif el.tag == CCTag.CC_CODE_INLINE:
if text:
para_text.append({'c':text, 't':ParagraphTextType.TEXT})
para_text.append({'c': text, 't': ParagraphTextType.TEXT})
text = ''
para_text.append({'c':el.text, 't':ParagraphTextType.CODE_INLINE})
para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE})
elif el.tag in ['br']:
text += '\n'
else:
if el.text and el.text.strip():
text = self.__combine_text(text, el.text.strip())
for child in el.getchildren():
for child in el:
text = __get_paragraph_text_recusive(child, text)

if el.tail and el.tail.strip():
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

7,387 changes: 7,387 additions & 0 deletions tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/text.html

Large diffs are not rendered by default.

236 changes: 236 additions & 0 deletions tests/llm_web_kit/extractor/html/recognizer/test_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# 测试text识别器
import os
import unittest
from pathlib import Path

from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
from llm_web_kit.extractor.html.recognizer.recognizer import \
BaseHTMLElementRecognizer
from llm_web_kit.extractor.html.recognizer.text import TextParagraphRecognizer
from llm_web_kit.input.datajson import DataJson


class TestTextParagraphRecognize(unittest.TestCase):
    """End-to-end tests for the text paragraph recognizer.

    Each ``test_text_N`` case replays one captured HTML page (the s3 path in
    its docstring records where the original bad case came from), either
    through the full extractor chain (markdown snapshot comparison) or
    through ``TextParagraphRecognizer`` alone.
    """

    def setUp(self):
        self.text_recognize = TextParagraphRecognizer()
        # Extractor-chain config shared by all chain-based cases below.
        self.config = {
            'extractor_pipe': {
                'enable': True,
                'validate_input_format': False,
                'pre_extractor': [
                    {
                        'enable': True,
                        'python_class': 'llm_web_kit.extractor.html.pre_extractor.TestHTMLFileFormatFilterPreExtractor',
                        'class_init_kwargs': {
                            'html_parent_dir': 'tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/',
                        },
                    },
                    {
                        'enable': True,
                        'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor',
                    },
                    {
                        'enable': True,
                        'python_class': 'llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor',
                        'class_init_kwargs': {},
                    }
                ],
                'extractor': [
                    {
                        'enable': True,
                        'python_class': 'llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor',
                        'class_init_kwargs': {},
                    }
                ],
                'post_extractor': [
                    {
                        'enable': False,
                        'python_class': 'llm_web_kit.extractor.html.post_extractor.HTMLFileFormatPostExtractor',
                        'class_init_kwargs': {},
                    }
                ],
            }
        }

    def _extract_md(self, path: str, url: str) -> str:
        """Run the full extractor chain on one fixture and return its markdown.

        Args:
            path: fixture file name, resolved under ``html_parent_dir``
                by the configured pre-extractor.
            url: original page URL recorded with the fixture.
        """
        chain = ExtractSimpleFactory.create(self.config)
        self.assertIsNotNone(chain)
        test_data = {
            'track_id': 'text_md',
            'dataset_name': 'text_md',
            'url': url,
            'data_source_category': 'HTML',
            'path': path,
            'file_bytes': 1000,
            'meta_info': {'input_datetime': '2020-01-01 00:00:00'},
        }
        result = chain.extract(DataJson(test_data))
        return result.get_content_list().to_mm_md()

    def _recognize_file(self, html_path) -> list:
        """Read an HTML fixture and run only the text recognizer over it.

        Args:
            html_path: str or Path to the HTML fixture on disk.
        """
        # Fixtures contain CJK/Cyrillic text: read them as UTF-8 explicitly
        # so the tests do not depend on the platform default encoding.
        with open(html_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        return self.text_recognize.recognize('http://www.baidu.com', [(html_content, html_content)], html_content)

    def test_text_1(self):
        """Case 1: s3://llm-pdf-text-1/qa/quyuan/output/part-67c01310620e-000064.jsonl"""
        # Chinese fragments are concatenated without a word separator.
        assert self.text_recognize._TextParagraphRecognizer__combine_text(
            '知识乱象\n',
            '中共中央政治局召开会议审议《成-2020年10月16日新闻联播',
            'zh')[:7] == '知识乱象\n中共'
        result = self._recognize_file(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/text.html')
        assert result[909][0][1413:1422] == '知识乱象\\n 中共'

    def test_text_2(self):
        """Case 2: s3://llm-pdf-text-1/qa/quyuan/output/part-67c01310620e-004720.jsonl"""
        content_md = self._extract_md('text2.html', 'https://www.aircraftspruce.com/catalog/pnpages/AT108AR-5_32.php')
        assert content_md[:130] == '''For Swivel Hand Rivet Squeezer or any snap Type .187 Shank Diameter Squeezer\n \n\n Instructions for Selecting Rivet Sets:\n\nTo devel'''

    def test_text_3(self):
        """Case 3: s3://llm-pdf-text-1/qa/quyuan/mathout/part-67c05902108f-001066.jsonl"""
        content_md = self._extract_md('text3.html', 'https://www.physicsforums.com/threads/how-do-convex-mirrors-affect-image-location-and-size.240850/')
        assert content_md[443:669] == '''2.\n The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the\n material?\n\n\n\n\n\n 2. Relevant equations\n\n\n\n\n\n\n\n 3. The\n attempt at a solution\n\n1. di=22.22\n\n\n\n2. Dont know'''

    def test_text_4(self):
        """Case 4: s3://llm-pdf-text-1/qa/quyuan/mathout/part-67c05902108f-000050.jsonl"""
        content_md = self._extract_md('text4.html', 'https://www.physicsforums.com/threads/isnt-the-normal-acceleration-always-towards-the-center.157291/')
        assert content_md[46:475] == '''1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n\n\nSee attachment\n\n\n\n 3. The attempt at a solution\n\nI solved the problem (on the same page as problem, written in pencil) but the direction of the acceleration that I calculated is different, I dont understand why my answer is wrong if the normal acceleration always towards the center and the tangent acceleration is suppossed to be clockwise.'''

    def test_text_5(self):
        """Case 5: s3://llm-pdf-text-1/qa/quyuan/output/part-67c01310620e-007988.jsonl"""
        content_md = self._extract_md('text5.html', 'https://shopnado.com.au/product/rigo-ride-on-car-tractor-toy-kids-electric-cars-12v-battery-child-toddlers-blue/')
        assert content_md[1214:1449] == '''Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.'''

    def test_text_6(self):
        """Case 6: s3://llm-pdf-text-1/qa/quyuan/output/part-67c01310620e-012288.jsonl"""
        content_md = self._extract_md('text6.html', 'https://adelanta.biz/kuplu-knigi/the-experience-of-russian-bibliography-copikova-part-2-l/')
        assert content_md[255:450] == '''1813 года\n\n5864.\\t Лабиринт волшебства, или удивительные приключения восточных принцев, сочинение В. Протопоповича; Москва, 1786 г. - в 8°. \n\n\n\n\n\n 5865.\\t Лакировальщик, или ясное и подробное нас'''

    def test_text_7(self):
        """Case 7: s3://llm-pdf-text-1/qa/quyuan/mathout/part-67c05902108f-001871.jsonl

        Note: line breaks are not preserved in this bad case because the
        content goes through the cc path.
        """
        result = self._recognize_file(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text7.html')
        assert '1) A man takes 5 hrs and 45 mins to walk to a certain place and ride back' in result[0][0] and BaseHTMLElementRecognizer.is_cc_html(result[0][0])

    def test_text_8(self):
        """Case 8: s3://llm-pdf-text-1/qa/quyuan/mathout/part-67c05902108f-001477.jsonl

        Note: line breaks are not preserved in this bad case because the
        content goes through the cc path.
        """
        result = self._recognize_file(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html')
        assert "40xy' -ln(x^8) = 0\\n\\n\\nInitial Condition: y(1)=31" in result[0][0] and BaseHTMLElementRecognizer.is_cc_html(result[0][0])

    def test_text_9(self):
        """Case 9: s3://llm-pdf-text-1/qa/quyuan/mathout/part-67c05902108f-000073.jsonl

        Note: line breaks are not preserved in this bad case because the
        content goes through the cc path.
        """
        result = self._recognize_file(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html')
        assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in result[50][0] and BaseHTMLElementRecognizer.is_cc_html(result[50][0])

    def test_text_10(self):
        """Case 10: s3://llm-pdf-text-1/qa/quyuan/mathout/part-67c05902108f-000620.jsonl"""
        content_md = self._extract_md('text10.html', 'https://www.physicsforums.com/threads/questions-about-parallel-worlds-by-michio-kaku-the-big-bang.612643/')
        assert content_md[306:450] == '''So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![\\":smile:\\"]( "\\"Smile")\n\n)\n\n\n\n1)\n\nIn the book, Michio Kaku says the '''