Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions llm_web_kit/input/datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,11 @@ def __to_md(self, exclude_nodes=[]):
md = md.strip() + self.__text_end # 加上结尾换行符
return md

def to_nlp_md(self):
md = self.__to_md(exclude_nodes=DocElementType.MM_NODE_LIST)
def to_nlp_md(self, MM_NODE_LIST=None):
    """Build the markdown text via ``__to_md`` while excluding some node types.

    Args:
        MM_NODE_LIST: Node types to leave out of the output. When it is
            omitted, ``None`` or empty, ``DocElementType.MM_NODE_LIST``
            is used as the exclusion list instead.

    Returns:
        The value returned by ``self.__to_md`` for the chosen exclusion list.
    """
    # ``None`` replaces the former ``[]`` default so that no list object is
    # shared between calls. Any falsy value still selects the built-in list,
    # exactly as the previous if/else did.
    exclude_nodes = MM_NODE_LIST or DocElementType.MM_NODE_LIST
    return self.__to_md(exclude_nodes=exclude_nodes)

def to_mm_md(self):
Expand Down
81 changes: 74 additions & 7 deletions tests/llm_web_kit/input/test_datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,13 @@ def test_datajson_validation():
def test_data_json_deepcopy():
"""从一个外部dict构建datajson, 改变datajson,不改变外部dict."""
d = {'track_id': '32266dfa-c335-45c5-896e-56f057889d28',
'url': 'http://mathematica.stackexchange.com/users/1931/ywdr1987?tab=activity&sort=all',
'html':'',
'page_layout_type': 'forum',
'domain': 'mathematica.stackexchange.com',
'dataset_name': 'math',
'data_source_category': 'HTML',
'meta_info': {'warc_headers': {'WARC-IP-Address': '104.16.12.13'}}}
'url': 'http://mathematica.stackexchange.com/users/1931/ywdr1987?tab=activity&sort=all',
'html': '',
'page_layout_type': 'forum',
'domain': 'mathematica.stackexchange.com',
'dataset_name': 'math',
'data_source_category': 'HTML',
'meta_info': {'warc_headers': {'WARC-IP-Address': '104.16.12.13'}}}
copied = copy.deepcopy(d)
_ = DataJson(copied)
cl = copied.get('content_list') # 不该变外部变量d
Expand Down Expand Up @@ -146,3 +146,70 @@ def test_datajson_to_dict_immutable():
# Verify the modifications only affected the dict copy
assert dict_data[DataJsonKey.DATASET_NAME] == 'modified_dataset'
assert dict_data[DataJsonKey.CONTENT_LIST][0]['content'] == 'modified content'


def test_data_json_to_nlp_md():
    """Check ``to_nlp_md`` with the default and with a custom exclusion list.

    The fixture holds one paragraph, one title, one table and one code node.
    With no argument every node's text must show up in the markdown; with
    ``MM_NODE_LIST=['table']`` the table text must be missing while the other
    three nodes are still rendered.
    """
    paragraph_text = 'Ziet u iets wat niet hoort of niet klopt?'
    title_text = 'Openingstijden'
    table_marker = 'Maandag'
    code_text = 'frame.open();\nframe.write(html);\nframe.close();'

    paragraph_node = {
        'type': 'paragraph',
        'raw_content': '<div class=\"content\"><div class=\"description-wrapper\"><div class=\"container description\"><div class=\"report text-center\"><span class=\"text-muted\">\n\t\t\t\tZiet u iets wat niet hoort of niet klopt?\n\t\t\t</span></div></div></div></div>',
        'content': [
            {
                'c': 'Ziet u iets wat niet hoort of niet klopt?',
                't': 'text'
            }
        ]
    }
    title_node = {
        'type': 'title',
        'raw_content': '<h2 class=\"text-center \" data-step=\"4\">Openingstijden</h2>',
        'content': {
            'title_content': 'Openingstijden',
            'level': '2'
        }
    }
    table_node = {
        'type': 'table',
        'raw_content': '<table class=\"table table-hover\" id=\"table-visitinghours\"><tr class=\"\"><td>\n\t\t\t\tMaandag\n\t\t\t</td><td class=\"text-right\">\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t</td></tr><tr class=\"\"><td>\n\t\t\t\tDinsdag\n\t\t\t</td><td class=\"text-right\">\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t</td></tr><tr class=\"\"><td>\n\t\t\t\tWoensdag\n\t\t\t</td><td class=\"text-right\">\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t</td></tr><tr class=\"\"><td>\n\t\t\t\tDonderdag\n\t\t\t</td><td class=\"text-right\">\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t</td></tr><tr class=\"\"><td>\n\t\t\t\tVrijdag\n\t\t\t</td><td class=\"text-right\">\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t</td></tr><tr class=\"\"><td>\n\t\t\t\tZaterdag\n\t\t\t</td><td class=\"text-right\">\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t</td></tr><tr class=\"\"><td>\n\t\t\t\tZondag\n\t\t\t</td><td class=\"text-right\">\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t</td></tr></table>',
        'content': {
            'html': '<table><tr><td>Maandag</td><td>-</td></tr><tr><td>Dinsdag</td><td>-</td></tr><tr><td>Woensdag</td><td>-</td></tr><tr><td>Donderdag</td><td>-</td></tr><tr><td>Vrijdag</td><td>-</td></tr><tr><td>Zaterdag</td><td>-</td></tr><tr><td>Zondag</td><td>-</td></tr></table>',
            'is_complex': False,
            'table_nest_level': '1'
        }
    }
    code_node = {
        'type': 'code',
        'raw_content': '<code>frame.open();\nframe.write(html);\nframe.close();\n</code>',
        'inline': False,
        'content': {
            'code_content': 'frame.open();\nframe.write(html);\nframe.close();',
            'by': 'tag_pre_code'
        }
    }

    d = {
        'track_id': '9fc6d25e-03ef-42a5-9675-7817c2b01936',
        'url': 'http://boards.fool.com/quoti-think-flegs-watching-what-he-eats-30294220.aspx?sort=username',
        'html': '',
        'content_list': [
            [paragraph_node, title_node, table_node, code_node]
        ]
    }

    # Each scenario is (keyword arguments for to_nlp_md, whether the table
    # text is expected in the output). The default call runs first, then the
    # custom exclusion, and both build their DataJson from the same dict.
    scenarios = (
        ({}, True),
        ({'MM_NODE_LIST': ['table']}, False),
    )
    for call_kwargs, table_expected in scenarios:
        datajson = DataJson(d)
        md = datajson.get_content_list().to_nlp_md(**call_kwargs)
        assert paragraph_text in md
        assert title_text in md
        assert (table_marker in md) is table_expected
        assert code_text in md