diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index b086579c..49b36d0a 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -97,8 +97,11 @@ def __to_md(self, exclude_nodes=[]): md = md.strip() + self.__text_end # 加上结尾换行符 return md - def to_nlp_md(self): - md = self.__to_md(exclude_nodes=DocElementType.MM_NODE_LIST) + def to_nlp_md(self, MM_NODE_LIST=[]): + if MM_NODE_LIST: + md = self.__to_md(exclude_nodes=MM_NODE_LIST) + else: + md = self.__to_md(exclude_nodes=DocElementType.MM_NODE_LIST) return md def to_mm_md(self): diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index ac0b9ea1..c967e330 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -109,13 +109,13 @@ def test_datajson_validation(): def test_data_json_deepcopy(): """从一个外部dict构建datajson, 改变datajson,不改变外部dict.""" d = {'track_id': '32266dfa-c335-45c5-896e-56f057889d28', - 'url': 'http://mathematica.stackexchange.com/users/1931/ywdr1987?tab=activity&sort=all', - 'html':'', - 'page_layout_type': 'forum', - 'domain': 'mathematica.stackexchange.com', - 'dataset_name': 'math', - 'data_source_category': 'HTML', - 'meta_info': {'warc_headers': {'WARC-IP-Address': '104.16.12.13'}}} + 'url': 'http://mathematica.stackexchange.com/users/1931/ywdr1987?tab=activity&sort=all', + 'html': '', + 'page_layout_type': 'forum', + 'domain': 'mathematica.stackexchange.com', + 'dataset_name': 'math', + 'data_source_category': 'HTML', + 'meta_info': {'warc_headers': {'WARC-IP-Address': '104.16.12.13'}}} copied = copy.deepcopy(d) _ = DataJson(copied) cl = copied.get('content_list') # 不该变外部变量d @@ -146,3 +146,70 @@ def test_datajson_to_dict_immutable(): # Verify the modifications only affected the dict copy assert dict_data[DataJsonKey.DATASET_NAME] == 'modified_dataset' assert dict_data[DataJsonKey.CONTENT_LIST][0]['content'] == 'modified content' + + +def test_data_json_to_nlp_md(): + d = { + 'track_id': '9fc6d25e-03ef-42a5-9675-7817c2b01936', + 'url': 'http://boards.fool.com/quoti-think-flegs-watching-what-he-eats-30294220.aspx?sort=username', + 'html': '', + 'content_list': [ + [ + { + 'type': 'paragraph', + 'raw_content': '
\n\t\t\t\tZiet u iets wat niet hoort of niet klopt?\n\t\t\t
', + 'content': [ + { + 'c': 'Ziet u iets wat niet hoort of niet klopt?', + 't': 'text' + } + ] + }, + { + 'type': 'title', + 'raw_content': '

Openingstijden

', + 'content': { + 'title_content': 'Openingstijden', + 'level': '2' + } + }, + { + 'type': 'table', + 'raw_content': '
\n\t\t\t\tMaandag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDinsdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tWoensdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDonderdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tVrijdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZaterdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZondag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
', + 'content': { + 'html': '
Maandag-
Dinsdag-
Woensdag-
Donderdag-
Vrijdag-
Zaterdag-
Zondag-
', + 'is_complex': False, + 'table_nest_level': '1' + } + }, + { + 'type': 'code', + 'raw_content': 'frame.open();\nframe.write(html);\nframe.close();\n', + 'inline': False, + 'content': { + 'code_content': 'frame.open();\nframe.write(html);\nframe.close();', + 'by': 'tag_pre_code' + } + } + ] + ] + } + + def test_default_exclude(): + datajson = DataJson(d) + md = datajson.get_content_list().to_nlp_md() + assert 'Ziet u iets wat niet hoort of niet klopt?' in md + assert 'Openingstijden' in md + assert 'Maandag' in md + assert 'frame.open();\nframe.write(html);\nframe.close();' in md + + def test_custom_exclude(): + datajson = DataJson(d) + md = datajson.get_content_list().to_nlp_md(MM_NODE_LIST=['table']) + assert 'Ziet u iets wat niet hoort of niet klopt?' in md + assert 'Openingstijden' in md + assert 'Maandag' not in md + assert 'frame.open();\nframe.write(html);\nframe.close();' in md + + test_default_exclude() + test_custom_exclude()