From 330f925061e072ca54fb9c8e210544a1106a1257 Mon Sep 17 00:00:00 2001 From: shijinpjlab Date: Mon, 17 Mar 2025 14:09:02 +0800 Subject: [PATCH 1/3] feat: add MM_NODE_LIST in to_nlp_md --- llm_web_kit/input/datajson.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index b086579c..49b36d0a 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -97,8 +97,11 @@ def __to_md(self, exclude_nodes=[]): md = md.strip() + self.__text_end # 加上结尾换行符 return md - def to_nlp_md(self): - md = self.__to_md(exclude_nodes=DocElementType.MM_NODE_LIST) + def to_nlp_md(self, MM_NODE_LIST=[]): + if MM_NODE_LIST: + md = self.__to_md(exclude_nodes=MM_NODE_LIST) + else: + md = self.__to_md(exclude_nodes=DocElementType.MM_NODE_LIST) return md def to_mm_md(self): From 11f5444fc6389d3f8b761c7ee5d325703bd5717a Mon Sep 17 00:00:00 2001 From: shijinpjlab Date: Tue, 18 Mar 2025 13:45:25 +0800 Subject: [PATCH 2/3] feat: add MM_NODE_LIST in to_nlp_md test case --- tests/llm_web_kit/input/test_datajson.py | 81 ++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 7 deletions(-) diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index ac0b9ea1..9fb3febb 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -109,13 +109,13 @@ def test_datajson_validation(): def test_data_json_deepcopy(): """从一个外部dict构建datajson, 改变datajson,不改变外部dict.""" d = {'track_id': '32266dfa-c335-45c5-896e-56f057889d28', - 'url': 'http://mathematica.stackexchange.com/users/1931/ywdr1987?tab=activity&sort=all', - 'html':'', - 'page_layout_type': 'forum', - 'domain': 'mathematica.stackexchange.com', - 'dataset_name': 'math', - 'data_source_category': 'HTML', - 'meta_info': {'warc_headers': {'WARC-IP-Address': '104.16.12.13'}}} + 'url': 'http://mathematica.stackexchange.com/users/1931/ywdr1987?tab=activity&sort=all', + 'html': '', + 'page_layout_type': 'forum', + 'domain': 'mathematica.stackexchange.com', + 'dataset_name': 'math', + 'data_source_category': 'HTML', + 'meta_info': {'warc_headers': {'WARC-IP-Address': '104.16.12.13'}}} copied = copy.deepcopy(d) _ = DataJson(copied) cl = copied.get('content_list') # 不该变外部变量d @@ -146,3 +146,70 @@ def test_datajson_to_dict_immutable(): # Verify the modifications only affected the dict copy assert dict_data[DataJsonKey.DATASET_NAME] == 'modified_dataset' assert dict_data[DataJsonKey.CONTENT_LIST][0]['content'] == 'modified content' + + +def test_data_json_to_nlp_md(): + d = { + "track_id": "9fc6d25e-03ef-42a5-9675-7817c2b01936", + "url": "http://boards.fool.com/quoti-think-flegs-watching-what-he-eats-30294220.aspx?sort=username", + "html": "", + "content_list": [ + [ + { + "type": "paragraph", + "raw_content": "
\n\t\t\t\tZiet u iets wat niet hoort of niet klopt?\n\t\t\t
", + "content": [ + { + "c": "Ziet u iets wat niet hoort of niet klopt?", + "t": "text" + } + ] + }, + { + "type": "title", + "raw_content": "

Openingstijden

", + "content": { + "title_content": "Openingstijden", + "level": "2" + } + }, + { + "type": "table", + "raw_content": "
\n\t\t\t\tMaandag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDinsdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tWoensdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDonderdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tVrijdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZaterdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZondag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
", + "content": { + "html": "
Maandag-
Dinsdag-
Woensdag-
Donderdag-
Vrijdag-
Zaterdag-
Zondag-
", + "is_complex": False, + "table_nest_level": "1" + } + }, + { + "type": "code", + "raw_content": "frame.open();\nframe.write(html);\nframe.close();\n", + "inline": False, + "content": { + "code_content": "frame.open();\nframe.write(html);\nframe.close();", + "by": "tag_pre_code" + } + } + ] + ] + } + + def test_default_exclude(): + datajson = DataJson(d) + md = datajson.get_content_list().to_nlp_md() + assert "Ziet u iets wat niet hoort of niet klopt?" in md + assert "Openingstijden" in md + assert "Maandag" in md + assert "frame.open();\nframe.write(html);\nframe.close();" in md + + def test_custom_exclude(): + datajson = DataJson(d) + md = datajson.get_content_list().to_nlp_md(MM_NODE_LIST=['table']) + assert "Ziet u iets wat niet hoort of niet klopt?" in md + assert "Openingstijden" in md + assert "Maandag" not in md + assert "frame.open();\nframe.write(html);\nframe.close();" in md + + test_default_exclude() + test_custom_exclude() From 92145f65ebc4532fc6c22600e8ad3726414c844e Mon Sep 17 00:00:00 2001 From: shijinpjlab Date: Wed, 19 Mar 2025 12:55:16 +0800 Subject: [PATCH 3/3] feat: fix lint --- tests/llm_web_kit/input/test_datajson.py | 68 ++++++++++++------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index 9fb3febb..c967e330 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -150,45 +150,45 @@ def test_datajson_to_dict_immutable(): def test_data_json_to_nlp_md(): d = { - "track_id": "9fc6d25e-03ef-42a5-9675-7817c2b01936", - "url": "http://boards.fool.com/quoti-think-flegs-watching-what-he-eats-30294220.aspx?sort=username", - "html": "", - "content_list": [ + 'track_id': '9fc6d25e-03ef-42a5-9675-7817c2b01936', + 'url': 'http://boards.fool.com/quoti-think-flegs-watching-what-he-eats-30294220.aspx?sort=username', + 'html': '', + 'content_list': [ [ { - "type": "paragraph", - "raw_content": "
\n\t\t\t\tZiet u iets wat niet hoort of niet klopt?\n\t\t\t
", - "content": [ + 'type': 'paragraph', + 'raw_content': '
\n\t\t\t\tZiet u iets wat niet hoort of niet klopt?\n\t\t\t
', + 'content': [ { - "c": "Ziet u iets wat niet hoort of niet klopt?", - "t": "text" + 'c': 'Ziet u iets wat niet hoort of niet klopt?', + 't': 'text' } ] }, { - "type": "title", - "raw_content": "

Openingstijden

", - "content": { - "title_content": "Openingstijden", - "level": "2" + 'type': 'title', + 'raw_content': '

Openingstijden

', + 'content': { + 'title_content': 'Openingstijden', + 'level': '2' } }, { - "type": "table", - "raw_content": "
\n\t\t\t\tMaandag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDinsdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tWoensdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDonderdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tVrijdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZaterdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZondag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
", - "content": { - "html": "
Maandag-
Dinsdag-
Woensdag-
Donderdag-
Vrijdag-
Zaterdag-
Zondag-
", - "is_complex": False, - "table_nest_level": "1" + 'type': 'table', + 'raw_content': '
\n\t\t\t\tMaandag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDinsdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tWoensdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tDonderdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tVrijdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZaterdag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n\t\t\t\tZondag\n\t\t\t\n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
', + 'content': { + 'html': '
Maandag-
Dinsdag-
Woensdag-
Donderdag-
Vrijdag-
Zaterdag-
Zondag-
', + 'is_complex': False, + 'table_nest_level': '1' } }, { - "type": "code", - "raw_content": "frame.open();\nframe.write(html);\nframe.close();\n", - "inline": False, - "content": { - "code_content": "frame.open();\nframe.write(html);\nframe.close();", - "by": "tag_pre_code" + 'type': 'code', + 'raw_content': 'frame.open();\nframe.write(html);\nframe.close();\n', + 'inline': False, + 'content': { + 'code_content': 'frame.open();\nframe.write(html);\nframe.close();', + 'by': 'tag_pre_code' } } ] @@ -198,18 +198,18 @@ def test_data_json_to_nlp_md(): def test_default_exclude(): datajson = DataJson(d) md = datajson.get_content_list().to_nlp_md() - assert "Ziet u iets wat niet hoort of niet klopt?" in md - assert "Openingstijden" in md - assert "Maandag" in md - assert "frame.open();\nframe.write(html);\nframe.close();" in md + assert 'Ziet u iets wat niet hoort of niet klopt?' in md + assert 'Openingstijden' in md + assert 'Maandag' in md + assert 'frame.open();\nframe.write(html);\nframe.close();' in md def test_custom_exclude(): datajson = DataJson(d) md = datajson.get_content_list().to_nlp_md(MM_NODE_LIST=['table']) - assert "Ziet u iets wat niet hoort of niet klopt?" in md - assert "Openingstijden" in md - assert "Maandag" not in md - assert "frame.open();\nframe.write(html);\nframe.close();" in md + assert 'Ziet u iets wat niet hoort of niet klopt?' in md + assert 'Openingstijden' in md + assert 'Maandag' not in md + assert 'frame.open();\nframe.write(html);\nframe.close();' in md test_default_exclude() test_custom_exclude()