diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
index b086579c..49b36d0a 100644
--- a/llm_web_kit/input/datajson.py
+++ b/llm_web_kit/input/datajson.py
@@ -97,8 +97,11 @@ def __to_md(self, exclude_nodes=[]):
md = md.strip() + self.__text_end # 加上结尾换行符
return md
-    def to_nlp_md(self):
-        md = self.__to_md(exclude_nodes=DocElementType.MM_NODE_LIST)
+    def to_nlp_md(self, MM_NODE_LIST=None):
+        """Render this content list to markdown via ``__to_md``.
+
+        Args:
+            MM_NODE_LIST: optional collection of node types to pass to
+                ``__to_md`` as ``exclude_nodes``. ``None`` or an empty
+                collection falls back to ``DocElementType.MM_NODE_LIST``.
+
+        Returns:
+            The markdown string returned by ``__to_md``.
+        """
+        # ``None`` replaces the former shared mutable ``[]`` default; the
+        # truthiness fallback keeps results identical for existing callers.
+        exclude_nodes = MM_NODE_LIST or DocElementType.MM_NODE_LIST
+        md = self.__to_md(exclude_nodes=exclude_nodes)
         return md
def to_mm_md(self):
diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py
index ac0b9ea1..c967e330 100644
--- a/tests/llm_web_kit/input/test_datajson.py
+++ b/tests/llm_web_kit/input/test_datajson.py
@@ -109,13 +109,13 @@ def test_datajson_validation():
def test_data_json_deepcopy():
"""从一个外部dict构建datajson, 改变datajson,不改变外部dict."""
d = {'track_id': '32266dfa-c335-45c5-896e-56f057889d28',
- 'url': 'http://mathematica.stackexchange.com/users/1931/ywdr1987?tab=activity&sort=all',
- 'html':'',
- 'page_layout_type': 'forum',
- 'domain': 'mathematica.stackexchange.com',
- 'dataset_name': 'math',
- 'data_source_category': 'HTML',
- 'meta_info': {'warc_headers': {'WARC-IP-Address': '104.16.12.13'}}}
+ 'url': 'http://mathematica.stackexchange.com/users/1931/ywdr1987?tab=activity&sort=all',
+ 'html': '',
+ 'page_layout_type': 'forum',
+ 'domain': 'mathematica.stackexchange.com',
+ 'dataset_name': 'math',
+ 'data_source_category': 'HTML',
+ 'meta_info': {'warc_headers': {'WARC-IP-Address': '104.16.12.13'}}}
copied = copy.deepcopy(d)
_ = DataJson(copied)
cl = copied.get('content_list') # 不该变外部变量d
@@ -146,3 +146,70 @@ def test_datajson_to_dict_immutable():
# Verify the modifications only affected the dict copy
assert dict_data[DataJsonKey.DATASET_NAME] == 'modified_dataset'
assert dict_data[DataJsonKey.CONTENT_LIST][0]['content'] == 'modified content'
+
+
+def test_data_json_to_nlp_md():
+ """Check ``to_nlp_md`` with its default and with a caller-supplied exclusion list."""
+ # Fixture: ``content_list`` holds a single inner list with four nodes
+ # (paragraph, title, table, code), so each node type can be checked
+ # separately in the rendered markdown.
+ # NOTE(review): several 'raw_content' / 'html' literals below break across
+ # physical lines in this patch; presumably markup was lost when the patch
+ # was extracted -- confirm against the repository copy.
+ d = {
+ 'track_id': '9fc6d25e-03ef-42a5-9675-7817c2b01936',
+ 'url': 'http://boards.fool.com/quoti-think-flegs-watching-what-he-eats-30294220.aspx?sort=username',
+ 'html': '',
+ 'content_list': [
+ [
+ {
+ 'type': 'paragraph',
+ 'raw_content': '
\n\t\t\t\tZiet u iets wat niet hoort of niet klopt?\n\t\t\t
',
+ 'content': [
+ {
+ 'c': 'Ziet u iets wat niet hoort of niet klopt?',
+ 't': 'text'
+ }
+ ]
+ },
+ {
+ 'type': 'title',
+ 'raw_content': 'Openingstijden
',
+ 'content': {
+ 'title_content': 'Openingstijden',
+ 'level': '2'
+ }
+ },
+ {
+ 'type': 'table',
+ 'raw_content': '| \n\t\t\t\tMaandag\n\t\t\t | \n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t |
| \n\t\t\t\tDinsdag\n\t\t\t | \n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t |
| \n\t\t\t\tWoensdag\n\t\t\t | \n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t |
| \n\t\t\t\tDonderdag\n\t\t\t | \n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t |
| \n\t\t\t\tVrijdag\n\t\t\t | \n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t |
| \n\t\t\t\tZaterdag\n\t\t\t | \n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t |
| \n\t\t\t\tZondag\n\t\t\t | \n\n\t\t\t\t\t\t\t\t\t\t\t\t-\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t |
',
+ 'content': {
+ 'html': '| Maandag | - |
| Dinsdag | - |
| Woensdag | - |
| Donderdag | - |
| Vrijdag | - |
| Zaterdag | - |
| Zondag | - |
',
+ 'is_complex': False,
+ 'table_nest_level': '1'
+ }
+ },
+ {
+ 'type': 'code',
+ 'raw_content': 'frame.open();\nframe.write(html);\nframe.close();\n',
+ 'inline': False,
+ 'content': {
+ 'code_content': 'frame.open();\nframe.write(html);\nframe.close();',
+ 'by': 'tag_pre_code'
+ }
+ }
+ ]
+ ]
+ }
+
+ def test_default_exclude():
+ """With no argument, text from all four nodes is expected in the markdown."""
+ datajson = DataJson(d)
+ md = datajson.get_content_list().to_nlp_md()
+ assert 'Ziet u iets wat niet hoort of niet klopt?' in md
+ assert 'Openingstijden' in md
+ assert 'Maandag' in md
+ assert 'frame.open();\nframe.write(html);\nframe.close();' in md
+
+ def test_custom_exclude():
+ """With ``MM_NODE_LIST=['table']`` the table text must be absent; the other nodes remain."""
+ datajson = DataJson(d)
+ md = datajson.get_content_list().to_nlp_md(MM_NODE_LIST=['table'])
+ assert 'Ziet u iets wat niet hoort of niet klopt?' in md
+ assert 'Openingstijden' in md
+ assert 'Maandag' not in md
+ assert 'frame.open();\nframe.write(html);\nframe.close();' in md
+
+ # The two checks are local functions of this test, so they are invoked explicitly here.
+ test_default_exclude()
+ test_custom_exclude()