Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,4 @@ output/
coverage.xml

llm_web_kit.egg-info/*
.llm-web-kit.jsonc
7 changes: 5 additions & 2 deletions llm_web_kit/input/datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ def to_json(self, pretty=False) -> str:
else:
return json.dumps(content_lst, ensure_ascii=False)

def to_dict(self) -> List[Dict]:
    """Return a deep copy of the data produced by ``_get_data()``.

    The copy is made with ``copy.deepcopy`` so that callers can mutate the
    returned structure (including nested entries) without affecting the
    data held by this object.

    Returns:
        List[Dict]: an independent copy of the ``_get_data()`` result. The
        return annotation mirrors the one declared on ``_get_data``.
    """
    return copy.deepcopy(self._get_data())

@abstractmethod
def _get_data(self) -> List[Dict]:
    """Abstract hook annotated to return a list of dicts.

    Raises:
        NotImplementedError: always, when this base implementation is called.
    """
    message = 'This method must be implemented by the subclass.'
    raise NotImplementedError(message)
Expand Down Expand Up @@ -470,7 +473,7 @@ def to_json(self, pretty=False) -> str:
str: json字符串
"""
json_dict = self.__json_data.copy()
json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list()._get_data()
json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list().to_dict()
if pretty:
return json.dumps(json_dict, indent=2, ensure_ascii=False)
return json.dumps(json_dict, ensure_ascii=False)
Expand All @@ -482,5 +485,5 @@ def to_dict(self) -> dict:
dict: dict对象
"""
json_dict = self.__json_data.copy()
json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list()._get_data()
json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list().to_dict()
return json_dict
26 changes: 26 additions & 0 deletions tests/llm_web_kit/input/test_datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,29 @@ def test_data_json_deepcopy():
_ = DataJson(copied)
cl = copied.get('content_list') # 不该变外部变量d
assert cl is None

def test_datajson_to_dict_immutable():
    """Check that mutating the dict returned by to_dict() leaves the DataJson untouched."""
    source = {
        DataJsonKey.DATASET_NAME: 'test_dataset',
        DataJsonKey.FILE_FORMAT: 'html',
        DataJsonKey.CONTENT_LIST: [
            {'type': 'text', 'content': 'test content'}
        ]
    }
    doc = DataJson(source)

    # Export once, then tamper with a top-level field and a nested content entry.
    exported = doc.to_dict()
    exported[DataJsonKey.DATASET_NAME] = 'modified_dataset'
    first_item = exported[DataJsonKey.CONTENT_LIST][0]
    first_item['content'] = 'modified content'

    # The DataJson itself must still report the original values.
    assert doc.get_dataset_name() == 'test_dataset'
    stored_items = doc.get_content_list()._get_data()
    assert stored_items[0]['content'] == 'test content'

    # The edits are visible only on the exported copy.
    assert exported[DataJsonKey.DATASET_NAME] == 'modified_dataset'
    assert exported[DataJsonKey.CONTENT_LIST][0]['content'] == 'modified content'
15 changes: 12 additions & 3 deletions tests/llm_web_kit/libs/test_standard_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,13 @@ def test_json_loads(input: Union[str, bytes], target_dict) -> None:
'0': 'aaa',
'1': 'bbb',
'2': 'ccc'
}, '''{"0": "aaa", "1": "bbb", "2": "ccc"}'''),
}, '''{"0":"aaa","1":"bbb","2":"ccc"}'''),
({
'track_id': '7c5b99d3',
'warc_record_offset': 65390694,
'warc_record_length': '16190',
'layout_id': 0
}, '{"track_id": "7c5b99d3", "warc_record_offset": 65390694, "warc_record_length": "16190", "layout_id": 0}'),
}, '{"track_id":"7c5b99d3","warc_record_offset":65390694,"warc_record_length":"16190","layout_id":0}'),
])
def test_json_dumps(input_dict: dict, target_str) -> None:
"""
Expand All @@ -66,4 +66,13 @@ def test_json_dumps(input_dict: dict, target_str) -> None:
Returns: None

"""
assert target_str == json_dumps(input_dict)
expected_obj = json_loads(target_str)
# 比较两个对象是否相等
for key, value in input_dict.items():
assert expected_obj[key] == value

# 比较json_dumps的输出是否与target_str相等
json_str = json_dumps(input_dict) # 由于不同的python版本,json_dumps的输出可能不同,所以需要比较json_loads的输出
obj = json_loads(json_str)
for key, value in input_dict.items():
assert obj[key] == value