diff --git a/.gitignore b/.gitignore
index a17bd6d1..fddaf477 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,4 @@
 output/
 coverage.xml
 llm_web_kit.egg-info/*
+.llm-web-kit.jsonc
diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
index 20c3842f..b086579c 100644
--- a/llm_web_kit/input/datajson.py
+++ b/llm_web_kit/input/datajson.py
@@ -130,6 +130,14 @@ def to_json(self, pretty=False) -> str:
         else:
             return json.dumps(content_lst, ensure_ascii=False)
 
+    def to_dict(self) -> List[Dict]:
+        """Return a deep copy of the underlying content data.
+
+        Returns:
+            List[Dict]: independent copy; mutating it leaves this object intact.
+        """
+        return copy.deepcopy(self._get_data())
+
     @abstractmethod
     def _get_data(self) -> List[Dict]:
         raise NotImplementedError('This method must be implemented by the subclass.')
@@ -470,7 +478,7 @@ def to_json(self, pretty=False) -> str:
             str: json字符串
         """
         json_dict = self.__json_data.copy()
-        json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list()._get_data()
+        json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list().to_dict()
         if pretty:
             return json.dumps(json_dict, indent=2, ensure_ascii=False)
         return json.dumps(json_dict, ensure_ascii=False)
@@ -482,5 +490,5 @@ def to_dict(self) -> dict:
             dict: dict对象
         """
         json_dict = self.__json_data.copy()
-        json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list()._get_data()
+        json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list().to_dict()
         return json_dict
diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py
index ce6b4798..ac0b9ea1 100644
--- a/tests/llm_web_kit/input/test_datajson.py
+++ b/tests/llm_web_kit/input/test_datajson.py
@@ -120,3 +120,30 @@ def test_data_json_deepcopy():
     _ = DataJson(copied)
     cl = copied.get('content_list')  # 不该变外部变量d
     assert cl is None
+
+
+def test_datajson_to_dict_immutable():
+    """Mutating the dict from to_dict() must not change the DataJson object."""
+    data = {
+        DataJsonKey.DATASET_NAME: 'test_dataset',
+        DataJsonKey.FILE_FORMAT: 'html',
+        DataJsonKey.CONTENT_LIST: [
+            {'type': 'text', 'content': 'test content'}
+        ]
+    }
+    datajson = DataJson(data)
+
+    # Get dict representation
+    dict_data = datajson.to_dict()
+
+    # Modify the returned dict
+    dict_data[DataJsonKey.DATASET_NAME] = 'modified_dataset'
+    dict_data[DataJsonKey.CONTENT_LIST][0]['content'] = 'modified content'
+
+    # Original DataJson should remain unchanged
+    assert datajson.get_dataset_name() == 'test_dataset'
+    assert datajson.get_content_list()._get_data()[0]['content'] == 'test content'
+
+    # Verify the modifications only affected the dict copy
+    assert dict_data[DataJsonKey.DATASET_NAME] == 'modified_dataset'
+    assert dict_data[DataJsonKey.CONTENT_LIST][0]['content'] == 'modified content'
diff --git a/tests/llm_web_kit/libs/test_standard_utils.py b/tests/llm_web_kit/libs/test_standard_utils.py
index e66557e6..f2627d9f 100644
--- a/tests/llm_web_kit/libs/test_standard_utils.py
+++ b/tests/llm_web_kit/libs/test_standard_utils.py
@@ -48,13 +48,13 @@ def test_json_loads(input: Union[str, bytes], target_dict) -> None:
         '0': 'aaa',
         '1': 'bbb',
         '2': 'ccc'
-    }, '''{"0": "aaa", "1": "bbb", "2": "ccc"}'''),
+    }, '''{"0":"aaa","1":"bbb","2":"ccc"}'''),
     ({
         'track_id': '7c5b99d3',
         'warc_record_offset': 65390694,
         'warc_record_length': '16190',
         'layout_id': 0
-    }, '{"track_id": "7c5b99d3", "warc_record_offset": 65390694, "warc_record_length": "16190", "layout_id": 0}'),
+    }, '{"track_id":"7c5b99d3","warc_record_offset":65390694,"warc_record_length":"16190","layout_id":0}'),
 ])
 def test_json_dumps(input_dict: dict, target_str) -> None:
     """
@@ -66,4 +66,8 @@ def test_json_dumps(input_dict: dict, target_str) -> None:
     Returns:
         None
     """
-    assert target_str == json_dumps(input_dict)
+    # The exact text emitted by json_dumps (e.g. separator whitespace) can vary
+    # across Python versions, so compare parsed objects instead of raw strings.
+    # Full equality also catches unexpected extra keys.
+    assert json_loads(target_str) == input_dict
+    assert json_loads(json_dumps(input_dict)) == input_dict