From 522a324ff4ce4c8ad0ae1fb02783bc96b6bce21c Mon Sep 17 00:00:00 2001 From: drunkpig Date: Fri, 28 Feb 2025 20:29:16 +0800 Subject: [PATCH 1/5] feat: content_list to_dict() --- .gitignore | 1 + llm_web_kit/input/datajson.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index a17bd6d1..fddaf477 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,4 @@ output/ coverage.xml llm_web_kit.egg-info/* +.llm-web-kit.jsonc diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 20c3842f..0f3f6932 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -130,6 +130,9 @@ def to_json(self, pretty=False) -> str: else: return json.dumps(content_lst, ensure_ascii=False) + def to_dict(self) -> dict: + return copy.deepcopy(self._get_data()) + @abstractmethod def _get_data(self) -> List[Dict]: raise NotImplementedError('This method must be implemented by the subclass.') From 50fafbae2d05249a1ba00e2f9dcf5c4ed245654d Mon Sep 17 00:00:00 2001 From: drunkpig Date: Tue, 11 Mar 2025 15:17:08 +0800 Subject: [PATCH 2/5] fix: test error --- tests/llm_web_kit/libs/test_standard_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/libs/test_standard_utils.py b/tests/llm_web_kit/libs/test_standard_utils.py index e66557e6..181e8442 100644 --- a/tests/llm_web_kit/libs/test_standard_utils.py +++ b/tests/llm_web_kit/libs/test_standard_utils.py @@ -48,13 +48,13 @@ def test_json_loads(input: Union[str, bytes], target_dict) -> None: '0': 'aaa', '1': 'bbb', '2': 'ccc' - }, '''{"0": "aaa", "1": "bbb", "2": "ccc"}'''), + }, '''{"0":"aaa","1":"bbb","2":"ccc"}'''), ({ 'track_id': '7c5b99d3', 'warc_record_offset': 65390694, 'warc_record_length': '16190', 'layout_id': 0 - }, '{"track_id": "7c5b99d3", "warc_record_offset": 65390694, "warc_record_length": "16190", "layout_id": 0}'), + }, '{"track_id":"7c5b99d3","warc_record_offset":65390694,"warc_record_length":"16190","layout_id":0}'), ]) def test_json_dumps(input_dict: dict, target_str) -> None: """ @@ -66,4 +66,10 @@ def test_json_dumps(input_dict: dict, target_str) -> None: Returns: None """ + expected_obj = json_loads(target_str) + # 比较两个对象是否相等 + for key, value in input_dict.items(): + assert expected_obj[key] == value + + # 比较json_dumps的输出是否与target_str相等 assert target_str == json_dumps(input_dict) From e9b09125cfddb2457df6d4340d3a5987974ac507 Mon Sep 17 00:00:00 2001 From: drunkpig Date: Tue, 11 Mar 2025 16:45:27 +0800 Subject: [PATCH 3/5] fix: json utils error with different python version --- tests/llm_web_kit/libs/test_standard_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/llm_web_kit/libs/test_standard_utils.py b/tests/llm_web_kit/libs/test_standard_utils.py index 181e8442..f2627d9f 100644 --- a/tests/llm_web_kit/libs/test_standard_utils.py +++ b/tests/llm_web_kit/libs/test_standard_utils.py @@ -72,4 +72,7 @@ def test_json_dumps(input_dict: dict, target_str) -> None: assert expected_obj[key] == value # 比较json_dumps的输出是否与target_str相等 - assert target_str == json_dumps(input_dict) + json_str = json_dumps(input_dict) # 由于不同的python版本,json_dumps的输出可能不同,所以需要比较json_loads的输出 + obj = json_loads(json_str) + for key, value in input_dict.items(): + assert obj[key] == value From d73fab9f108a0e64e911643678e89713ac7ae96b Mon Sep 17 00:00:00 2001 From: drunkpig Date: Tue, 11 Mar 2025 17:01:38 +0800 Subject: [PATCH 4/5] fix: json utils error with different python version --- tests/llm_web_kit/input/test_datajson.py | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index ce6b4798..ac0b9ea1 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -120,3 +120,29 @@ def test_data_json_deepcopy(): _ = DataJson(copied) cl = copied.get('content_list') # 不该变外部变量d assert cl is None + + def test_datajson_to_dict_immutable(): + """测试to_dict()返回的dict修改不会影响原DataJson对象.""" + data = { + DataJsonKey.DATASET_NAME: 'test_dataset', + DataJsonKey.FILE_FORMAT: 'html', + DataJsonKey.CONTENT_LIST: [ + {'type': 'text', 'content': 'test content'} + ] + } + datajson = DataJson(data) + + # Get dict representation + dict_data = datajson.to_dict() + + # Modify the returned dict + dict_data[DataJsonKey.DATASET_NAME] = 'modified_dataset' + dict_data[DataJsonKey.CONTENT_LIST][0]['content'] = 'modified content' + + # Original DataJson should remain unchanged + assert datajson.get_dataset_name() == 'test_dataset' + assert datajson.get_content_list()._get_data()[0]['content'] == 'test content' + + # Verify the modifications only affected the dict copy + assert dict_data[DataJsonKey.DATASET_NAME] == 'modified_dataset' + assert dict_data[DataJsonKey.CONTENT_LIST][0]['content'] == 'modified content' From 83a75c8a5f044bfb702a96f8079190f7334971cb Mon Sep 17 00:00:00 2001 From: drunkpig Date: Tue, 11 Mar 2025 17:35:56 +0800 Subject: [PATCH 5/5] fix: datajson.to_dict() --- llm_web_kit/input/datajson.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 0f3f6932..b086579c 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -473,7 +473,7 @@ def to_json(self, pretty=False) -> str: str: json字符串 """ json_dict = self.__json_data.copy() - json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list()._get_data() + json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list().to_dict() if pretty: return json.dumps(json_dict, indent=2, ensure_ascii=False) return json.dumps(json_dict, ensure_ascii=False) @@ -485,5 +485,5 @@ def to_dict(self) -> dict: dict: dict对象 """ json_dict = self.__json_data.copy() - json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list()._get_data() + json_dict[DataJsonKey.CONTENT_LIST] = self.get_content_list().to_dict() return json_dict