-# SPDX-License-Identifier: Apache-2.0
-# Standard
 import argparse
 import json
 import os
+import re
+from pathlib import Path

-# Third Party
-from transformers import AutoTokenizer
 import numpy as np
-
 from datasets import load_dataset
-import re
+from transformers import AutoTokenizer

-def extract_and_save_with_filtering():
+MIN_CHAR = 10
+MAX_CHAR = 1000
+
+
+def extract_and_save_with_filtering(file):
     """extract human prompts and apply filtering conditions"""
-
-    dataset = load_dataset('json', data_files='./ShareGPT.json', split='train')
-
+    dataset = load_dataset("json", data_files=file, split="train")
     filtered_prompts = []
-
+
     for example in dataset:
-        conversations = example.get('conversations', [])
-
+        conversations = example.get("conversations", [])
         if isinstance(conversations, list):
             for turn in conversations:
-                if turn.get('from') in ['human', 'user']:
-                    prompt_text = turn['value'].strip()
-
-                    # apply filtering conditions
-                    if (len(prompt_text) >= 10 and  # at least 10 characters
-                        len(prompt_text) <= 1000 and  # at most 1000 characters
-                        not prompt_text.startswith(('http://', 'https://')) and  # exclude URLs
-                        not re.search(r'[<>{}[\]\\]', prompt_text) and  # exclude special characters
-                        not prompt_text.isdigit()):  # exclude pure numbers
-
-                        filtered_prompts.append({
-                            'from': turn.get('from'),
-                            'text': prompt_text,
-                            'char_count': len(prompt_text),
-                            'word_count': len(prompt_text.split())
-                        })
-
+                if turn.get("from") in ["human", "user"]:
+                    prompt_text = turn["value"].strip()
+                    # apply filter conditions: at least MIN_CHAR characters
+                    if (
+                        len(prompt_text) >= MIN_CHAR
+                        # at most MAX_CHAR characters
+                        and len(prompt_text) <= MAX_CHAR
+                        # exclude URLs
+                        and not prompt_text.startswith(("http://", "https://"))
+                        # exclude special characters
+                        and not re.search(r"[<>{}[\]\\]", prompt_text)
+                        # exclude pure numbers
+                        and not prompt_text.isdigit()
+                    ):
+                        filtered_prompts.append(
+                            {
+                                "from": turn.get("from"),
+                                "text": prompt_text,
+                                "char_count": len(prompt_text),
+                                "word_count": len(prompt_text.split()),
+                            }
+                        )
+
     return filtered_prompts
-
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Process data percentage.")
     parser.add_argument(
@@ -50,13 +57,12 @@ def extract_and_save_with_filtering():
         default=1,
         help="The percentage of data to process (0 to 1). Default is 1 (100%).",
     )
-
     args = parser.parse_args()

-    with open("ShareGPT_V3_unfiltered_cleaned_split.json", "r", encoding="utf-8") as file:
+    sharegpt_file = "ShareGPT_V3_unfiltered_cleaned_split.json"
+    with Path(sharegpt_file).open("r", encoding="utf-8") as file:
         data = json.load(file)

-
     def estimate_num_tokens(text: str) -> int:
         if not hasattr(estimate_num_tokens, "tokenizer"):
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -65,15 +71,10 @@ def estimate_num_tokens(text: str) -> int:
             )
         return len(estimate_num_tokens.tokenizer.tokenize(text))

-
     num_of_ids = len(data)
-    print(f"Number of IDs: {num_of_ids}")
     data = data[: int(num_of_ids * args.parse)]
-
-    count = 0
-
     for d in data:
-        d["num_round"] = len(d["conversations"])  # human is one round, gpt is another round
+        d["num_round"] = len(d["conversations"])
         human_tokens = []
         gpt_tokens = []
         for conv in d["conversations"]:
@@ -96,15 +97,10 @@ def estimate_num_tokens(text: str) -> int:
         d["average_gpt_token"] = float(np.mean(gpt_tokens))
         d["max_gpt_token"] = float(np.max(gpt_tokens))

-        count += 1
-        print(f"Finished {count}")
-
     # save the unfiltered dataset to ShareGPT.json
-    with open("ShareGPT.json", "w", encoding="utf-8") as file:
+    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
         json.dump(data, file, ensure_ascii=False, indent=2)
     # filter the human prompts and save again
-    filtered_result = extract_and_save_with_filtering()
-    with open("ShareGPT.json", "w", encoding="utf-8") as file:
+    filtered_result = extract_and_save_with_filtering("ShareGPT.json")
+    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
         json.dump(filtered_result, file, ensure_ascii=False, indent=2)
-
-
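For anyone who wants to sanity-check the new filtering rules outside the script, below is a minimal sketch (not part of the patch; the `keep_prompt` helper is hypothetical) that mirrors the conditions used in `extract_and_save_with_filtering`, assuming the same MIN_CHAR=10 and MAX_CHAR=1000 thresholds:

```python
# Standalone sketch of the prompt filter used in the patch above.
import re

MIN_CHAR = 10
MAX_CHAR = 1000


def keep_prompt(prompt_text: str) -> bool:
    """Hypothetical helper mirroring the filter in extract_and_save_with_filtering."""
    return (
        MIN_CHAR <= len(prompt_text) <= MAX_CHAR  # length bounds
        and not prompt_text.startswith(("http://", "https://"))  # exclude URLs
        and not re.search(r"[<>{}[\]\\]", prompt_text)  # exclude special characters
        and not prompt_text.isdigit()  # exclude pure numbers
    )


print(keep_prompt("How do I sort a list of dicts by a key?"))  # True
print(keep_prompt("1234567890"))                               # False: purely digits
print(keep_prompt("https://example.com/some/page"))            # False: URL
```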