import argparse
import json
import os
import re
from pathlib import Path

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

MIN_CHAR = 10
MAX_CHAR = 1000


def extract_and_save_with_filtering(file):
    """Extract human prompts and apply the filtering conditions."""
    dataset = load_dataset("json", data_files=file, split="train")
    filtered_prompts = []

    for example in dataset:
        conversations = example.get("conversations", [])
        if isinstance(conversations, list):
            for turn in conversations:
                if turn.get("from") in ["human", "user"]:
                    prompt_text = turn["value"].strip()
                    # Keep prompts that are between 10 and 1000 characters,
                    # are not bare URLs, contain no special characters, and
                    # are not pure numbers.
                    if (
                        MIN_CHAR <= len(prompt_text) <= MAX_CHAR
                        and not prompt_text.startswith(("http://", "https://"))
                        and not re.search(r"[<>{}[\]\\]", prompt_text)
                        and not prompt_text.isdigit()
                    ):
                        filtered_prompts.append(
                            {
                                "from": turn.get("from"),
                                "text": prompt_text,
                                "char_count": len(prompt_text),
                                "word_count": len(prompt_text.split()),
                            }
                        )

    return filtered_prompts
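
# A minimal sketch of how the filters above behave, on hypothetical inputs:
#   "What is the capital of France?"  -> kept (30 chars, no special characters)
#   "42"                              -> dropped (too short and digits only)
#   "https://example.com"             -> dropped (bare URL)
#   "use <b>bold</b> tags"            -> dropped (contains "<" and ">")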


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process a fraction of the data.")
    parser.add_argument(
        "--parse",
        type=float,
        default=1,
        # argparse %-formats help strings, so a literal percent sign must be
        # escaped as "%%" or printing --help raises a ValueError.
        help="Fraction of the data to process (0 to 1). Default is 1 (100%%).",
    )
    args = parser.parse_args()

    sharegpt_file = "ShareGPT_V3_unfiltered_cleaned_split.json"
    with Path(sharegpt_file).open("r", encoding="utf-8") as file:
        data = json.load(file)

    def estimate_num_tokens(text: str) -> int:
        # Lazily instantiate the tokenizer once and cache it as a function
        # attribute so repeated calls reuse it.
        if not hasattr(estimate_num_tokens, "tokenizer"):
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
            estimate_num_tokens.tokenizer = AutoTokenizer.from_pretrained(
                "mistralai/Mistral-7B-Instruct-v0.2"
            )
        return len(estimate_num_tokens.tokenizer.tokenize(text))

    # Keep the first `--parse` fraction of conversations and annotate each
    # with its round count and per-role token statistics.
    num_of_ids = len(data)
    data = data[: int(num_of_ids * args.parse)]
    for d in data:
        d["num_round"] = len(d["conversations"])
        human_tokens = []
        gpt_tokens = []
        for conv in d["conversations"]:
            if conv["from"] == "human":
                human_tokens.append(estimate_num_tokens(conv["value"]))
            if conv["from"] == "gpt":
                token_number = estimate_num_tokens(conv["value"])
                conv["num_tokens"] = token_number
                gpt_tokens.append(token_number)
        if len(human_tokens) == 0:
            d["average_human_token"] = 0
            d["max_human_token"] = 0
        else:
            d["average_human_token"] = float(np.mean(human_tokens))
            d["max_human_token"] = float(np.max(human_tokens))
        if len(gpt_tokens) == 0:
            d["average_gpt_token"] = 0
            d["max_gpt_token"] = 0
        else:
            d["average_gpt_token"] = float(np.mean(gpt_tokens))
            d["max_gpt_token"] = float(np.max(gpt_tokens))

    # Save the annotated (still unfiltered) dataset to ShareGPT.json.
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
    # Extract the "human"/"user" prompts, apply the filters, and overwrite
    # the same file with the result.
    filtered_result = extract_and_save_with_filtering("ShareGPT.json")
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(filtered_result, file, ensure_ascii=False, indent=2)
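
# Usage sketch: assuming this script is saved as prepare_sharegpt.py (a
# hypothetical name) next to ShareGPT_V3_unfiltered_cleaned_split.json,
#
#     python prepare_sharegpt.py --parse 0.1
#
# processes the first 10% of conversations, annotates each "gpt" turn with
# its Mistral-7B-Instruct-v0.2 token count, and overwrites ShareGPT.json
# with the filtered human prompts.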