Commit d246dee

support ShareGPT dataset as data file
Signed-off-by: guangli.bao <guangli.bao@daocloud.io>
1 parent ad9513f commit d246dee

File tree: 4 files changed, +144 -0 lines changed

docs/datasets.md

Lines changed: 30 additions & 0 deletions
@@ -220,3 +220,33 @@ benchmark_generative_text(data=data, ...)
- For lists of dictionaries, all items must have the same keys.
- For lists of items, all elements must be of the same type.
- A processor/tokenizer is only required if `GUIDELLM__PREFERRED_PROMPT_TOKENS_SOURCE="local"` or `GUIDELLM__PREFERRED_OUTPUT_TOKENS_SOURCE="local"` is set in the environment. In this case, the processor/tokenizer must be specified using the `--processor` argument. If not set, the processor/tokenizer will be set to the model passed in or retrieved from the server.

### ShareGPT Datasets

You can use the `ShareGPT_V3_unfiltered_cleaned_split.json` dataset as a benchmark data file.

1. Download and prepare the ShareGPT dataset.

   You can specify the proportion of data to process by passing a number between 0 and 1 as an argument to the script:

   ```bash
   cd src/guidellm/utils
   pip install -r requirements.txt
   bash prepare_sharegpt_data.sh 1
   ```

   In this example, `1` means 100% of the dataset is processed; adjust this value as needed.

   A Conda environment is recommended for installing these libraries (see the sketch after this list).

2. Run the benchmark. For example:

   ```bash
   guidellm benchmark \
     --target "http://localhost:8000" \
     --rate-type "throughput" \
     --data-args '{"prompt_column": "value", "split": "train"}' \
     --max-requests 10 \
     --data "/${local_path}/ShareGPT.json"
   ```
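A minimal sketch of the Conda setup recommended in step 1; the environment name `guidellm-data` and the Python version are illustrative assumptions, not part of the original instructions:

```bash
# Illustrative Conda setup; environment name and Python version are assumptions.
conda create -n guidellm-data python=3.10 -y
conda activate guidellm-data

# Install the preprocessing dependencies and prepare the dataset (as in step 1).
cd src/guidellm/utils
pip install -r requirements.txt
bash prepare_sharegpt_data.sh 1
```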
src/guidellm/utils/prepare_sharegpt_data.sh

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
#!/bin/bash

# Download the raw ShareGPT dataset and run the preprocessing script.
# $1 is the fraction of the dataset to process (a value between 0 and 1).
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 sharegpt_data_preprocessing.py --parse $1
src/guidellm/utils/requirements.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
tqdm
pandas
openai
pyyaml
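Note that `sharegpt_data_preprocessing.py` (below) also imports `numpy`, `datasets`, and `transformers` for tokenization, and these are not listed in this requirements file. If they are not already available in your environment, a hedged sketch of the extra install would be:

```bash
# Supplemental packages used by the preprocessing script but not listed above.
pip install numpy datasets transformers
```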
src/guidellm/utils/sharegpt_data_preprocessing.py

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
import argparse
import json
import os
import re
from pathlib import Path

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

MIN_CHAR = 10
MAX_CHAR = 1000


def extract_and_save_with_filtering(file):
    """Extract human prompts and apply filtering conditions."""
    dataset = load_dataset("json", data_files=file, split="train")
    filtered_prompts = []

    for example in dataset:
        conversations = example.get("conversations", [])
        if isinstance(conversations, list):
            for turn in conversations:
                if turn.get("from") in ["human", "user"]:
                    prompt_text = turn["value"].strip()
                    # Apply filter conditions:
                    if (
                        # at least MIN_CHAR (10) characters
                        len(prompt_text) >= MIN_CHAR
                        # at most MAX_CHAR (1000) characters
                        and len(prompt_text) <= MAX_CHAR
                        # exclude URLs
                        and not prompt_text.startswith(("http://", "https://"))
                        # exclude prompts containing special characters
                        and not re.search(r"[<>{}[\]\\]", prompt_text)
                        # exclude pure numbers
                        and not prompt_text.isdigit()
                    ):
                        filtered_prompts.append(
                            {
                                "from": turn.get("from"),
                                "text": prompt_text,
                                "char_count": len(prompt_text),
                                "word_count": len(prompt_text.split()),
                            }
                        )

    return filtered_prompts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process a fraction of the ShareGPT data.")
    parser.add_argument(
        "--parse",
        type=float,
        default=1,
        help="The fraction of data to process (0 to 1). Default is 1, i.e. the full dataset.",
    )
    args = parser.parse_args()

    sharegpt_file = "ShareGPT_V3_unfiltered_cleaned_split.json"
    with Path(sharegpt_file).open("r", encoding="utf-8") as file:
        data = json.load(file)

    def estimate_num_tokens(text: str) -> int:
        # Lazily load and cache the tokenizer on the function object.
        if not hasattr(estimate_num_tokens, "tokenizer"):
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
            estimate_num_tokens.tokenizer = AutoTokenizer.from_pretrained(
                "mistralai/Mistral-7B-Instruct-v0.2"
            )
        return len(estimate_num_tokens.tokenizer.tokenize(text))

    # Keep only the requested fraction of the conversations.
    num_of_ids = len(data)
    data = data[: int(num_of_ids * args.parse)]
    for d in data:
        d["num_round"] = len(d["conversations"])
        human_tokens = []
        gpt_tokens = []
        for conv in d["conversations"]:
            if conv["from"] == "human":
                human_tokens.append(estimate_num_tokens(conv["value"]))
            if conv["from"] == "gpt":
                token_number = estimate_num_tokens(conv["value"])
                conv["num_tokens"] = token_number
                gpt_tokens.append(token_number)
        if len(human_tokens) == 0:
            d["average_human_token"] = 0
            d["max_human_token"] = 0
        else:
            d["average_human_token"] = float(np.mean(human_tokens))
            d["max_human_token"] = float(np.max(human_tokens))
        if len(gpt_tokens) == 0:
            d["average_gpt_token"] = 0
            d["max_gpt_token"] = 0
        else:
            d["average_gpt_token"] = float(np.mean(gpt_tokens))
            d["max_gpt_token"] = float(np.max(gpt_tokens))

    # Save the unfiltered, token-annotated dataset to ShareGPT.json
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
    # Filter the "from: human" prompts and overwrite ShareGPT.json
    filtered_result = extract_and_save_with_filtering("ShareGPT.json")
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(filtered_result, file, ensure_ascii=False, indent=2)
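For reference, the preprocessing script can also be invoked directly (rather than through `prepare_sharegpt_data.sh`) once `ShareGPT_V3_unfiltered_cleaned_split.json` is present in the working directory; for example, to process half of the records:

```bash
# Process 50% of the downloaded ShareGPT data and write the result to ShareGPT.json.
python3 sharegpt_data_preprocessing.py --parse 0.5
```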
