
Commit 15a9fd8

support ShareGPT dataset as data file
Signed-off-by: guangli.bao <guangli.bao@daocloud.io>
1 parent a4bdbb5 commit 15a9fd8

File tree

5 files changed: +167 -0 lines changed

Lines changed: 31 additions & 0 deletions
# ShareGPT Datasets

You can use ShareGPT_V3_unfiltered_cleaned_split.json as a benchmark dataset.

## Prerequisites

Before you begin, ensure you have the following installed:

* Python 3.9 or higher
* pip (Python package manager)

## Example Commands

Download and prepare the ShareGPT dataset. You can specify the proportion of data to process by passing a number between 0 and 1 as an argument to the script.

```bash
cd contrib/sharegpt_preprocess
pip install -r requirements.txt
bash prepare_sharegpt_data.sh 1
```

In this example, 1 means 100% of the dataset is processed; adjust this value as needed. A Conda environment is recommended for installing the dependencies.
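Before running the benchmark, you can optionally sanity-check the prepared file. The snippet below is a minimal sketch, assuming prepare_sharegpt_data.sh has written ShareGPT.json into the current directory with the record fields produced by preprocessing_sharegpt_data.py.

```python
# Minimal sanity check (optional). Assumes ShareGPT.json was written to the
# current directory by prepare_sharegpt_data.sh.
import json
from pathlib import Path

with Path("ShareGPT.json").open(encoding="utf-8") as f:
    prompts = json.load(f)

print(f"Loaded {len(prompts)} filtered prompts")
print(list(prompts[0].keys()))  # field names as written by preprocessing_sharegpt_data.py
```

Then point guidellm at the prepared file: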
```bash
guidellm benchmark \
  --target "http://localhost:8000" \
  --rate-type "throughput" \
  --data-args '{"prompt_column": "value", "split": "train"}' \
  --max-requests 10 \
  --data "/${local_path}/ShareGPT.json"
```

contrib/sharegpt_preprocess/__init__.py

Whitespace-only changes.
contrib/sharegpt_preprocess/prepare_sharegpt_data.sh

Lines changed: 4 additions & 0 deletions
#!/bin/bash

# Download the raw ShareGPT dump from Hugging Face.
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# Preprocess it; $1 is the proportion of the dataset to keep (0 to 1).
python3 preprocessing_sharegpt_data.py --parse $1
contrib/sharegpt_preprocess/preprocessing_sharegpt_data.py

Lines changed: 127 additions & 0 deletions
import argparse
import json
import os
import re
from pathlib import Path
from typing import Optional

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, PreTrainedTokenizerBase

MIN_CHAR = 10
MAX_CHAR = 1000


class TokenCounter:
    def __init__(self, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2"):
        self.model_name = model_name
        self._tokenizer: Optional[PreTrainedTokenizerBase] = None

    def _initialize_tokenizer(self) -> None:
        if self._tokenizer is None:
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
            try:
                self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            except (OSError, ImportError, ValueError) as e:
                raise RuntimeError(f"Failed to initialize tokenizer: {e}") from e

    def estimate_num_tokens(self, text: str) -> int:
        self._initialize_tokenizer()

        if self._tokenizer is None:
            return 0

        try:
            encoding = self._tokenizer(text, return_tensors=None)
            return len(encoding["input_ids"])
        except (AttributeError, TypeError, RuntimeError) as e:
            raise ValueError(f"Error processing text: {e}") from e


def extract_and_save_with_filtering(file):
    """Extract human prompts and apply filtering conditions."""
    dataset = load_dataset("json", data_files=file, split="train")
    filtered_prompts = []

    for example in dataset:
        conversations = example.get("conversations", [])
        if isinstance(conversations, list):
            for turn in conversations:
                if turn.get("from") in ["human", "user"]:
                    prompt_text = turn["value"].strip()
                    if (
                        # at least 10 characters
                        len(prompt_text) >= MIN_CHAR
                        # at most 1000 characters
                        and len(prompt_text) <= MAX_CHAR
                        # exclude URLs
                        and not prompt_text.startswith(("http://", "https://"))
                        # exclude special characters
                        and not re.search(r"[<>{}[\]\\]", prompt_text)
                        # exclude pure numbers
                        and not prompt_text.isdigit()
                    ):
                        filtered_prompts.append(
                            {
                                "from": turn.get("from"),
                                "text": prompt_text,
                                "char_count": len(prompt_text),
                                "word_count": len(prompt_text.split()),
                            }
                        )

    return filtered_prompts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process data percentage.")
    parser.add_argument(
        "--parse",
        type=float,
        default=1,
        help="The proportion of data to process (0 to 1). Default is 1 (100%%).",
    )
    args = parser.parse_args()

    sharegpt_file = "ShareGPT_V3_unfiltered_cleaned_split.json"
    with Path(sharegpt_file).open("r", encoding="utf-8") as file:
        data = json.load(file)

    counter = TokenCounter()
    num_of_ids = len(data)
    data = data[: int(num_of_ids * args.parse)]
    for d in data:
        d["num_round"] = len(d["conversations"])
        human_tokens = []
        gpt_tokens = []
        for conv in d["conversations"]:
            if conv["from"] == "human":
                human_tokens.append(counter.estimate_num_tokens(conv["value"]))
            if conv["from"] == "gpt":
                token_number = counter.estimate_num_tokens(conv["value"])
                conv["num_tokens"] = token_number
                gpt_tokens.append(token_number)
        if len(human_tokens) == 0:
            d["average_human_token"] = 0
            d["max_human_token"] = 0
        else:
            d["average_human_token"] = float(np.mean(human_tokens))
            d["max_human_token"] = float(np.max(human_tokens))
        if len(gpt_tokens) == 0:
            d["average_gpt_token"] = 0
            d["max_gpt_token"] = 0
        else:
            d["average_gpt_token"] = float(np.mean(gpt_tokens))
            d["max_gpt_token"] = float(np.max(gpt_tokens))

    # save the unfiltered, token-annotated dataset to ShareGPT.json
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
    # keep only "from": human prompts and save again
    filtered_result = extract_and_save_with_filtering("ShareGPT.json")
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(filtered_result, file, ensure_ascii=False, indent=2)
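For quick experimentation, the token-counting helper above can also be used on its own. The sketch below is illustrative only: it assumes the file is importable as preprocessing_sharegpt_data (the __main__ block only runs when the script is executed directly) and that the default mistralai/Mistral-7B-Instruct-v0.2 tokenizer is available locally or can be fetched from the Hugging Face Hub.

```python
# Illustrative sketch: count tokens for a single prompt with the TokenCounter above.
# Assumes the module name preprocessing_sharegpt_data and access to the
# mistralai/Mistral-7B-Instruct-v0.2 tokenizer (local cache or Hub download).
from preprocessing_sharegpt_data import TokenCounter

counter = TokenCounter()
print(counter.estimate_num_tokens("Explain the difference between threads and processes."))
```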
contrib/sharegpt_preprocess/requirements.txt

Lines changed: 5 additions & 0 deletions
tqdm==4.67.1
pandas==2.3.1
openai==1.99.9
datasets==4.0.0
transformers==4.55.4
