31 changes: 31 additions & 0 deletions contrib/sharegpt_preprocess/README.md
@@ -0,0 +1,31 @@
# ShareGPT Datasets

You can use `ShareGPT_V3_unfiltered_cleaned_split.json` as a benchmark dataset.
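
Each record in the raw file is a multi-turn conversation. A simplified sketch of the structure the preprocessing script expects (abbreviated, not a verbatim record):

```json
{
  "id": "...",
  "conversations": [
    {"from": "human", "value": "What is the capital of France?"},
    {"from": "gpt", "value": "The capital of France is Paris."}
  ]
}
```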

## Prerequisites
Before you begin, ensure you have the following installed:

* Python 3.9 or higher
* pip (Python package manager)
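
You can quickly verify both:

```bash
python3 --version  # should report 3.9 or newer
pip --version
```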

## Example Commands

Download and prepare the ShareGPT dataset. You can specify the proportion of data to process by passing a number between 0 and 1 as an argument to the script.

```bash
cd contrib/sharegpt_preprocess
pip install -r requirements.txt
bash prepare_sharegpt_data.sh 1
```

In this example, `1` means the entire dataset is processed; a smaller value such as `0.1` would process 10% of it. Installing the dependencies inside a Conda environment is recommended; a minimal setup is sketched below.
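
A possible Conda setup (the environment name and Python version here are illustrative choices, not requirements of the scripts):

```bash
conda create -n sharegpt-prep python=3.10 -y
conda activate sharegpt-prep
pip install -r requirements.txt
```

Once the data is prepared, you can benchmark the processed file with `guidellm`: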

```bash
guidellm benchmark \
--target "http://localhost:8000" \
--rate-type "throughput" \
  --data-args '{"prompt_column": "text", "split": "train"}' \
--max-requests 10 \
--data "/${local_path}/ShareGPT.json"
```
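
Here `${local_path}` is a placeholder for the directory containing the processed file; by default the scripts write `ShareGPT.json` into `contrib/sharegpt_preprocess`. After preprocessing, each entry in `ShareGPT.json` is a flat prompt record rather than a full conversation, roughly of this shape (illustrative values):

```json
{
  "from": "human",
  "text": "What is the capital of France?",
  "char_count": 30,
  "word_count": 6
}
```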
4 changes: 4 additions & 0 deletions contrib/sharegpt_preprocess/prepare_sharegpt_data.sh
@@ -0,0 +1,4 @@
#!/bin/bash
set -euo pipefail

# Usage: bash prepare_sharegpt_data.sh <fraction>
# Download the raw ShareGPT dump, then preprocess the requested fraction of it
# (defaults to the full dataset when no argument is given).
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 preprocessing_sharegpt_data.py --parse "${1:-1}"
127 changes: 127 additions & 0 deletions contrib/sharegpt_preprocess/preprocessing_sharegpt_data.py
@@ -0,0 +1,127 @@
import argparse
import json
import os
import re
from pathlib import Path
from typing import Optional

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, PreTrainedTokenizerBase

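# Character-length bounds applied when filtering human prompts.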
MIN_CHAR = 10
MAX_CHAR = 1000


class TokenCounter:
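    """Lazily initialize a tokenizer and estimate token counts for text."""
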
def __init__(self, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2"):
self.model_name = model_name
self._tokenizer: Optional[PreTrainedTokenizerBase] = None

def _initialize_tokenizer(self) -> None:
if self._tokenizer is None:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
try:
self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
except (OSError, ImportError, ValueError) as e:
raise RuntimeError(f"Failed to initialize tokenizer: {e}") from e

def estimate_num_tokens(self, text: str) -> int:
self._initialize_tokenizer()

if self._tokenizer is None:
return 0

try:
            encoding = self._tokenizer(text, return_tensors=None)
return len(encoding["input_ids"])
except (AttributeError, TypeError, RuntimeError) as e:
raise ValueError(f"Error processing text: {e}") from e


def extract_and_save_with_filtering(file):
"""substract human prompts and apply filtering conditions"""
dataset = load_dataset("json", data_files=file, split="train")
filtered_prompts = []

for example in dataset:
conversations = example.get("conversations", [])
if isinstance(conversations, list):
for turn in conversations:
if turn.get("from") in ["human", "user"]:
prompt_text = turn["value"].strip()
                    # Keep prompts whose length is within [MIN_CHAR, MAX_CHAR],
                    # that are not URLs, contain none of the special characters
                    # <>{}[]\, and are not purely numeric.
                    if (
                        MIN_CHAR <= len(prompt_text) <= MAX_CHAR
                        and not prompt_text.startswith(("http://", "https://"))
                        and not re.search(r"[<>{}[\]\\]", prompt_text)
                        and not prompt_text.isdigit()
                    ):
filtered_prompts.append(
{
"from": turn.get("from"),
"text": prompt_text,
"char_count": len(prompt_text),
"word_count": len(prompt_text.split()),
}
)

return filtered_prompts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocess a fraction of the ShareGPT dataset."
    )
parser.add_argument(
"--parse",
type=float,
default=1,
help="The percentage of data to process (0 to 1). Default is 1 (100%).",
)
args = parser.parse_args()

sharegpt_file = "ShareGPT_V3_unfiltered_cleaned_split.json"
with Path(sharegpt_file).open("r", encoding="utf-8") as file:
data = json.load(file)

counter = TokenCounter()
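
    # Keep only the requested fraction of the conversations.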
num_of_ids = len(data)
data = data[: int(num_of_ids * args.parse)]
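
    # Annotate each conversation with its turn count and per-role token statistics.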
for d in data:
d["num_round"] = len(d["conversations"])
human_tokens = []
gpt_tokens = []
for conv in d["conversations"]:
if conv["from"] == "human":
human_tokens.append(counter.estimate_num_tokens(conv["value"]))
if conv["from"] == "gpt":
token_number = counter.estimate_num_tokens(conv["value"])
conv["num_tokens"] = token_number
gpt_tokens.append(token_number)
if len(human_tokens) == 0:
d["average_human_token"] = 0
d["max_human_token"] = 0
else:
d["average_human_token"] = float(np.mean(human_tokens))
d["max_human_token"] = float(np.max(human_tokens))
if len(gpt_tokens) == 0:
d["average_gpt_token"] = 0
d["max_gpt_token"] = 0
else:
d["average_gpt_token"] = float(np.mean(gpt_tokens))
d["max_gpt_token"] = float(np.max(gpt_tokens))

    # Save the unfiltered, annotated dataset to ShareGPT.json.
with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
json.dump(data, file, ensure_ascii=False, indent=2)
    # Extract and filter the human prompts, then overwrite ShareGPT.json.
filtered_result = extract_and_save_with_filtering("ShareGPT.json")
with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
json.dump(filtered_result, file, ensure_ascii=False, indent=2)
5 changes: 5 additions & 0 deletions contrib/sharegpt_preprocess/requirements.txt
@@ -0,0 +1,5 @@
tqdm==4.67.1
pandas==2.3.1
openai==1.99.9
datasets==4.0.0
transformers==4.55.4
# numpy is imported directly by preprocessing_sharegpt_data.py
numpy