From 4afa1cf9da242a7becc471a913a2cb4a550f3f0c Mon Sep 17 00:00:00 2001 From: "xinjun.jiang" Date: Fri, 12 Sep 2025 14:09:26 +0800 Subject: [PATCH] feat(data): Default synthetic samples to max_requests Signed-off-by: xinjun.jiang --- src/guidellm/benchmark/entrypoints.py | 1 + src/guidellm/dataset/creator.py | 4 +++- src/guidellm/dataset/entrypoints.py | 2 ++ src/guidellm/dataset/file.py | 1 + src/guidellm/dataset/hf_datasets.py | 1 + src/guidellm/dataset/in_memory.py | 1 + src/guidellm/dataset/synthetic.py | 3 +++ src/guidellm/request/loader.py | 4 +++- 8 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index 2ef85c3e..3d206b86 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -89,6 +89,7 @@ async def benchmark_generative_text( else "infinite" # default to infinite so we don't run out of data ), random_seed=random_seed, + max_requests=max_requests, ) unique_requests = request_loader.num_unique_items(raise_err=False) console.print_line( diff --git a/src/guidellm/dataset/creator.py b/src/guidellm/dataset/creator.py index a74ec8c0..1f8acbbc 100644 --- a/src/guidellm/dataset/creator.py +++ b/src/guidellm/dataset/creator.py @@ -85,6 +85,7 @@ def create( processor_args: Optional[dict[str, Any]], random_seed: int = 42, split_pref_order: Optional[list[str]] = None, + max_requests: Optional[int] = None, ) -> tuple[Union[Dataset, IterableDataset], dict[ColumnInputTypes, str]]: if not cls.is_supported(data, data_args): raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ") @@ -92,7 +93,7 @@ def create( split = cls.extract_args_split(data_args) column_mappings = cls.extract_args_column_mappings(data_args) dataset = cls.handle_create( - data, data_args, processor, processor_args, random_seed + data, data_args, processor, processor_args, random_seed, max_requests ) if isinstance(dataset, (DatasetDict, IterableDatasetDict)): @@ -210,4 +211,5 @@ def handle_create( processor: Optional[Union[str, Path, PreTrainedTokenizerBase]], processor_args: Optional[dict[str, Any]], random_seed: int, + max_requests: Optional[int] = None, ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]: ... diff --git a/src/guidellm/dataset/entrypoints.py b/src/guidellm/dataset/entrypoints.py index cf689956..a83649dd 100644 --- a/src/guidellm/dataset/entrypoints.py +++ b/src/guidellm/dataset/entrypoints.py @@ -20,6 +20,7 @@ def load_dataset( processor_args: Optional[dict[str, Any]], random_seed: int = 42, split_pref_order: Optional[list[str]] = None, + max_requests: Optional[int] = None, ) -> tuple[Union[Dataset, IterableDataset], dict[ColumnInputTypes, str]]: creators = [ InMemoryDatasetCreator, @@ -37,6 +38,7 @@ def load_dataset( processor_args, random_seed, split_pref_order, + max_requests, ) raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ") diff --git a/src/guidellm/dataset/file.py b/src/guidellm/dataset/file.py index 5d6df1d9..185f887d 100644 --- a/src/guidellm/dataset/file.py +++ b/src/guidellm/dataset/file.py @@ -45,6 +45,7 @@ def handle_create( processor: Optional[Union[str, Path, PreTrainedTokenizerBase]], # noqa: ARG003 processor_args: Optional[dict[str, Any]], # noqa: ARG003 random_seed: int, # noqa: ARG003 + max_requests: Optional[int] = None, # noqa: ARG003 ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]: if not isinstance(data, (str, Path)): raise ValueError(f"Unsupported data type: {type(data)} given for {data}. ") diff --git a/src/guidellm/dataset/hf_datasets.py b/src/guidellm/dataset/hf_datasets.py index 7f91facd..863993e3 100644 --- a/src/guidellm/dataset/hf_datasets.py +++ b/src/guidellm/dataset/hf_datasets.py @@ -46,6 +46,7 @@ def handle_create( processor: Optional[Union[str, Path, PreTrainedTokenizerBase]], # noqa: ARG003 processor_args: Optional[dict[str, Any]], # noqa: ARG003 random_seed: int, # noqa: ARG003 + max_requests: Optional[int] = None, # noqa: ARG003 ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]: if isinstance(data, (str, Path)): data = load_dataset(data, **(data_args or {})) diff --git a/src/guidellm/dataset/in_memory.py b/src/guidellm/dataset/in_memory.py index af84f658..f2704d38 100644 --- a/src/guidellm/dataset/in_memory.py +++ b/src/guidellm/dataset/in_memory.py @@ -28,6 +28,7 @@ def handle_create( processor: Optional[Union[str, Path, PreTrainedTokenizerBase]], # noqa: ARG003 processor_args: Optional[dict[str, Any]], # noqa: ARG003 random_seed: int, # noqa: ARG003 + max_requests: Optional[int] = None, # noqa: ARG003 ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]: if not isinstance(data, Iterable): raise TypeError( diff --git a/src/guidellm/dataset/synthetic.py b/src/guidellm/dataset/synthetic.py index 8c30f0f7..eab46017 100644 --- a/src/guidellm/dataset/synthetic.py +++ b/src/guidellm/dataset/synthetic.py @@ -252,6 +252,7 @@ def handle_create( processor: Optional[Union[str, Path, PreTrainedTokenizerBase]], processor_args: Optional[dict[str, Any]], random_seed: int, + max_requests: Optional[int] = None, ) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]: processor = check_load_processor( processor, @@ -262,6 +263,8 @@ def handle_create( ) config = SyntheticDatasetConfig.parse_str(data) + if "samples=" not in str(data) and max_requests is not None: + config.samples = max_requests generator = SyntheticTextItemsGenerator(config, processor, random_seed) items = list(generator) diff --git a/src/guidellm/request/loader.py b/src/guidellm/request/loader.py index 48566976..c879fd8d 100644 --- a/src/guidellm/request/loader.py +++ b/src/guidellm/request/loader.py @@ -84,6 +84,7 @@ def __init__( shuffle: bool = True, iter_type: Literal["finite", "infinite"] = "finite", random_seed: int = 42, + max_requests: Optional[int] = None, ): self.data = data self.data_args = data_args @@ -93,6 +94,7 @@ def __init__( processor, processor_args, random_seed, + max_requests=max_requests, ) self.dataset = dataset self.processor = processor @@ -281,4 +283,4 @@ def _create_request(self, item: dict[str, Any]) -> GenerationRequest: constraints=( {"output_tokens": output_tokens} if output_tokens is not None else {} ), - ) + ) \ No newline at end of file