From e20e2ba505c167dd87878c622142550caabe8853 Mon Sep 17 00:00:00 2001 From: Harshith-umesh Date: Mon, 21 Jul 2025 16:33:19 -0400 Subject: [PATCH 1/3] Add dataset-from-file command to extract datasets from benchmark reports --- docs/preprocess.md | 134 +++++++ src/guidellm/__main__.py | 58 +++ src/guidellm/preprocess/__init__.py | 3 +- src/guidellm/preprocess/dataset_from_file.py | 185 ++++++++++ .../test_dataset_from_file_entrypoint.py | 346 ++++++++++++++++++ 5 files changed, 725 insertions(+), 1 deletion(-) create mode 100644 docs/preprocess.md create mode 100644 src/guidellm/preprocess/dataset_from_file.py create mode 100644 tests/unit/entrypoints/test_dataset_from_file_entrypoint.py diff --git a/docs/preprocess.md b/docs/preprocess.md new file mode 100644 index 00000000..58d1ab9c --- /dev/null +++ b/docs/preprocess.md @@ -0,0 +1,134 @@ +# Preprocess Commands + +GuideLLM provides preprocessing capabilities to transform and prepare data for benchmarking workflows. The preprocess module includes tools for creating datasets from existing benchmark results, enabling "apples-to-apples" comparisons and reusable benchmark datasets. + +## Overview + +The `guidellm preprocess` command provides utilities to: + +- **Extract datasets from benchmark results**: Convert completed benchmark reports into reusable datasets with known prompt and output token counts for consistent comparisons + + +## Commands + +### `dataset-from-file` + +Extracts prompts and their corresponding output token counts from saved benchmark report files to create datasets for future benchmarking runs. + +#### Purpose + +When you run a benchmark with GuideLLM, you get detailed results about how a model performed with specific prompts. The `dataset-from-file` command allows you to extract those successful prompt-response pairs and convert them into a standardized dataset format. This enables: + +1. **Consistent Comparisons**: Use the exact same prompts across different models or configurations +2. **Known Expectations**: Each prompt comes with its expected output token count +3. 
**Reproducible Benchmarks**: Eliminate variability from different prompts when comparing models + +#### Syntax + +```bash +guidellm preprocess dataset-from-file [OPTIONS] BENCHMARK_FILE +``` + +#### Arguments + +- `BENCHMARK_FILE`: Path to the saved benchmark report file (JSON format) + +#### Options + +- `-o, --output-path PATH`: Output dataset file path (default: `dataset_from_benchmark.json`) +- `--show-stats`: Show dataset statistics after creation +- `--disable-console-outputs`: Disable console output for silent operation +- `--help`: Show help message and exit + +#### Example Usage + +##### Basic Usage + +```bash +# Convert a benchmark report to a dataset +guidellm preprocess dataset-from-file benchmark-results.json + +# Specify custom output path +guidellm preprocess dataset-from-file benchmark-results.json -o my_dataset.json + +# Show statistics about the created dataset +guidellm preprocess dataset-from-file benchmark-results.json --show-stats +``` + +#### Input File Requirements + +The input benchmark file must be a valid GuideLLM benchmark report containing: + +- **Valid JSON format**: The file must be properly formatted +- **Benchmark report structure**: Must contain the expected benchmark report schema +- **Successful requests**: Must contain at least one successful request to extract data from + +##### Supported Input Formats + +```json +{ + "benchmarks": [ + { + "requests": { + "successful": [ + { + "prompt": "What is the capital of France?", + "output_tokens": 5, + "... other request fields ..." + } + ], + "errored": [], + "incomplete": [] + } + } + ] +} +``` + +#### Output Format + +The generated dataset follows this structure: + +```json +{ + "version": "1.0", + "description": "Dataset created from benchmark results for apples-to-apples comparisons", + "data": [ + { + "prompt": "What is the capital of France?", + "output_tokens_count": 5, + "prompt_tokens_count": 12 + }, + { + "prompt": "Explain quantum computing in simple terms.", + "output_tokens_count": 45, + "prompt_tokens_count": 8 + } + ] +} +``` + + +Each data item contains: +- `prompt`: The original prompt text +- `output_tokens_count`: The number of tokens in the model's response +- `prompt_tokens_count`: The number of tokens in the original prompt + +#### Statistics Output + +When using `--show-stats`, you'll see detailed information about the created dataset: + +``` +Dataset Statistics: +================== +Total items: 95 +Prompt length statistics: + Min: 8 characters + Max: 245 characters + Mean: 87.3 characters +Output tokens statistics: + Min: 1 tokens + Max: 512 tokens + Mean: 124.8 tokens +``` + diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index f0bd9043..648c5a54 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -15,6 +15,7 @@ from guidellm.benchmark.scenario import GenerativeTextScenario, get_builtin_scenarios from guidellm.config import print_config from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset +from guidellm.preprocess.dataset_from_file import create_dataset_from_file, DatasetCreationError from guidellm.scheduler import StrategyType from guidellm.utils import DefaultGroupHandler from guidellm.utils import cli as cli_tools @@ -491,6 +492,12 @@ def dataset( hub_dataset_id, random_seed, ): + """ + Convert a dataset to have specific prompt and output token counts. + + This creates a filtered and processed dataset where prompts and outputs + match specified token counts, useful for consistent benchmarking. 
+ """ process_dataset( data=data, output_path=output_path, @@ -508,5 +515,56 @@ def dataset( ) +@preprocess.command("dataset-from-file", help="Create a dataset from a saved benchmark report file.") +@click.argument( + "benchmark_file", + type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), +) +@click.option( + "-o", + "--output-path", + type=click.Path(file_okay=True, dir_okay=False, path_type=Path), + default=Path("dataset_from_benchmark.json"), + help="Output dataset file path.", +) +@click.option( + "--show-stats", + is_flag=True, + help="Show dataset statistics after creation.", +) +@click.option( + "--disable-console-outputs", + is_flag=True, + help="Set this flag to disable console output.", +) +def dataset_from_file( + benchmark_file, + output_path, + show_stats, + disable_console_outputs, +): + """ + Create a dataset from a saved benchmark report file. + + This extracts prompts and their corresponding output token counts from + benchmark results to create an 'apples-to-apples' comparison dataset. + + BENCHMARK_FILE: Path to the benchmark results JSON file. + """ + try: + create_dataset_from_file( + benchmark_file=benchmark_file, + output_path=Path(output_path), + show_stats=show_stats, + enable_console=not disable_console_outputs, + ) + except DatasetCreationError as e: + # To print clean error message without a traceback + if not disable_console_outputs: + click.echo(f"Error: {e}", err=True) + ctx = click.get_current_context() + ctx.exit(1) + + if __name__ == "__main__": cli() diff --git a/src/guidellm/preprocess/__init__.py b/src/guidellm/preprocess/__init__.py index 95d01e5f..a53b378f 100644 --- a/src/guidellm/preprocess/__init__.py +++ b/src/guidellm/preprocess/__init__.py @@ -1,3 +1,4 @@ from .dataset import ShortPromptStrategy, process_dataset +from .dataset_from_file import create_dataset_from_file, DatasetCreationError -__all__ = ["ShortPromptStrategy", "process_dataset"] +__all__ = ["ShortPromptStrategy", "process_dataset", "create_dataset_from_file", "DatasetCreationError"] diff --git a/src/guidellm/preprocess/dataset_from_file.py b/src/guidellm/preprocess/dataset_from_file.py new file mode 100644 index 00000000..a3e47d82 --- /dev/null +++ b/src/guidellm/preprocess/dataset_from_file.py @@ -0,0 +1,185 @@ +""" +Module for creating datasets from saved benchmark report files. + +This module provides functionality to extract prompts and their corresponding +output token counts from benchmark results to create datasets for future +'apples-to-apples' comparisons. +""" + +import json +import sys +from pathlib import Path +from typing import Any + +from guidellm.benchmark.output import GenerativeBenchmarksReport + +__all__ = [ + "validate_benchmark_file", + "extract_dataset_from_benchmark_report", + "save_dataset_from_benchmark", + "print_dataset_statistics", + "create_dataset_from_file", + "DatasetCreationError", +] + + +class DatasetCreationError(Exception): + """Exception raised when dataset creation fails.""" + pass + + +def validate_benchmark_file(filepath: Path) -> GenerativeBenchmarksReport: + """ + Validate that the file is a proper GuideLLM benchmark report. 
+ + Args: + filepath: Path to the benchmark report file + + Returns: + GenerativeBenchmarksReport: The validated and loaded report + + Raises: + DatasetCreationError: If file validation fails + """ + try: + report = GenerativeBenchmarksReport.load_file(filepath) + + if not report.benchmarks: + raise DatasetCreationError("Benchmark report contains no benchmark data") + + return report + + except Exception as e: + raise DatasetCreationError( + f"Invalid benchmark report file '{filepath}': {e}" + ) from e + + +def extract_dataset_from_benchmark_report(report: GenerativeBenchmarksReport) -> list[dict[str, Any]]: + """ + Extract prompts and output tokens from a validated benchmark report. + + Args: + report: A validated GenerativeBenchmarksReport instance + + Returns: + List of dataset items with prompt and token information + """ + dataset_items = [] + + for benchmark in report.benchmarks: + requests_breakdown = benchmark.requests + + successful_requests = requests_breakdown.successful + + for request in successful_requests: + prompt = request.prompt + output_tokens = request.output_tokens + prompt_tokens = request.prompt_tokens + + if prompt and output_tokens > 0: + dataset_items.append({ + "prompt": prompt, + "output_tokens": output_tokens, + "prompt_tokens": prompt_tokens, + }) + + return dataset_items + + +def save_dataset_from_benchmark(dataset_items: list[dict[str, Any]], output_file: Path) -> None: + """Save the dataset to a JSON file.""" + # Convert to the format expected by guidellm documentation + formatted_items = [] + for item in dataset_items: + formatted_items.append({ + "prompt": item["prompt"], + "output_tokens_count": item["output_tokens"], + "prompt_tokens_count": item["prompt_tokens"], + }) + + dataset_data = { + "version": "1.0", + "description": "Dataset created from benchmark results for apples-to-apples comparisons", + "data": formatted_items + } + + with output_file.open("w") as f: + json.dump(dataset_data, f, indent=2) + + +def print_dataset_statistics(dataset_items: list[dict[str, Any]], enable_console: bool = True) -> None: + """Print statistics about the dataset.""" + if not enable_console: + return + + if not dataset_items: + print("No valid items found in dataset", file=sys.stderr) + return + + total_items = len(dataset_items) + prompt_tokens = [item["prompt_tokens"] for item in dataset_items] + output_tokens = [item["output_tokens"] for item in dataset_items] + + print(f"\nDataset Statistics:") + print(f"Total items: {total_items}") + print(f"Prompt tokens - Min: {min(prompt_tokens)}, Max: {max(prompt_tokens)}, Mean: {sum(prompt_tokens)/len(prompt_tokens):.1f}") + print(f"Output tokens - Min: {min(output_tokens)}, Max: {max(output_tokens)}, Mean: {sum(output_tokens)/len(output_tokens):.1f}") + + +def create_dataset_from_file( + benchmark_file: Path, + output_path: Path, + show_stats: bool = False, + enable_console: bool = True, +) -> None: + """ + Create a dataset from a saved benchmark report file. + + This function validates the benchmark file format, loads it using the same + validation as the 'from-file' command, then extracts prompts and their + corresponding output token counts from successful requests. 
+ + Args: + benchmark_file: Path to the benchmark results JSON/YAML file + output_path: Path where the dataset should be saved + show_stats: Whether to display dataset statistics + enable_console: Whether to enable console output + + Raises: + DatasetCreationError: If validation fails or no valid requests found + """ + if enable_console: + print(f"Validating benchmark report file: {benchmark_file}") + + try: + report = validate_benchmark_file(benchmark_file) + + if enable_console: + print(f"Valid benchmark report with {len(report.benchmarks)} benchmark(s)") + print(f"Loading and extracting dataset from benchmark results...") + + dataset_items = extract_dataset_from_benchmark_report(report) + + if not dataset_items: + error_msg = "No valid requests with prompts and output tokens found in benchmark report" + if enable_console: + print(f"Error: {error_msg}", file=sys.stderr) + raise DatasetCreationError(error_msg) + + save_dataset_from_benchmark(dataset_items, output_path) + + if enable_console: + print(f"Dataset saved to: {output_path}") + print(f"Success, Created dataset with {len(dataset_items)} items") + print(f"You can now use this dataset for future guidellm runs by specifying: --data {output_path}") + + if show_stats: + print_dataset_statistics(dataset_items, enable_console) + + except DatasetCreationError: + raise + except Exception as e: + if enable_console: + print(f"Unexpected error: {e}", file=sys.stderr) + raise DatasetCreationError(f"Failed to process benchmark file: {e}") from e \ No newline at end of file diff --git a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py new file mode 100644 index 00000000..491da4b9 --- /dev/null +++ b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py @@ -0,0 +1,346 @@ +import json +import os +import tempfile +import unittest +from pathlib import Path + +import pytest + +from guidellm.preprocess.dataset_from_file import ( + DatasetCreationError, + create_dataset_from_file, + validate_benchmark_file, + extract_dataset_from_benchmark_report, + save_dataset_from_benchmark, + print_dataset_statistics, +) + +REGENERATE_ARTIFACTS = False + + +@pytest.fixture +def get_test_asset_dir(): + def _() -> Path: + return Path(__file__).parent / "assets" + + return _ + + +@pytest.fixture +def cleanup(): + to_delete: list[Path] = [] + yield to_delete + for item in to_delete: + if item.exists(): + item.unlink() # Deletes the file + + +@pytest.fixture +def temp_file(): + """Create a temporary file that gets cleaned up automatically.""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + temp_path = Path(f.name) + yield temp_path + if temp_path.exists(): + temp_path.unlink() + + +def test_create_dataset_from_valid_benchmark_json(get_test_asset_dir, cleanup): + """Test creating dataset from a valid benchmark JSON file.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + output_file = asset_dir / "test_dataset_output.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + assert output_file.exists() + + with output_file.open() as f: + dataset = json.load(f) + + # Verify dataset structure + assert "version" in dataset + assert "description" in dataset + assert "data" in dataset + assert isinstance(dataset["data"], list) + assert len(dataset["data"]) > 0 + + # Verify each dataset item has required 
fields + for item in dataset["data"]: + assert "prompt" in item + assert "output_tokens_count" in item + assert "prompt_tokens_count" in item + assert isinstance(item["prompt"], str) + assert isinstance(item["output_tokens_count"], int) + assert isinstance(item["prompt_tokens_count"], int) + assert len(item["prompt"]) > 0 + assert item["output_tokens_count"] > 0 + + +def test_create_dataset_from_valid_benchmark_yaml(get_test_asset_dir, cleanup): + """Test creating dataset from a valid benchmark YAML file.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.yaml" + output_file = asset_dir / "test_dataset_yaml_output.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + assert output_file.exists() + + with output_file.open() as f: + dataset = json.load(f) + + assert "data" in dataset + assert len(dataset["data"]) > 0 + + +def test_create_dataset_with_stats_output(capfd, get_test_asset_dir, cleanup): + """Test creating dataset with statistics output enabled.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + output_file = asset_dir / "test_dataset_stats_output.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=True, + enable_console=True, + ) + + # Verify console output includes statistics + out, err = capfd.readouterr() + assert "Validating benchmark report file" in out + assert "Valid benchmark report with" in out + assert "Dataset saved to" in out + assert "Success, Created dataset with" in out + assert "Dataset Statistics:" in out + assert "Total items:" in out + assert "Prompt tokens - Min:" in out + assert "Output tokens - Min:" in out + + +def test_create_dataset_with_console_disabled(capfd, get_test_asset_dir, cleanup): + """Test creating dataset with console output disabled.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + output_file = asset_dir / "test_dataset_no_console.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=True, + enable_console=False, + ) + + # Verify no console output + out, err = capfd.readouterr() + assert out == "" + assert err == "" + + assert output_file.exists() + + +def test_validate_benchmark_file_valid_file(get_test_asset_dir): + """Test validation with a valid benchmark file.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + + report = validate_benchmark_file(source_file) + assert report is not None + assert len(report.benchmarks) > 0 + + +def test_validate_benchmark_file_invalid_json(temp_file): + """Test validation with invalid JSON.""" + # Write invalid JSON + temp_file.write_text("This is not JSON") + + with pytest.raises(DatasetCreationError) as exc_info: + validate_benchmark_file(temp_file) + + assert "Invalid benchmark report file" in str(exc_info.value) + assert "Expecting value" in str(exc_info.value) + + +def test_validate_benchmark_file_invalid_structure(temp_file): + """Test validation with valid JSON but invalid benchmark structure.""" + # Write valid JSON but wrong structure + temp_file.write_text('{"invalid": "structure"}') + + with pytest.raises(DatasetCreationError) as exc_info: + validate_benchmark_file(temp_file) + + assert "Invalid benchmark report file" in str(exc_info.value) + + 
+def test_validate_benchmark_file_no_benchmarks(temp_file): + """Test validation with valid structure but no benchmarks.""" + # Write valid structure but empty benchmarks + temp_file.write_text('{"benchmarks": []}') + + with pytest.raises(DatasetCreationError) as exc_info: + validate_benchmark_file(temp_file) + + assert "Benchmark report contains no benchmark data" in str(exc_info.value) + + +def test_extract_dataset_from_benchmark_report(get_test_asset_dir): + """Test extracting dataset from a validated benchmark report.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + + # First validate and load the report + report = validate_benchmark_file(source_file) + + # Extract dataset + dataset_items = extract_dataset_from_benchmark_report(report) + + assert len(dataset_items) > 0 + + # Verify structure of extracted items + for item in dataset_items: + assert "prompt" in item + assert "output_tokens" in item + assert "prompt_tokens" in item + assert len(item["prompt"]) > 0 + assert item["output_tokens"] > 0 + assert item["prompt_tokens"] > 0 + + +def test_save_dataset_from_benchmark(cleanup): + """Test saving dataset to file.""" + # Create test dataset items + dataset_items = [ + { + "prompt": "Test prompt 1", + "output_tokens": 100, + "prompt_tokens": 50, + }, + { + "prompt": "Test prompt 2", + "output_tokens": 200, + "prompt_tokens": 75, + } + ] + + output_file = Path("test_save_dataset.json") + cleanup.append(output_file) + + # Save dataset + save_dataset_from_benchmark(dataset_items, output_file) + + # Verify file exists and has correct structure + assert output_file.exists() + + with output_file.open() as f: + saved_data = json.load(f) + + assert "version" in saved_data + assert "description" in saved_data + assert "data" in saved_data + assert len(saved_data["data"]) == 2 + + # Verify field names are converted correctly + for item in saved_data["data"]: + assert "prompt" in item + assert "output_tokens_count" in item + assert "prompt_tokens_count" in item + + +def test_print_dataset_statistics_with_data(capfd): + """Test printing statistics with valid dataset.""" + dataset_items = [ + {"prompt": "Test 1", "output_tokens": 100, "prompt_tokens": 50}, + {"prompt": "Test 2", "output_tokens": 200, "prompt_tokens": 75}, + {"prompt": "Test 3", "output_tokens": 150, "prompt_tokens": 60}, + ] + + print_dataset_statistics(dataset_items, enable_console=True) + + out, err = capfd.readouterr() + assert "Dataset Statistics:" in out + assert "Total items: 3" in out + assert "Prompt tokens - Min: 50, Max: 75, Mean: 61.7" in out + assert "Output tokens - Min: 100, Max: 200, Mean: 150.0" in out + + +def test_print_dataset_statistics_empty_dataset(capfd): + """Test printing statistics with empty dataset.""" + dataset_items = [] + + print_dataset_statistics(dataset_items, enable_console=True) + + out, err = capfd.readouterr() + assert "No valid items found in dataset" in err + + +def test_print_dataset_statistics_console_disabled(capfd): + """Test printing statistics with console disabled.""" + dataset_items = [ + {"prompt": "Test", "output_tokens": 100, "prompt_tokens": 50}, + ] + + print_dataset_statistics(dataset_items, enable_console=False) + + out, err = capfd.readouterr() + assert out == "" + assert err == "" + + +def test_create_dataset_from_file_nonexistent_file(): + """Test error handling for nonexistent file.""" + nonexistent_file = Path("does_not_exist.json") + output_file = Path("output.json") + + with pytest.raises(DatasetCreationError): + 
create_dataset_from_file( + benchmark_file=nonexistent_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + +def test_create_dataset_from_file_no_successful_requests(temp_file): + """Test handling of benchmark with no successful requests.""" + # Create benchmark with no successful requests + benchmark_data = { + "benchmarks": [{ + "requests": { + "successful": [], + "errored": [], + "incomplete": [] + } + }] + } + temp_file.write_text(json.dumps(benchmark_data)) + + output_file = Path("output.json") + + with pytest.raises(DatasetCreationError) as exc_info: + create_dataset_from_file( + benchmark_file=temp_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + assert "Invalid benchmark report file" in str(exc_info.value) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 52d054e0e53fffbd296e37eb1f053d94f77af8a4 Mon Sep 17 00:00:00 2001 From: Harshith-umesh Date: Tue, 22 Jul 2025 13:36:45 -0400 Subject: [PATCH 2/3] Fix comments in tests --- .../test_dataset_from_file_entrypoint.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py index 491da4b9..857dc7b7 100644 --- a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py +++ b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py @@ -64,14 +64,12 @@ def test_create_dataset_from_valid_benchmark_json(get_test_asset_dir, cleanup): with output_file.open() as f: dataset = json.load(f) - # Verify dataset structure assert "version" in dataset assert "description" in dataset assert "data" in dataset assert isinstance(dataset["data"], list) assert len(dataset["data"]) > 0 - # Verify each dataset item has required fields for item in dataset["data"]: assert "prompt" in item assert "output_tokens_count" in item @@ -120,7 +118,6 @@ def test_create_dataset_with_stats_output(capfd, get_test_asset_dir, cleanup): enable_console=True, ) - # Verify console output includes statistics out, err = capfd.readouterr() assert "Validating benchmark report file" in out assert "Valid benchmark report with" in out @@ -146,7 +143,6 @@ def test_create_dataset_with_console_disabled(capfd, get_test_asset_dir, cleanup enable_console=False, ) - # Verify no console output out, err = capfd.readouterr() assert out == "" assert err == "" @@ -166,7 +162,6 @@ def test_validate_benchmark_file_valid_file(get_test_asset_dir): def test_validate_benchmark_file_invalid_json(temp_file): """Test validation with invalid JSON.""" - # Write invalid JSON temp_file.write_text("This is not JSON") with pytest.raises(DatasetCreationError) as exc_info: @@ -178,7 +173,6 @@ def test_validate_benchmark_file_invalid_json(temp_file): def test_validate_benchmark_file_invalid_structure(temp_file): """Test validation with valid JSON but invalid benchmark structure.""" - # Write valid JSON but wrong structure temp_file.write_text('{"invalid": "structure"}') with pytest.raises(DatasetCreationError) as exc_info: @@ -189,7 +183,6 @@ def test_validate_benchmark_file_invalid_structure(temp_file): def test_validate_benchmark_file_no_benchmarks(temp_file): """Test validation with valid structure but no benchmarks.""" - # Write valid structure but empty benchmarks temp_file.write_text('{"benchmarks": []}') with pytest.raises(DatasetCreationError) as exc_info: @@ -203,15 +196,12 @@ def test_extract_dataset_from_benchmark_report(get_test_asset_dir): asset_dir = 
get_test_asset_dir() source_file = asset_dir / "benchmarks_stripped.json" - # First validate and load the report report = validate_benchmark_file(source_file) - # Extract dataset dataset_items = extract_dataset_from_benchmark_report(report) assert len(dataset_items) > 0 - # Verify structure of extracted items for item in dataset_items: assert "prompt" in item assert "output_tokens" in item @@ -223,7 +213,6 @@ def test_extract_dataset_from_benchmark_report(get_test_asset_dir): def test_save_dataset_from_benchmark(cleanup): """Test saving dataset to file.""" - # Create test dataset items dataset_items = [ { "prompt": "Test prompt 1", @@ -240,10 +229,8 @@ def test_save_dataset_from_benchmark(cleanup): output_file = Path("test_save_dataset.json") cleanup.append(output_file) - # Save dataset save_dataset_from_benchmark(dataset_items, output_file) - # Verify file exists and has correct structure assert output_file.exists() with output_file.open() as f: @@ -254,7 +241,6 @@ def test_save_dataset_from_benchmark(cleanup): assert "data" in saved_data assert len(saved_data["data"]) == 2 - # Verify field names are converted correctly for item in saved_data["data"]: assert "prompt" in item assert "output_tokens_count" in item @@ -317,7 +303,6 @@ def test_create_dataset_from_file_nonexistent_file(): def test_create_dataset_from_file_no_successful_requests(temp_file): """Test handling of benchmark with no successful requests.""" - # Create benchmark with no successful requests benchmark_data = { "benchmarks": [{ "requests": { From 72d60fb594a76fbfc9a61826be5caf7a561511c6 Mon Sep 17 00:00:00 2001 From: Harshith-umesh Date: Sat, 26 Jul 2025 18:01:22 -0400 Subject: [PATCH 3/3] Fix linting and quality check errors --- docs/preprocess.md | 6 +- src/guidellm/__main__.py | 9 +- src/guidellm/preprocess/__init__.py | 9 +- src/guidellm/preprocess/dataset_from_file.py | 174 +++++++++++------- .../test_dataset_from_file_entrypoint.py | 110 ++++++----- 5 files changed, 173 insertions(+), 135 deletions(-) diff --git a/docs/preprocess.md b/docs/preprocess.md index 58d1ab9c..062bfc84 100644 --- a/docs/preprocess.md +++ b/docs/preprocess.md @@ -8,7 +8,6 @@ The `guidellm preprocess` command provides utilities to: - **Extract datasets from benchmark results**: Convert completed benchmark reports into reusable datasets with known prompt and output token counts for consistent comparisons - ## Commands ### `dataset-from-file` @@ -108,8 +107,8 @@ The generated dataset follows this structure: } ``` - Each data item contains: + - `prompt`: The original prompt text - `output_tokens_count`: The number of tokens in the model's response - `prompt_tokens_count`: The number of tokens in the original prompt @@ -124,11 +123,10 @@ Dataset Statistics: Total items: 95 Prompt length statistics: Min: 8 characters - Max: 245 characters + Max: 245 characters Mean: 87.3 characters Output tokens statistics: Min: 1 tokens Max: 512 tokens Mean: 124.8 tokens ``` - diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 648c5a54..db697869 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -15,7 +15,10 @@ from guidellm.benchmark.scenario import GenerativeTextScenario, get_builtin_scenarios from guidellm.config import print_config from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset -from guidellm.preprocess.dataset_from_file import create_dataset_from_file, DatasetCreationError +from guidellm.preprocess.dataset_from_file import ( + DatasetCreationError, + create_dataset_from_file, +) from 
guidellm.scheduler import StrategyType from guidellm.utils import DefaultGroupHandler from guidellm.utils import cli as cli_tools @@ -515,7 +518,9 @@ def dataset( ) -@preprocess.command("dataset-from-file", help="Create a dataset from a saved benchmark report file.") +@preprocess.command( + "dataset-from-file", help="Create a dataset from a saved benchmark report file." +) @click.argument( "benchmark_file", type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), diff --git a/src/guidellm/preprocess/__init__.py b/src/guidellm/preprocess/__init__.py index a53b378f..83aeb207 100644 --- a/src/guidellm/preprocess/__init__.py +++ b/src/guidellm/preprocess/__init__.py @@ -1,4 +1,9 @@ from .dataset import ShortPromptStrategy, process_dataset -from .dataset_from_file import create_dataset_from_file, DatasetCreationError +from .dataset_from_file import DatasetCreationError, create_dataset_from_file -__all__ = ["ShortPromptStrategy", "process_dataset", "create_dataset_from_file", "DatasetCreationError"] +__all__ = [ + "DatasetCreationError", + "ShortPromptStrategy", + "create_dataset_from_file", + "process_dataset", +] diff --git a/src/guidellm/preprocess/dataset_from_file.py b/src/guidellm/preprocess/dataset_from_file.py index a3e47d82..5a1202d6 100644 --- a/src/guidellm/preprocess/dataset_from_file.py +++ b/src/guidellm/preprocess/dataset_from_file.py @@ -7,124 +7,147 @@ """ import json -import sys from pathlib import Path from typing import Any +from rich.console import Console + from guidellm.benchmark.output import GenerativeBenchmarksReport __all__ = [ - "validate_benchmark_file", - "extract_dataset_from_benchmark_report", - "save_dataset_from_benchmark", - "print_dataset_statistics", - "create_dataset_from_file", "DatasetCreationError", + "create_dataset_from_file", + "extract_dataset_from_benchmark_report", + "print_dataset_statistics", + "save_dataset_from_benchmark", + "validate_benchmark_file", ] class DatasetCreationError(Exception): """Exception raised when dataset creation fails.""" - pass def validate_benchmark_file(filepath: Path) -> GenerativeBenchmarksReport: """ Validate that the file is a proper GuideLLM benchmark report. - + Args: filepath: Path to the benchmark report file - + Returns: GenerativeBenchmarksReport: The validated and loaded report - + Raises: DatasetCreationError: If file validation fails """ try: report = GenerativeBenchmarksReport.load_file(filepath) - if not report.benchmarks: raise DatasetCreationError("Benchmark report contains no benchmark data") - return report - except Exception as e: - raise DatasetCreationError( - f"Invalid benchmark report file '{filepath}': {e}" - ) from e + error_msg = f"Invalid benchmark report file: {e}" + raise DatasetCreationError(error_msg) from e -def extract_dataset_from_benchmark_report(report: GenerativeBenchmarksReport) -> list[dict[str, Any]]: +def extract_dataset_from_benchmark_report( + report: GenerativeBenchmarksReport, +) -> list[dict[str, Any]]: """ Extract prompts and output tokens from a validated benchmark report. 
- + Args: report: A validated GenerativeBenchmarksReport instance - + Returns: List of dataset items with prompt and token information """ dataset_items = [] - + for benchmark in report.benchmarks: + # Access the StatusBreakdown properties directly requests_breakdown = benchmark.requests - + + # Get successful requests (these are the ones we want) successful_requests = requests_breakdown.successful - + for request in successful_requests: + # Extract the needed data - these are Request objects prompt = request.prompt output_tokens = request.output_tokens prompt_tokens = request.prompt_tokens - + + # Only include items with valid data if prompt and output_tokens > 0: - dataset_items.append({ - "prompt": prompt, - "output_tokens": output_tokens, - "prompt_tokens": prompt_tokens, - }) - + dataset_items.append( + { + "prompt": prompt, + "output_tokens": output_tokens, + "prompt_tokens": prompt_tokens, + } + ) + return dataset_items -def save_dataset_from_benchmark(dataset_items: list[dict[str, Any]], output_file: Path) -> None: +def save_dataset_from_benchmark( + dataset_items: list[dict[str, Any]], output_file: Path +) -> None: """Save the dataset to a JSON file.""" # Convert to the format expected by guidellm documentation formatted_items = [] for item in dataset_items: - formatted_items.append({ - "prompt": item["prompt"], - "output_tokens_count": item["output_tokens"], - "prompt_tokens_count": item["prompt_tokens"], - }) - + formatted_items.append( + { + "prompt": item["prompt"], + "output_tokens_count": item["output_tokens"], + "prompt_tokens_count": item["prompt_tokens"], + } + ) + dataset_data = { "version": "1.0", - "description": "Dataset created from benchmark results for apples-to-apples comparisons", - "data": formatted_items + "description": ( + "Dataset created from benchmark results for apples-to-apples comparisons" + ), + "data": formatted_items, } - + with output_file.open("w") as f: json.dump(dataset_data, f, indent=2) -def print_dataset_statistics(dataset_items: list[dict[str, Any]], enable_console: bool = True) -> None: +def print_dataset_statistics( + dataset_items: list[dict[str, Any]], enable_console: bool = True +) -> None: """Print statistics about the dataset.""" if not enable_console: return - + + console = Console() + console_err = Console(stderr=True) + if not dataset_items: - print("No valid items found in dataset", file=sys.stderr) + console_err.print("No valid items found in dataset") return - + total_items = len(dataset_items) prompt_tokens = [item["prompt_tokens"] for item in dataset_items] output_tokens = [item["output_tokens"] for item in dataset_items] - - print(f"\nDataset Statistics:") - print(f"Total items: {total_items}") - print(f"Prompt tokens - Min: {min(prompt_tokens)}, Max: {max(prompt_tokens)}, Mean: {sum(prompt_tokens)/len(prompt_tokens):.1f}") - print(f"Output tokens - Min: {min(output_tokens)}, Max: {max(output_tokens)}, Mean: {sum(output_tokens)/len(output_tokens):.1f}") + + console.print("\nDataset Statistics:") + console.print(f"Total items: {total_items}") + console.print( + f"Prompt tokens - Min: {min(prompt_tokens)}, " + f"Max: {max(prompt_tokens)}, " + f"Mean: {sum(prompt_tokens) / len(prompt_tokens):.1f}" + ) + console.print( + f"Output tokens - Min: {min(output_tokens)}, " + f"Max: {max(output_tokens)}, " + f"Mean: {sum(output_tokens) / len(output_tokens):.1f}" + ) def create_dataset_from_file( @@ -135,51 +158,62 @@ def create_dataset_from_file( ) -> None: """ Create a dataset from a saved benchmark report file. 
- + This function validates the benchmark file format, loads it using the same - validation as the 'from-file' command, then extracts prompts and their + validation as the 'from-file' command, then extracts prompts and their corresponding output token counts from successful requests. - + Args: benchmark_file: Path to the benchmark results JSON/YAML file output_path: Path where the dataset should be saved show_stats: Whether to display dataset statistics enable_console: Whether to enable console output - + Raises: DatasetCreationError: If validation fails or no valid requests found """ + console = Console() + console_err = Console(stderr=True) + if enable_console: - print(f"Validating benchmark report file: {benchmark_file}") - + console.print(f"Validating benchmark report file: {benchmark_file}") + try: report = validate_benchmark_file(benchmark_file) - + if enable_console: - print(f"Valid benchmark report with {len(report.benchmarks)} benchmark(s)") - print(f"Loading and extracting dataset from benchmark results...") - + console.print( + f"Valid benchmark report with {len(report.benchmarks)} benchmark(s)" + ) + console.print("Loading and extracting dataset from benchmark results...") + dataset_items = extract_dataset_from_benchmark_report(report) - + if not dataset_items: - error_msg = "No valid requests with prompts and output tokens found in benchmark report" + error_msg = ( + "No valid requests with prompts and output tokens " + "found in benchmark report" + ) if enable_console: - print(f"Error: {error_msg}", file=sys.stderr) + console_err.print(f"Error: {error_msg}") raise DatasetCreationError(error_msg) - + save_dataset_from_benchmark(dataset_items, output_path) - + if enable_console: - print(f"Dataset saved to: {output_path}") - print(f"Success, Created dataset with {len(dataset_items)} items") - print(f"You can now use this dataset for future guidellm runs by specifying: --data {output_path}") - + console.print(f"Dataset saved to: {output_path}") + console.print(f"Success, Created dataset with {len(dataset_items)} items") + console.print( + f"You can now use this dataset for future guidellm runs " + f"by specifying: --data {output_path}" + ) + if show_stats: print_dataset_statistics(dataset_items, enable_console) - + except DatasetCreationError: raise except Exception as e: if enable_console: - print(f"Unexpected error: {e}", file=sys.stderr) - raise DatasetCreationError(f"Failed to process benchmark file: {e}") from e \ No newline at end of file + console_err.print(f"Unexpected error: {e}") + raise DatasetCreationError(f"Failed to process benchmark file: {e}") from e diff --git a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py index 857dc7b7..12a668e6 100644 --- a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py +++ b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py @@ -1,18 +1,18 @@ import json -import os import tempfile import unittest from pathlib import Path +from typing import Any import pytest from guidellm.preprocess.dataset_from_file import ( DatasetCreationError, create_dataset_from_file, - validate_benchmark_file, extract_dataset_from_benchmark_report, - save_dataset_from_benchmark, print_dataset_statistics, + save_dataset_from_benchmark, + validate_benchmark_file, ) REGENERATE_ARTIFACTS = False @@ -38,7 +38,7 @@ def cleanup(): @pytest.fixture def temp_file(): """Create a temporary file that gets cleaned up automatically.""" - with tempfile.NamedTemporaryFile(mode='w', 
suffix='.json', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: temp_path = Path(f.name) yield temp_path if temp_path.exists(): @@ -51,25 +51,25 @@ def test_create_dataset_from_valid_benchmark_json(get_test_asset_dir, cleanup): source_file = asset_dir / "benchmarks_stripped.json" output_file = asset_dir / "test_dataset_output.json" cleanup.append(output_file) - + create_dataset_from_file( benchmark_file=source_file, output_path=output_file, show_stats=False, enable_console=False, ) - + assert output_file.exists() - + with output_file.open() as f: dataset = json.load(f) - + assert "version" in dataset assert "description" in dataset assert "data" in dataset assert isinstance(dataset["data"], list) assert len(dataset["data"]) > 0 - + for item in dataset["data"]: assert "prompt" in item assert "output_tokens_count" in item @@ -87,19 +87,19 @@ def test_create_dataset_from_valid_benchmark_yaml(get_test_asset_dir, cleanup): source_file = asset_dir / "benchmarks_stripped.yaml" output_file = asset_dir / "test_dataset_yaml_output.json" cleanup.append(output_file) - + create_dataset_from_file( benchmark_file=source_file, output_path=output_file, show_stats=False, enable_console=False, ) - + assert output_file.exists() - + with output_file.open() as f: dataset = json.load(f) - + assert "data" in dataset assert len(dataset["data"]) > 0 @@ -110,14 +110,14 @@ def test_create_dataset_with_stats_output(capfd, get_test_asset_dir, cleanup): source_file = asset_dir / "benchmarks_stripped.json" output_file = asset_dir / "test_dataset_stats_output.json" cleanup.append(output_file) - + create_dataset_from_file( benchmark_file=source_file, output_path=output_file, show_stats=True, enable_console=True, ) - + out, err = capfd.readouterr() assert "Validating benchmark report file" in out assert "Valid benchmark report with" in out @@ -135,18 +135,18 @@ def test_create_dataset_with_console_disabled(capfd, get_test_asset_dir, cleanup source_file = asset_dir / "benchmarks_stripped.json" output_file = asset_dir / "test_dataset_no_console.json" cleanup.append(output_file) - + create_dataset_from_file( benchmark_file=source_file, output_path=output_file, show_stats=True, enable_console=False, ) - + out, err = capfd.readouterr() assert out == "" assert err == "" - + assert output_file.exists() @@ -154,7 +154,7 @@ def test_validate_benchmark_file_valid_file(get_test_asset_dir): """Test validation with a valid benchmark file.""" asset_dir = get_test_asset_dir() source_file = asset_dir / "benchmarks_stripped.json" - + report = validate_benchmark_file(source_file) assert report is not None assert len(report.benchmarks) > 0 @@ -163,10 +163,10 @@ def test_validate_benchmark_file_valid_file(get_test_asset_dir): def test_validate_benchmark_file_invalid_json(temp_file): """Test validation with invalid JSON.""" temp_file.write_text("This is not JSON") - + with pytest.raises(DatasetCreationError) as exc_info: validate_benchmark_file(temp_file) - + assert "Invalid benchmark report file" in str(exc_info.value) assert "Expecting value" in str(exc_info.value) @@ -174,20 +174,20 @@ def test_validate_benchmark_file_invalid_json(temp_file): def test_validate_benchmark_file_invalid_structure(temp_file): """Test validation with valid JSON but invalid benchmark structure.""" temp_file.write_text('{"invalid": "structure"}') - + with pytest.raises(DatasetCreationError) as exc_info: validate_benchmark_file(temp_file) - + assert "Invalid benchmark report file" in str(exc_info.value) def 
test_validate_benchmark_file_no_benchmarks(temp_file): """Test validation with valid structure but no benchmarks.""" temp_file.write_text('{"benchmarks": []}') - + with pytest.raises(DatasetCreationError) as exc_info: validate_benchmark_file(temp_file) - + assert "Benchmark report contains no benchmark data" in str(exc_info.value) @@ -195,13 +195,13 @@ def test_extract_dataset_from_benchmark_report(get_test_asset_dir): """Test extracting dataset from a validated benchmark report.""" asset_dir = get_test_asset_dir() source_file = asset_dir / "benchmarks_stripped.json" - + report = validate_benchmark_file(source_file) - + dataset_items = extract_dataset_from_benchmark_report(report) - + assert len(dataset_items) > 0 - + for item in dataset_items: assert "prompt" in item assert "output_tokens" in item @@ -220,27 +220,27 @@ def test_save_dataset_from_benchmark(cleanup): "prompt_tokens": 50, }, { - "prompt": "Test prompt 2", + "prompt": "Test prompt 2", "output_tokens": 200, "prompt_tokens": 75, - } + }, ] - + output_file = Path("test_save_dataset.json") cleanup.append(output_file) - + save_dataset_from_benchmark(dataset_items, output_file) - + assert output_file.exists() - + with output_file.open() as f: saved_data = json.load(f) - + assert "version" in saved_data assert "description" in saved_data assert "data" in saved_data assert len(saved_data["data"]) == 2 - + for item in saved_data["data"]: assert "prompt" in item assert "output_tokens_count" in item @@ -254,9 +254,9 @@ def test_print_dataset_statistics_with_data(capfd): {"prompt": "Test 2", "output_tokens": 200, "prompt_tokens": 75}, {"prompt": "Test 3", "output_tokens": 150, "prompt_tokens": 60}, ] - + print_dataset_statistics(dataset_items, enable_console=True) - + out, err = capfd.readouterr() assert "Dataset Statistics:" in out assert "Total items: 3" in out @@ -266,10 +266,10 @@ def test_print_dataset_statistics_with_data(capfd): def test_print_dataset_statistics_empty_dataset(capfd): """Test printing statistics with empty dataset.""" - dataset_items = [] - + dataset_items: list[dict[str, Any]] = [] + print_dataset_statistics(dataset_items, enable_console=True) - + out, err = capfd.readouterr() assert "No valid items found in dataset" in err @@ -279,9 +279,9 @@ def test_print_dataset_statistics_console_disabled(capfd): dataset_items = [ {"prompt": "Test", "output_tokens": 100, "prompt_tokens": 50}, ] - + print_dataset_statistics(dataset_items, enable_console=False) - + out, err = capfd.readouterr() assert out == "" assert err == "" @@ -291,7 +291,7 @@ def test_create_dataset_from_file_nonexistent_file(): """Test error handling for nonexistent file.""" nonexistent_file = Path("does_not_exist.json") output_file = Path("output.json") - + with pytest.raises(DatasetCreationError): create_dataset_from_file( benchmark_file=nonexistent_file, @@ -303,19 +303,15 @@ def test_create_dataset_from_file_nonexistent_file(): def test_create_dataset_from_file_no_successful_requests(temp_file): """Test handling of benchmark with no successful requests.""" - benchmark_data = { - "benchmarks": [{ - "requests": { - "successful": [], - "errored": [], - "incomplete": [] - } - }] + benchmark_data: dict[str, Any] = { + "benchmarks": [ + {"requests": {"successful": [], "errored": [], "incomplete": []}} + ] } temp_file.write_text(json.dumps(benchmark_data)) - + output_file = Path("output.json") - + with pytest.raises(DatasetCreationError) as exc_info: create_dataset_from_file( benchmark_file=temp_file, @@ -323,9 +319,9 @@ def 
test_create_dataset_from_file_no_successful_requests(temp_file): show_stats=False, enable_console=False, ) - + assert "Invalid benchmark report file" in str(exc_info.value) if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main()
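
Not part of the patch series above: a minimal sketch of how a consumer might load and inspect the dataset file that `guidellm preprocess dataset-from-file` writes, assuming the default output name and the JSON layout documented in docs/preprocess.md (`version`, `description`, and `data` items with `prompt`, `output_tokens_count`, `prompt_tokens_count`). The helper name `load_benchmark_dataset` is illustrative, not part of the guidellm API.

```python
# Hypothetical consumer of the dataset produced by `dataset-from-file`.
# Assumes the JSON structure documented in docs/preprocess.md; the helper
# below is not part of guidellm itself.
import json
from pathlib import Path
from typing import Any


def load_benchmark_dataset(path: Path) -> list[dict[str, Any]]:
    """Load a dataset file written by `dataset-from-file` and return its items."""
    with path.open() as f:
        payload = json.load(f)
    return payload["data"]


if __name__ == "__main__":
    # "dataset_from_benchmark.json" is the command's documented default output path.
    items = load_benchmark_dataset(Path("dataset_from_benchmark.json"))
    if not items:
        print("Dataset is empty")
    else:
        output_tokens = [item["output_tokens_count"] for item in items]
        print(f"Total items: {len(items)}")
        print(f"Mean output tokens: {sum(output_tokens) / len(output_tokens):.1f}")
```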