From e20e2ba505c167dd87878c622142550caabe8853 Mon Sep 17 00:00:00 2001 From: Harshith-umesh Date: Mon, 21 Jul 2025 16:33:19 -0400 Subject: [PATCH 1/3] Add dataset-from-file command to extract datasets from benchmark reports --- docs/preprocess.md | 134 +++++++ src/guidellm/__main__.py | 58 +++ src/guidellm/preprocess/__init__.py | 3 +- src/guidellm/preprocess/dataset_from_file.py | 185 ++++++++++ .../test_dataset_from_file_entrypoint.py | 346 ++++++++++++++++++ 5 files changed, 725 insertions(+), 1 deletion(-) create mode 100644 docs/preprocess.md create mode 100644 src/guidellm/preprocess/dataset_from_file.py create mode 100644 tests/unit/entrypoints/test_dataset_from_file_entrypoint.py diff --git a/docs/preprocess.md b/docs/preprocess.md new file mode 100644 index 00000000..58d1ab9c --- /dev/null +++ b/docs/preprocess.md @@ -0,0 +1,134 @@ +# Preprocess Commands + +GuideLLM provides preprocessing capabilities to transform and prepare data for benchmarking workflows. The preprocess module includes tools for creating datasets from existing benchmark results, enabling "apples-to-apples" comparisons and reusable benchmark datasets. + +## Overview + +The `guidellm preprocess` command provides utilities to: + +- **Extract datasets from benchmark results**: Convert completed benchmark reports into reusable datasets with known prompt and output token counts for consistent comparisons + + +## Commands + +### `dataset-from-file` + +Extracts prompts and their corresponding output token counts from saved benchmark report files to create datasets for future benchmarking runs. + +#### Purpose + +When you run a benchmark with GuideLLM, you get detailed results about how a model performed with specific prompts. The `dataset-from-file` command allows you to extract those successful prompt-response pairs and convert them into a standardized dataset format. This enables: + +1. **Consistent Comparisons**: Use the exact same prompts across different models or configurations +2. **Known Expectations**: Each prompt comes with its expected output token count +3. 
**Reproducible Benchmarks**: Eliminate variability from different prompts when comparing models + +#### Syntax + +```bash +guidellm preprocess dataset-from-file [OPTIONS] BENCHMARK_FILE +``` + +#### Arguments + +- `BENCHMARK_FILE`: Path to the saved benchmark report file (JSON format) + +#### Options + +- `-o, --output-path PATH`: Output dataset file path (default: `dataset_from_benchmark.json`) +- `--show-stats`: Show dataset statistics after creation +- `--disable-console-outputs`: Disable console output for silent operation +- `--help`: Show help message and exit + +#### Example Usage + +##### Basic Usage + +```bash +# Convert a benchmark report to a dataset +guidellm preprocess dataset-from-file benchmark-results.json + +# Specify custom output path +guidellm preprocess dataset-from-file benchmark-results.json -o my_dataset.json + +# Show statistics about the created dataset +guidellm preprocess dataset-from-file benchmark-results.json --show-stats +``` + +#### Input File Requirements + +The input benchmark file must be a valid GuideLLM benchmark report containing: + +- **Valid JSON format**: The file must be properly formatted +- **Benchmark report structure**: Must contain the expected benchmark report schema +- **Successful requests**: Must contain at least one successful request to extract data from + +##### Supported Input Formats + +```json +{ + "benchmarks": [ + { + "requests": { + "successful": [ + { + "prompt": "What is the capital of France?", + "output_tokens": 5, + "... other request fields ..." + } + ], + "errored": [], + "incomplete": [] + } + } + ] +} +``` + +#### Output Format + +The generated dataset follows this structure: + +```json +{ + "version": "1.0", + "description": "Dataset created from benchmark results for apples-to-apples comparisons", + "data": [ + { + "prompt": "What is the capital of France?", + "output_tokens_count": 5, + "prompt_tokens_count": 12 + }, + { + "prompt": "Explain quantum computing in simple terms.", + "output_tokens_count": 45, + "prompt_tokens_count": 8 + } + ] +} +``` + + +Each data item contains: +- `prompt`: The original prompt text +- `output_tokens_count`: The number of tokens in the model's response +- `prompt_tokens_count`: The number of tokens in the original prompt + +#### Statistics Output + +When using `--show-stats`, you'll see detailed information about the created dataset: + +``` +Dataset Statistics: +================== +Total items: 95 +Prompt length statistics: + Min: 8 characters + Max: 245 characters + Mean: 87.3 characters +Output tokens statistics: + Min: 1 tokens + Max: 512 tokens + Mean: 124.8 tokens +``` + diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index f0bd9043..648c5a54 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -15,6 +15,7 @@ from guidellm.benchmark.scenario import GenerativeTextScenario, get_builtin_scenarios from guidellm.config import print_config from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset +from guidellm.preprocess.dataset_from_file import create_dataset_from_file, DatasetCreationError from guidellm.scheduler import StrategyType from guidellm.utils import DefaultGroupHandler from guidellm.utils import cli as cli_tools @@ -491,6 +492,12 @@ def dataset( hub_dataset_id, random_seed, ): + """ + Convert a dataset to have specific prompt and output token counts. + + This creates a filtered and processed dataset where prompts and outputs + match specified token counts, useful for consistent benchmarking. 
+ """ process_dataset( data=data, output_path=output_path, @@ -508,5 +515,56 @@ def dataset( ) +@preprocess.command("dataset-from-file", help="Create a dataset from a saved benchmark report file.") +@click.argument( + "benchmark_file", + type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), +) +@click.option( + "-o", + "--output-path", + type=click.Path(file_okay=True, dir_okay=False, path_type=Path), + default=Path("dataset_from_benchmark.json"), + help="Output dataset file path.", +) +@click.option( + "--show-stats", + is_flag=True, + help="Show dataset statistics after creation.", +) +@click.option( + "--disable-console-outputs", + is_flag=True, + help="Set this flag to disable console output.", +) +def dataset_from_file( + benchmark_file, + output_path, + show_stats, + disable_console_outputs, +): + """ + Create a dataset from a saved benchmark report file. + + This extracts prompts and their corresponding output token counts from + benchmark results to create an 'apples-to-apples' comparison dataset. + + BENCHMARK_FILE: Path to the benchmark results JSON file. + """ + try: + create_dataset_from_file( + benchmark_file=benchmark_file, + output_path=Path(output_path), + show_stats=show_stats, + enable_console=not disable_console_outputs, + ) + except DatasetCreationError as e: + # To print clean error message without a traceback + if not disable_console_outputs: + click.echo(f"Error: {e}", err=True) + ctx = click.get_current_context() + ctx.exit(1) + + if __name__ == "__main__": cli() diff --git a/src/guidellm/preprocess/__init__.py b/src/guidellm/preprocess/__init__.py index 95d01e5f..a53b378f 100644 --- a/src/guidellm/preprocess/__init__.py +++ b/src/guidellm/preprocess/__init__.py @@ -1,3 +1,4 @@ from .dataset import ShortPromptStrategy, process_dataset +from .dataset_from_file import create_dataset_from_file, DatasetCreationError -__all__ = ["ShortPromptStrategy", "process_dataset"] +__all__ = ["ShortPromptStrategy", "process_dataset", "create_dataset_from_file", "DatasetCreationError"] diff --git a/src/guidellm/preprocess/dataset_from_file.py b/src/guidellm/preprocess/dataset_from_file.py new file mode 100644 index 00000000..a3e47d82 --- /dev/null +++ b/src/guidellm/preprocess/dataset_from_file.py @@ -0,0 +1,185 @@ +""" +Module for creating datasets from saved benchmark report files. + +This module provides functionality to extract prompts and their corresponding +output token counts from benchmark results to create datasets for future +'apples-to-apples' comparisons. +""" + +import json +import sys +from pathlib import Path +from typing import Any + +from guidellm.benchmark.output import GenerativeBenchmarksReport + +__all__ = [ + "validate_benchmark_file", + "extract_dataset_from_benchmark_report", + "save_dataset_from_benchmark", + "print_dataset_statistics", + "create_dataset_from_file", + "DatasetCreationError", +] + + +class DatasetCreationError(Exception): + """Exception raised when dataset creation fails.""" + pass + + +def validate_benchmark_file(filepath: Path) -> GenerativeBenchmarksReport: + """ + Validate that the file is a proper GuideLLM benchmark report. 
+ + Args: + filepath: Path to the benchmark report file + + Returns: + GenerativeBenchmarksReport: The validated and loaded report + + Raises: + DatasetCreationError: If file validation fails + """ + try: + report = GenerativeBenchmarksReport.load_file(filepath) + + if not report.benchmarks: + raise DatasetCreationError("Benchmark report contains no benchmark data") + + return report + + except Exception as e: + raise DatasetCreationError( + f"Invalid benchmark report file '{filepath}': {e}" + ) from e + + +def extract_dataset_from_benchmark_report(report: GenerativeBenchmarksReport) -> list[dict[str, Any]]: + """ + Extract prompts and output tokens from a validated benchmark report. + + Args: + report: A validated GenerativeBenchmarksReport instance + + Returns: + List of dataset items with prompt and token information + """ + dataset_items = [] + + for benchmark in report.benchmarks: + requests_breakdown = benchmark.requests + + successful_requests = requests_breakdown.successful + + for request in successful_requests: + prompt = request.prompt + output_tokens = request.output_tokens + prompt_tokens = request.prompt_tokens + + if prompt and output_tokens > 0: + dataset_items.append({ + "prompt": prompt, + "output_tokens": output_tokens, + "prompt_tokens": prompt_tokens, + }) + + return dataset_items + + +def save_dataset_from_benchmark(dataset_items: list[dict[str, Any]], output_file: Path) -> None: + """Save the dataset to a JSON file.""" + # Convert to the format expected by guidellm documentation + formatted_items = [] + for item in dataset_items: + formatted_items.append({ + "prompt": item["prompt"], + "output_tokens_count": item["output_tokens"], + "prompt_tokens_count": item["prompt_tokens"], + }) + + dataset_data = { + "version": "1.0", + "description": "Dataset created from benchmark results for apples-to-apples comparisons", + "data": formatted_items + } + + with output_file.open("w") as f: + json.dump(dataset_data, f, indent=2) + + +def print_dataset_statistics(dataset_items: list[dict[str, Any]], enable_console: bool = True) -> None: + """Print statistics about the dataset.""" + if not enable_console: + return + + if not dataset_items: + print("No valid items found in dataset", file=sys.stderr) + return + + total_items = len(dataset_items) + prompt_tokens = [item["prompt_tokens"] for item in dataset_items] + output_tokens = [item["output_tokens"] for item in dataset_items] + + print(f"\nDataset Statistics:") + print(f"Total items: {total_items}") + print(f"Prompt tokens - Min: {min(prompt_tokens)}, Max: {max(prompt_tokens)}, Mean: {sum(prompt_tokens)/len(prompt_tokens):.1f}") + print(f"Output tokens - Min: {min(output_tokens)}, Max: {max(output_tokens)}, Mean: {sum(output_tokens)/len(output_tokens):.1f}") + + +def create_dataset_from_file( + benchmark_file: Path, + output_path: Path, + show_stats: bool = False, + enable_console: bool = True, +) -> None: + """ + Create a dataset from a saved benchmark report file. + + This function validates the benchmark file format, loads it using the same + validation as the 'from-file' command, then extracts prompts and their + corresponding output token counts from successful requests. 
+ + Args: + benchmark_file: Path to the benchmark results JSON/YAML file + output_path: Path where the dataset should be saved + show_stats: Whether to display dataset statistics + enable_console: Whether to enable console output + + Raises: + DatasetCreationError: If validation fails or no valid requests found + """ + if enable_console: + print(f"Validating benchmark report file: {benchmark_file}") + + try: + report = validate_benchmark_file(benchmark_file) + + if enable_console: + print(f"Valid benchmark report with {len(report.benchmarks)} benchmark(s)") + print(f"Loading and extracting dataset from benchmark results...") + + dataset_items = extract_dataset_from_benchmark_report(report) + + if not dataset_items: + error_msg = "No valid requests with prompts and output tokens found in benchmark report" + if enable_console: + print(f"Error: {error_msg}", file=sys.stderr) + raise DatasetCreationError(error_msg) + + save_dataset_from_benchmark(dataset_items, output_path) + + if enable_console: + print(f"Dataset saved to: {output_path}") + print(f"Success, Created dataset with {len(dataset_items)} items") + print(f"You can now use this dataset for future guidellm runs by specifying: --data {output_path}") + + if show_stats: + print_dataset_statistics(dataset_items, enable_console) + + except DatasetCreationError: + raise + except Exception as e: + if enable_console: + print(f"Unexpected error: {e}", file=sys.stderr) + raise DatasetCreationError(f"Failed to process benchmark file: {e}") from e \ No newline at end of file diff --git a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py new file mode 100644 index 00000000..491da4b9 --- /dev/null +++ b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py @@ -0,0 +1,346 @@ +import json +import os +import tempfile +import unittest +from pathlib import Path + +import pytest + +from guidellm.preprocess.dataset_from_file import ( + DatasetCreationError, + create_dataset_from_file, + validate_benchmark_file, + extract_dataset_from_benchmark_report, + save_dataset_from_benchmark, + print_dataset_statistics, +) + +REGENERATE_ARTIFACTS = False + + +@pytest.fixture +def get_test_asset_dir(): + def _() -> Path: + return Path(__file__).parent / "assets" + + return _ + + +@pytest.fixture +def cleanup(): + to_delete: list[Path] = [] + yield to_delete + for item in to_delete: + if item.exists(): + item.unlink() # Deletes the file + + +@pytest.fixture +def temp_file(): + """Create a temporary file that gets cleaned up automatically.""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + temp_path = Path(f.name) + yield temp_path + if temp_path.exists(): + temp_path.unlink() + + +def test_create_dataset_from_valid_benchmark_json(get_test_asset_dir, cleanup): + """Test creating dataset from a valid benchmark JSON file.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + output_file = asset_dir / "test_dataset_output.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + assert output_file.exists() + + with output_file.open() as f: + dataset = json.load(f) + + # Verify dataset structure + assert "version" in dataset + assert "description" in dataset + assert "data" in dataset + assert isinstance(dataset["data"], list) + assert len(dataset["data"]) > 0 + + # Verify each dataset item has required 
fields + for item in dataset["data"]: + assert "prompt" in item + assert "output_tokens_count" in item + assert "prompt_tokens_count" in item + assert isinstance(item["prompt"], str) + assert isinstance(item["output_tokens_count"], int) + assert isinstance(item["prompt_tokens_count"], int) + assert len(item["prompt"]) > 0 + assert item["output_tokens_count"] > 0 + + +def test_create_dataset_from_valid_benchmark_yaml(get_test_asset_dir, cleanup): + """Test creating dataset from a valid benchmark YAML file.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.yaml" + output_file = asset_dir / "test_dataset_yaml_output.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + assert output_file.exists() + + with output_file.open() as f: + dataset = json.load(f) + + assert "data" in dataset + assert len(dataset["data"]) > 0 + + +def test_create_dataset_with_stats_output(capfd, get_test_asset_dir, cleanup): + """Test creating dataset with statistics output enabled.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + output_file = asset_dir / "test_dataset_stats_output.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=True, + enable_console=True, + ) + + # Verify console output includes statistics + out, err = capfd.readouterr() + assert "Validating benchmark report file" in out + assert "Valid benchmark report with" in out + assert "Dataset saved to" in out + assert "Success, Created dataset with" in out + assert "Dataset Statistics:" in out + assert "Total items:" in out + assert "Prompt tokens - Min:" in out + assert "Output tokens - Min:" in out + + +def test_create_dataset_with_console_disabled(capfd, get_test_asset_dir, cleanup): + """Test creating dataset with console output disabled.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + output_file = asset_dir / "test_dataset_no_console.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=True, + enable_console=False, + ) + + # Verify no console output + out, err = capfd.readouterr() + assert out == "" + assert err == "" + + assert output_file.exists() + + +def test_validate_benchmark_file_valid_file(get_test_asset_dir): + """Test validation with a valid benchmark file.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + + report = validate_benchmark_file(source_file) + assert report is not None + assert len(report.benchmarks) > 0 + + +def test_validate_benchmark_file_invalid_json(temp_file): + """Test validation with invalid JSON.""" + # Write invalid JSON + temp_file.write_text("This is not JSON") + + with pytest.raises(DatasetCreationError) as exc_info: + validate_benchmark_file(temp_file) + + assert "Invalid benchmark report file" in str(exc_info.value) + assert "Expecting value" in str(exc_info.value) + + +def test_validate_benchmark_file_invalid_structure(temp_file): + """Test validation with valid JSON but invalid benchmark structure.""" + # Write valid JSON but wrong structure + temp_file.write_text('{"invalid": "structure"}') + + with pytest.raises(DatasetCreationError) as exc_info: + validate_benchmark_file(temp_file) + + assert "Invalid benchmark report file" in str(exc_info.value) + + 
+def test_validate_benchmark_file_no_benchmarks(temp_file): + """Test validation with valid structure but no benchmarks.""" + # Write valid structure but empty benchmarks + temp_file.write_text('{"benchmarks": []}') + + with pytest.raises(DatasetCreationError) as exc_info: + validate_benchmark_file(temp_file) + + assert "Benchmark report contains no benchmark data" in str(exc_info.value) + + +def test_extract_dataset_from_benchmark_report(get_test_asset_dir): + """Test extracting dataset from a validated benchmark report.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + + # First validate and load the report + report = validate_benchmark_file(source_file) + + # Extract dataset + dataset_items = extract_dataset_from_benchmark_report(report) + + assert len(dataset_items) > 0 + + # Verify structure of extracted items + for item in dataset_items: + assert "prompt" in item + assert "output_tokens" in item + assert "prompt_tokens" in item + assert len(item["prompt"]) > 0 + assert item["output_tokens"] > 0 + assert item["prompt_tokens"] > 0 + + +def test_save_dataset_from_benchmark(cleanup): + """Test saving dataset to file.""" + # Create test dataset items + dataset_items = [ + { + "prompt": "Test prompt 1", + "output_tokens": 100, + "prompt_tokens": 50, + }, + { + "prompt": "Test prompt 2", + "output_tokens": 200, + "prompt_tokens": 75, + } + ] + + output_file = Path("test_save_dataset.json") + cleanup.append(output_file) + + # Save dataset + save_dataset_from_benchmark(dataset_items, output_file) + + # Verify file exists and has correct structure + assert output_file.exists() + + with output_file.open() as f: + saved_data = json.load(f) + + assert "version" in saved_data + assert "description" in saved_data + assert "data" in saved_data + assert len(saved_data["data"]) == 2 + + # Verify field names are converted correctly + for item in saved_data["data"]: + assert "prompt" in item + assert "output_tokens_count" in item + assert "prompt_tokens_count" in item + + +def test_print_dataset_statistics_with_data(capfd): + """Test printing statistics with valid dataset.""" + dataset_items = [ + {"prompt": "Test 1", "output_tokens": 100, "prompt_tokens": 50}, + {"prompt": "Test 2", "output_tokens": 200, "prompt_tokens": 75}, + {"prompt": "Test 3", "output_tokens": 150, "prompt_tokens": 60}, + ] + + print_dataset_statistics(dataset_items, enable_console=True) + + out, err = capfd.readouterr() + assert "Dataset Statistics:" in out + assert "Total items: 3" in out + assert "Prompt tokens - Min: 50, Max: 75, Mean: 61.7" in out + assert "Output tokens - Min: 100, Max: 200, Mean: 150.0" in out + + +def test_print_dataset_statistics_empty_dataset(capfd): + """Test printing statistics with empty dataset.""" + dataset_items = [] + + print_dataset_statistics(dataset_items, enable_console=True) + + out, err = capfd.readouterr() + assert "No valid items found in dataset" in err + + +def test_print_dataset_statistics_console_disabled(capfd): + """Test printing statistics with console disabled.""" + dataset_items = [ + {"prompt": "Test", "output_tokens": 100, "prompt_tokens": 50}, + ] + + print_dataset_statistics(dataset_items, enable_console=False) + + out, err = capfd.readouterr() + assert out == "" + assert err == "" + + +def test_create_dataset_from_file_nonexistent_file(): + """Test error handling for nonexistent file.""" + nonexistent_file = Path("does_not_exist.json") + output_file = Path("output.json") + + with pytest.raises(DatasetCreationError): + 
create_dataset_from_file( + benchmark_file=nonexistent_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + +def test_create_dataset_from_file_no_successful_requests(temp_file): + """Test handling of benchmark with no successful requests.""" + # Create benchmark with no successful requests + benchmark_data = { + "benchmarks": [{ + "requests": { + "successful": [], + "errored": [], + "incomplete": [] + } + }] + } + temp_file.write_text(json.dumps(benchmark_data)) + + output_file = Path("output.json") + + with pytest.raises(DatasetCreationError) as exc_info: + create_dataset_from_file( + benchmark_file=temp_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + assert "Invalid benchmark report file" in str(exc_info.value) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 52d054e0e53fffbd296e37eb1f053d94f77af8a4 Mon Sep 17 00:00:00 2001 From: Harshith-umesh Date: Tue, 22 Jul 2025 13:36:45 -0400 Subject: [PATCH 2/3] Fix comments in tests --- .../test_dataset_from_file_entrypoint.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py index 491da4b9..857dc7b7 100644 --- a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py +++ b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py @@ -64,14 +64,12 @@ def test_create_dataset_from_valid_benchmark_json(get_test_asset_dir, cleanup): with output_file.open() as f: dataset = json.load(f) - # Verify dataset structure assert "version" in dataset assert "description" in dataset assert "data" in dataset assert isinstance(dataset["data"], list) assert len(dataset["data"]) > 0 - # Verify each dataset item has required fields for item in dataset["data"]: assert "prompt" in item assert "output_tokens_count" in item @@ -120,7 +118,6 @@ def test_create_dataset_with_stats_output(capfd, get_test_asset_dir, cleanup): enable_console=True, ) - # Verify console output includes statistics out, err = capfd.readouterr() assert "Validating benchmark report file" in out assert "Valid benchmark report with" in out @@ -146,7 +143,6 @@ def test_create_dataset_with_console_disabled(capfd, get_test_asset_dir, cleanup enable_console=False, ) - # Verify no console output out, err = capfd.readouterr() assert out == "" assert err == "" @@ -166,7 +162,6 @@ def test_validate_benchmark_file_valid_file(get_test_asset_dir): def test_validate_benchmark_file_invalid_json(temp_file): """Test validation with invalid JSON.""" - # Write invalid JSON temp_file.write_text("This is not JSON") with pytest.raises(DatasetCreationError) as exc_info: @@ -178,7 +173,6 @@ def test_validate_benchmark_file_invalid_json(temp_file): def test_validate_benchmark_file_invalid_structure(temp_file): """Test validation with valid JSON but invalid benchmark structure.""" - # Write valid JSON but wrong structure temp_file.write_text('{"invalid": "structure"}') with pytest.raises(DatasetCreationError) as exc_info: @@ -189,7 +183,6 @@ def test_validate_benchmark_file_invalid_structure(temp_file): def test_validate_benchmark_file_no_benchmarks(temp_file): """Test validation with valid structure but no benchmarks.""" - # Write valid structure but empty benchmarks temp_file.write_text('{"benchmarks": []}') with pytest.raises(DatasetCreationError) as exc_info: @@ -203,15 +196,12 @@ def test_extract_dataset_from_benchmark_report(get_test_asset_dir): asset_dir = 
get_test_asset_dir() source_file = asset_dir / "benchmarks_stripped.json" - # First validate and load the report report = validate_benchmark_file(source_file) - # Extract dataset dataset_items = extract_dataset_from_benchmark_report(report) assert len(dataset_items) > 0 - # Verify structure of extracted items for item in dataset_items: assert "prompt" in item assert "output_tokens" in item @@ -223,7 +213,6 @@ def test_extract_dataset_from_benchmark_report(get_test_asset_dir): def test_save_dataset_from_benchmark(cleanup): """Test saving dataset to file.""" - # Create test dataset items dataset_items = [ { "prompt": "Test prompt 1", @@ -240,10 +229,8 @@ def test_save_dataset_from_benchmark(cleanup): output_file = Path("test_save_dataset.json") cleanup.append(output_file) - # Save dataset save_dataset_from_benchmark(dataset_items, output_file) - # Verify file exists and has correct structure assert output_file.exists() with output_file.open() as f: @@ -254,7 +241,6 @@ def test_save_dataset_from_benchmark(cleanup): assert "data" in saved_data assert len(saved_data["data"]) == 2 - # Verify field names are converted correctly for item in saved_data["data"]: assert "prompt" in item assert "output_tokens_count" in item @@ -317,7 +303,6 @@ def test_create_dataset_from_file_nonexistent_file(): def test_create_dataset_from_file_no_successful_requests(temp_file): """Test handling of benchmark with no successful requests.""" - # Create benchmark with no successful requests benchmark_data = { "benchmarks": [{ "requests": { From 72d60fb594a76fbfc9a61826be5caf7a561511c6 Mon Sep 17 00:00:00 2001 From: Harshith-umesh Date: Sat, 26 Jul 2025 18:01:22 -0400 Subject: [PATCH 3/3] Fix linting and quality check errors --- docs/preprocess.md | 6 +- src/guidellm/__main__.py | 9 +- src/guidellm/preprocess/__init__.py | 9 +- src/guidellm/preprocess/dataset_from_file.py | 174 +++++++++++------- .../test_dataset_from_file_entrypoint.py | 110 ++++++----- 5 files changed, 173 insertions(+), 135 deletions(-) diff --git a/docs/preprocess.md b/docs/preprocess.md index 58d1ab9c..062bfc84 100644 --- a/docs/preprocess.md +++ b/docs/preprocess.md @@ -8,7 +8,6 @@ The `guidellm preprocess` command provides utilities to: - **Extract datasets from benchmark results**: Convert completed benchmark reports into reusable datasets with known prompt and output token counts for consistent comparisons - ## Commands ### `dataset-from-file` @@ -108,8 +107,8 @@ The generated dataset follows this structure: } ``` - Each data item contains: + - `prompt`: The original prompt text - `output_tokens_count`: The number of tokens in the model's response - `prompt_tokens_count`: The number of tokens in the original prompt @@ -124,11 +123,10 @@ Dataset Statistics: Total items: 95 Prompt length statistics: Min: 8 characters - Max: 245 characters + Max: 245 characters Mean: 87.3 characters Output tokens statistics: Min: 1 tokens Max: 512 tokens Mean: 124.8 tokens ``` - diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 648c5a54..db697869 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -15,7 +15,10 @@ from guidellm.benchmark.scenario import GenerativeTextScenario, get_builtin_scenarios from guidellm.config import print_config from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset -from guidellm.preprocess.dataset_from_file import create_dataset_from_file, DatasetCreationError +from guidellm.preprocess.dataset_from_file import ( + DatasetCreationError, + create_dataset_from_file, +) from 
guidellm.scheduler import StrategyType from guidellm.utils import DefaultGroupHandler from guidellm.utils import cli as cli_tools @@ -515,7 +518,9 @@ def dataset( ) -@preprocess.command("dataset-from-file", help="Create a dataset from a saved benchmark report file.") +@preprocess.command( + "dataset-from-file", help="Create a dataset from a saved benchmark report file." +) @click.argument( "benchmark_file", type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), diff --git a/src/guidellm/preprocess/__init__.py b/src/guidellm/preprocess/__init__.py index a53b378f..83aeb207 100644 --- a/src/guidellm/preprocess/__init__.py +++ b/src/guidellm/preprocess/__init__.py @@ -1,4 +1,9 @@ from .dataset import ShortPromptStrategy, process_dataset -from .dataset_from_file import create_dataset_from_file, DatasetCreationError +from .dataset_from_file import DatasetCreationError, create_dataset_from_file -__all__ = ["ShortPromptStrategy", "process_dataset", "create_dataset_from_file", "DatasetCreationError"] +__all__ = [ + "DatasetCreationError", + "ShortPromptStrategy", + "create_dataset_from_file", + "process_dataset", +] diff --git a/src/guidellm/preprocess/dataset_from_file.py b/src/guidellm/preprocess/dataset_from_file.py index a3e47d82..5a1202d6 100644 --- a/src/guidellm/preprocess/dataset_from_file.py +++ b/src/guidellm/preprocess/dataset_from_file.py @@ -7,124 +7,147 @@ """ import json -import sys from pathlib import Path from typing import Any +from rich.console import Console + from guidellm.benchmark.output import GenerativeBenchmarksReport __all__ = [ - "validate_benchmark_file", - "extract_dataset_from_benchmark_report", - "save_dataset_from_benchmark", - "print_dataset_statistics", - "create_dataset_from_file", "DatasetCreationError", + "create_dataset_from_file", + "extract_dataset_from_benchmark_report", + "print_dataset_statistics", + "save_dataset_from_benchmark", + "validate_benchmark_file", ] class DatasetCreationError(Exception): """Exception raised when dataset creation fails.""" - pass def validate_benchmark_file(filepath: Path) -> GenerativeBenchmarksReport: """ Validate that the file is a proper GuideLLM benchmark report. - + Args: filepath: Path to the benchmark report file - + Returns: GenerativeBenchmarksReport: The validated and loaded report - + Raises: DatasetCreationError: If file validation fails """ try: report = GenerativeBenchmarksReport.load_file(filepath) - if not report.benchmarks: raise DatasetCreationError("Benchmark report contains no benchmark data") - return report - except Exception as e: - raise DatasetCreationError( - f"Invalid benchmark report file '{filepath}': {e}" - ) from e + error_msg = f"Invalid benchmark report file: {e}" + raise DatasetCreationError(error_msg) from e -def extract_dataset_from_benchmark_report(report: GenerativeBenchmarksReport) -> list[dict[str, Any]]: +def extract_dataset_from_benchmark_report( + report: GenerativeBenchmarksReport, +) -> list[dict[str, Any]]: """ Extract prompts and output tokens from a validated benchmark report. 
- + Args: report: A validated GenerativeBenchmarksReport instance - + Returns: List of dataset items with prompt and token information """ dataset_items = [] - + for benchmark in report.benchmarks: + # Access the StatusBreakdown properties directly requests_breakdown = benchmark.requests - + + # Get successful requests (these are the ones we want) successful_requests = requests_breakdown.successful - + for request in successful_requests: + # Extract the needed data - these are Request objects prompt = request.prompt output_tokens = request.output_tokens prompt_tokens = request.prompt_tokens - + + # Only include items with valid data if prompt and output_tokens > 0: - dataset_items.append({ - "prompt": prompt, - "output_tokens": output_tokens, - "prompt_tokens": prompt_tokens, - }) - + dataset_items.append( + { + "prompt": prompt, + "output_tokens": output_tokens, + "prompt_tokens": prompt_tokens, + } + ) + return dataset_items -def save_dataset_from_benchmark(dataset_items: list[dict[str, Any]], output_file: Path) -> None: +def save_dataset_from_benchmark( + dataset_items: list[dict[str, Any]], output_file: Path +) -> None: """Save the dataset to a JSON file.""" # Convert to the format expected by guidellm documentation formatted_items = [] for item in dataset_items: - formatted_items.append({ - "prompt": item["prompt"], - "output_tokens_count": item["output_tokens"], - "prompt_tokens_count": item["prompt_tokens"], - }) - + formatted_items.append( + { + "prompt": item["prompt"], + "output_tokens_count": item["output_tokens"], + "prompt_tokens_count": item["prompt_tokens"], + } + ) + dataset_data = { "version": "1.0", - "description": "Dataset created from benchmark results for apples-to-apples comparisons", - "data": formatted_items + "description": ( + "Dataset created from benchmark results for apples-to-apples comparisons" + ), + "data": formatted_items, } - + with output_file.open("w") as f: json.dump(dataset_data, f, indent=2) -def print_dataset_statistics(dataset_items: list[dict[str, Any]], enable_console: bool = True) -> None: +def print_dataset_statistics( + dataset_items: list[dict[str, Any]], enable_console: bool = True +) -> None: """Print statistics about the dataset.""" if not enable_console: return - + + console = Console() + console_err = Console(stderr=True) + if not dataset_items: - print("No valid items found in dataset", file=sys.stderr) + console_err.print("No valid items found in dataset") return - + total_items = len(dataset_items) prompt_tokens = [item["prompt_tokens"] for item in dataset_items] output_tokens = [item["output_tokens"] for item in dataset_items] - - print(f"\nDataset Statistics:") - print(f"Total items: {total_items}") - print(f"Prompt tokens - Min: {min(prompt_tokens)}, Max: {max(prompt_tokens)}, Mean: {sum(prompt_tokens)/len(prompt_tokens):.1f}") - print(f"Output tokens - Min: {min(output_tokens)}, Max: {max(output_tokens)}, Mean: {sum(output_tokens)/len(output_tokens):.1f}") + + console.print("\nDataset Statistics:") + console.print(f"Total items: {total_items}") + console.print( + f"Prompt tokens - Min: {min(prompt_tokens)}, " + f"Max: {max(prompt_tokens)}, " + f"Mean: {sum(prompt_tokens) / len(prompt_tokens):.1f}" + ) + console.print( + f"Output tokens - Min: {min(output_tokens)}, " + f"Max: {max(output_tokens)}, " + f"Mean: {sum(output_tokens) / len(output_tokens):.1f}" + ) def create_dataset_from_file( @@ -135,51 +158,62 @@ def create_dataset_from_file( ) -> None: """ Create a dataset from a saved benchmark report file. 
- + This function validates the benchmark file format, loads it using the same - validation as the 'from-file' command, then extracts prompts and their + validation as the 'from-file' command, then extracts prompts and their corresponding output token counts from successful requests. - + Args: benchmark_file: Path to the benchmark results JSON/YAML file output_path: Path where the dataset should be saved show_stats: Whether to display dataset statistics enable_console: Whether to enable console output - + Raises: DatasetCreationError: If validation fails or no valid requests found """ + console = Console() + console_err = Console(stderr=True) + if enable_console: - print(f"Validating benchmark report file: {benchmark_file}") - + console.print(f"Validating benchmark report file: {benchmark_file}") + try: report = validate_benchmark_file(benchmark_file) - + if enable_console: - print(f"Valid benchmark report with {len(report.benchmarks)} benchmark(s)") - print(f"Loading and extracting dataset from benchmark results...") - + console.print( + f"Valid benchmark report with {len(report.benchmarks)} benchmark(s)" + ) + console.print("Loading and extracting dataset from benchmark results...") + dataset_items = extract_dataset_from_benchmark_report(report) - + if not dataset_items: - error_msg = "No valid requests with prompts and output tokens found in benchmark report" + error_msg = ( + "No valid requests with prompts and output tokens " + "found in benchmark report" + ) if enable_console: - print(f"Error: {error_msg}", file=sys.stderr) + console_err.print(f"Error: {error_msg}") raise DatasetCreationError(error_msg) - + save_dataset_from_benchmark(dataset_items, output_path) - + if enable_console: - print(f"Dataset saved to: {output_path}") - print(f"Success, Created dataset with {len(dataset_items)} items") - print(f"You can now use this dataset for future guidellm runs by specifying: --data {output_path}") - + console.print(f"Dataset saved to: {output_path}") + console.print(f"Success, Created dataset with {len(dataset_items)} items") + console.print( + f"You can now use this dataset for future guidellm runs " + f"by specifying: --data {output_path}" + ) + if show_stats: print_dataset_statistics(dataset_items, enable_console) - + except DatasetCreationError: raise except Exception as e: if enable_console: - print(f"Unexpected error: {e}", file=sys.stderr) - raise DatasetCreationError(f"Failed to process benchmark file: {e}") from e \ No newline at end of file + console_err.print(f"Unexpected error: {e}") + raise DatasetCreationError(f"Failed to process benchmark file: {e}") from e diff --git a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py index 857dc7b7..12a668e6 100644 --- a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py +++ b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py @@ -1,18 +1,18 @@ import json -import os import tempfile import unittest from pathlib import Path +from typing import Any import pytest from guidellm.preprocess.dataset_from_file import ( DatasetCreationError, create_dataset_from_file, - validate_benchmark_file, extract_dataset_from_benchmark_report, - save_dataset_from_benchmark, print_dataset_statistics, + save_dataset_from_benchmark, + validate_benchmark_file, ) REGENERATE_ARTIFACTS = False @@ -38,7 +38,7 @@ def cleanup(): @pytest.fixture def temp_file(): """Create a temporary file that gets cleaned up automatically.""" - with tempfile.NamedTemporaryFile(mode='w', 
suffix='.json', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: temp_path = Path(f.name) yield temp_path if temp_path.exists(): @@ -51,25 +51,25 @@ def test_create_dataset_from_valid_benchmark_json(get_test_asset_dir, cleanup): source_file = asset_dir / "benchmarks_stripped.json" output_file = asset_dir / "test_dataset_output.json" cleanup.append(output_file) - + create_dataset_from_file( benchmark_file=source_file, output_path=output_file, show_stats=False, enable_console=False, ) - + assert output_file.exists() - + with output_file.open() as f: dataset = json.load(f) - + assert "version" in dataset assert "description" in dataset assert "data" in dataset assert isinstance(dataset["data"], list) assert len(dataset["data"]) > 0 - + for item in dataset["data"]: assert "prompt" in item assert "output_tokens_count" in item @@ -87,19 +87,19 @@ def test_create_dataset_from_valid_benchmark_yaml(get_test_asset_dir, cleanup): source_file = asset_dir / "benchmarks_stripped.yaml" output_file = asset_dir / "test_dataset_yaml_output.json" cleanup.append(output_file) - + create_dataset_from_file( benchmark_file=source_file, output_path=output_file, show_stats=False, enable_console=False, ) - + assert output_file.exists() - + with output_file.open() as f: dataset = json.load(f) - + assert "data" in dataset assert len(dataset["data"]) > 0 @@ -110,14 +110,14 @@ def test_create_dataset_with_stats_output(capfd, get_test_asset_dir, cleanup): source_file = asset_dir / "benchmarks_stripped.json" output_file = asset_dir / "test_dataset_stats_output.json" cleanup.append(output_file) - + create_dataset_from_file( benchmark_file=source_file, output_path=output_file, show_stats=True, enable_console=True, ) - + out, err = capfd.readouterr() assert "Validating benchmark report file" in out assert "Valid benchmark report with" in out @@ -135,18 +135,18 @@ def test_create_dataset_with_console_disabled(capfd, get_test_asset_dir, cleanup source_file = asset_dir / "benchmarks_stripped.json" output_file = asset_dir / "test_dataset_no_console.json" cleanup.append(output_file) - + create_dataset_from_file( benchmark_file=source_file, output_path=output_file, show_stats=True, enable_console=False, ) - + out, err = capfd.readouterr() assert out == "" assert err == "" - + assert output_file.exists() @@ -154,7 +154,7 @@ def test_validate_benchmark_file_valid_file(get_test_asset_dir): """Test validation with a valid benchmark file.""" asset_dir = get_test_asset_dir() source_file = asset_dir / "benchmarks_stripped.json" - + report = validate_benchmark_file(source_file) assert report is not None assert len(report.benchmarks) > 0 @@ -163,10 +163,10 @@ def test_validate_benchmark_file_valid_file(get_test_asset_dir): def test_validate_benchmark_file_invalid_json(temp_file): """Test validation with invalid JSON.""" temp_file.write_text("This is not JSON") - + with pytest.raises(DatasetCreationError) as exc_info: validate_benchmark_file(temp_file) - + assert "Invalid benchmark report file" in str(exc_info.value) assert "Expecting value" in str(exc_info.value) @@ -174,20 +174,20 @@ def test_validate_benchmark_file_invalid_json(temp_file): def test_validate_benchmark_file_invalid_structure(temp_file): """Test validation with valid JSON but invalid benchmark structure.""" temp_file.write_text('{"invalid": "structure"}') - + with pytest.raises(DatasetCreationError) as exc_info: validate_benchmark_file(temp_file) - + assert "Invalid benchmark report file" in str(exc_info.value) def 
test_validate_benchmark_file_no_benchmarks(temp_file): """Test validation with valid structure but no benchmarks.""" temp_file.write_text('{"benchmarks": []}') - + with pytest.raises(DatasetCreationError) as exc_info: validate_benchmark_file(temp_file) - + assert "Benchmark report contains no benchmark data" in str(exc_info.value) @@ -195,13 +195,13 @@ def test_extract_dataset_from_benchmark_report(get_test_asset_dir): """Test extracting dataset from a validated benchmark report.""" asset_dir = get_test_asset_dir() source_file = asset_dir / "benchmarks_stripped.json" - + report = validate_benchmark_file(source_file) - + dataset_items = extract_dataset_from_benchmark_report(report) - + assert len(dataset_items) > 0 - + for item in dataset_items: assert "prompt" in item assert "output_tokens" in item @@ -220,27 +220,27 @@ def test_save_dataset_from_benchmark(cleanup): "prompt_tokens": 50, }, { - "prompt": "Test prompt 2", + "prompt": "Test prompt 2", "output_tokens": 200, "prompt_tokens": 75, - } + }, ] - + output_file = Path("test_save_dataset.json") cleanup.append(output_file) - + save_dataset_from_benchmark(dataset_items, output_file) - + assert output_file.exists() - + with output_file.open() as f: saved_data = json.load(f) - + assert "version" in saved_data assert "description" in saved_data assert "data" in saved_data assert len(saved_data["data"]) == 2 - + for item in saved_data["data"]: assert "prompt" in item assert "output_tokens_count" in item @@ -254,9 +254,9 @@ def test_print_dataset_statistics_with_data(capfd): {"prompt": "Test 2", "output_tokens": 200, "prompt_tokens": 75}, {"prompt": "Test 3", "output_tokens": 150, "prompt_tokens": 60}, ] - + print_dataset_statistics(dataset_items, enable_console=True) - + out, err = capfd.readouterr() assert "Dataset Statistics:" in out assert "Total items: 3" in out @@ -266,10 +266,10 @@ def test_print_dataset_statistics_with_data(capfd): def test_print_dataset_statistics_empty_dataset(capfd): """Test printing statistics with empty dataset.""" - dataset_items = [] - + dataset_items: list[dict[str, Any]] = [] + print_dataset_statistics(dataset_items, enable_console=True) - + out, err = capfd.readouterr() assert "No valid items found in dataset" in err @@ -279,9 +279,9 @@ def test_print_dataset_statistics_console_disabled(capfd): dataset_items = [ {"prompt": "Test", "output_tokens": 100, "prompt_tokens": 50}, ] - + print_dataset_statistics(dataset_items, enable_console=False) - + out, err = capfd.readouterr() assert out == "" assert err == "" @@ -291,7 +291,7 @@ def test_create_dataset_from_file_nonexistent_file(): """Test error handling for nonexistent file.""" nonexistent_file = Path("does_not_exist.json") output_file = Path("output.json") - + with pytest.raises(DatasetCreationError): create_dataset_from_file( benchmark_file=nonexistent_file, @@ -303,19 +303,15 @@ def test_create_dataset_from_file_nonexistent_file(): def test_create_dataset_from_file_no_successful_requests(temp_file): """Test handling of benchmark with no successful requests.""" - benchmark_data = { - "benchmarks": [{ - "requests": { - "successful": [], - "errored": [], - "incomplete": [] - } - }] + benchmark_data: dict[str, Any] = { + "benchmarks": [ + {"requests": {"successful": [], "errored": [], "incomplete": []}} + ] } temp_file.write_text(json.dumps(benchmark_data)) - + output_file = Path("output.json") - + with pytest.raises(DatasetCreationError) as exc_info: create_dataset_from_file( benchmark_file=temp_file, @@ -323,9 +319,9 @@ def 
test_create_dataset_from_file_no_successful_requests(temp_file): show_stats=False, enable_console=False, ) - + assert "Invalid benchmark report file" in str(exc_info.value) if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main()
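
Not part of the patch series above: a minimal sketch of how a consumer might load and inspect the dataset file that `guidellm preprocess dataset-from-file` writes, assuming the default output name and the JSON layout documented in docs/preprocess.md (`version`, `description`, and `data` items with `prompt`, `output_tokens_count`, `prompt_tokens_count`). The helper name `load_benchmark_dataset` is illustrative, not part of the guidellm API.

```python
# Hypothetical consumer of the dataset produced by `dataset-from-file`.
# Assumes the JSON structure documented in docs/preprocess.md; the helper
# below is not part of guidellm itself.
import json
from pathlib import Path
from typing import Any


def load_benchmark_dataset(path: Path) -> list[dict[str, Any]]:
    """Load a dataset file written by `dataset-from-file` and return its items."""
    with path.open() as f:
        payload = json.load(f)
    return payload["data"]


if __name__ == "__main__":
    # "dataset_from_benchmark.json" is the command's documented default output path.
    items = load_benchmark_dataset(Path("dataset_from_benchmark.json"))
    if not items:
        print("Dataset is empty")
    else:
        output_tokens = [item["output_tokens_count"] for item in items]
        print(f"Total items: {len(items)}")
        print(f"Mean output tokens: {sum(output_tokens) / len(output_tokens):.1f}")
```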