From a5035de720bae02ac4eb95f5037bd8c504cc5210 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 12 Nov 2025 15:16:34 -0600 Subject: [PATCH 01/30] WIP: First round of performance regressions are working --- nemoguardrails/benchmark/aiperf/__init__.py | 21 + .../aiperf_configs/concurrency_sweep.yaml | 38 ++ .../benchmark/aiperf/aiperf_models.py | 156 +++++++ nemoguardrails/benchmark/aiperf/run_aiperf.py | 404 ++++++++++++++++++ .../llm/providers/huggingface/streamers.py | 4 +- 5 files changed, 621 insertions(+), 2 deletions(-) create mode 100644 nemoguardrails/benchmark/aiperf/__init__.py create mode 100644 nemoguardrails/benchmark/aiperf/aiperf_configs/concurrency_sweep.yaml create mode 100644 nemoguardrails/benchmark/aiperf/aiperf_models.py create mode 100755 nemoguardrails/benchmark/aiperf/run_aiperf.py diff --git a/nemoguardrails/benchmark/aiperf/__init__.py b/nemoguardrails/benchmark/aiperf/__init__.py new file mode 100644 index 000000000..3b074a794 --- /dev/null +++ b/nemoguardrails/benchmark/aiperf/__init__.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AIPerf benchmark runner and configuration models.""" + +from .aiperf_config import AIPerfConfig, BaseConfig +from .aiperf_runner import AIPerfRunner + +__all__ = ["AIPerfConfig", "BaseConfig", "AIPerfRunner"] diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/concurrency_sweep.yaml b/nemoguardrails/benchmark/aiperf/aiperf_configs/concurrency_sweep.yaml new file mode 100644 index 000000000..610103582 --- /dev/null +++ b/nemoguardrails/benchmark/aiperf/aiperf_configs/concurrency_sweep.yaml @@ -0,0 +1,38 @@ +# Concurrency sweep. One-minute tests at log-spaced concurrencies + +# Name for this batch of benchmarks (will be part of output directory name) +batch_name: simple_regression_benchmark + +# Base directory where all benchmark results will be stored. +# Actual name is // +output_base_dir: aiperf_results + +# Base configuration applied to all benchmark runs +# These parameters can be overridden by sweep parameters +base_config: + # Model details + model: meta/llama-3.3-70b-instruct + url: "http://localhost:9000" + endpoint: "/v1/chat/completions" + endpoint_type: chat + + # Load generation settings. + # Ramp-up time = concurrency / request-rate (https://github.com/ai-dynamo/aiperf/blob/b91408d3df0df78236748017b46292cd74e65dc6/docs/tutorials/request-rate-concurrency.md) + # So request_rate = concurrency / rampup_seconds + rampup_seconds: 10 + benchmark_seconds: 60 + concurrency: 16 + request_rate_mode: "constant" + + # Synthetic data generation + random_seed: 12345 + prompt_input_tokens_mean: 100 + prompt_input_tokens_stddev: 10 + prompt_output_tokens_mean: 50 + prompt_output_tokens_stddev: 5 + +# Parameter sweeps. 
Each parameter can have multiple values +# The script will run all combinations (Cartesian product) +sweeps: + # Sweep over the following concurrency values + concurrency: [1, 10, 100, 1000] diff --git a/nemoguardrails/benchmark/aiperf/aiperf_models.py b/nemoguardrails/benchmark/aiperf/aiperf_models.py new file mode 100644 index 000000000..23560ac90 --- /dev/null +++ b/nemoguardrails/benchmark/aiperf/aiperf_models.py @@ -0,0 +1,156 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Pydantic models for AIPerf configuration validation. +""" + +from pathlib import Path +from typing import Dict, List, Literal, Optional, Union + +from pydantic import BaseModel, Field, model_validator + + +class BaseConfig(BaseModel): + """Base configuration for AIPerf benchmark runs.""" + + # Model details + model: str = Field(..., description="Model name") + url: str = Field(..., description="Model base URL") + endpoint: str = Field( + default="/v1/chat/completions", description="API endpoint path" + ) + endpoint_type: Literal["chat", "completions"] = Field( + default="chat", + description="Type of endpoint (chat or completions)", + ) + + # Load generation settings + rampup_seconds: int = Field(description="Ramp-up time in seconds") + benchmark_seconds: int = Field(description="Benchmark duration in seconds") + concurrency: int = Field(description="Number of concurrent requests") + request_rate: Optional[float] = Field( + default=None, + description="Request rate (requests per second, auto-calculated if not provided)", + ) + request_rate_mode: Optional[Literal["constant", "poisson"]] = Field( + default="constant", + description="Request rate mode (constant, poisson, etc.)", + ) + + # Synthetic data generation + random_seed: Optional[int] = Field( + default=None, description="Random seed for reproducibility" + ) + prompt_input_tokens_mean: Optional[int] = Field( + default=None, + description="Mean number of input tokens", + ) + prompt_input_tokens_stddev: Optional[int] = Field( + default=None, + description="Standard deviation of input tokens", + ) + prompt_output_tokens_mean: Optional[int] = Field( + default=None, + description="Mean number of output tokens", + ) + prompt_output_tokens_stddev: Optional[int] = Field( + default=None, + description="Standard deviation of output tokens", + ) + + # TODO! 
Come up with better ways to set these + tokenizer: str = Field( + default="/Users/tgasser/projects/aiperf/tokenizers/llama-3.3-70b", + description="Path to tokenizer", + ) + ui_type: Literal["dashboard", "simple", "none"] = Field( + default="none", description="UI to use while running regression" + ) + + @model_validator(mode="after") + def calculate_request_rate(self): + """Calculate request_rate if not provided.""" + if self.request_rate is None: + self.request_rate = int(self.concurrency / self.rampup_seconds) + return self + + +class AIPerfConfig(BaseModel): + """Main configuration model for AIPerf benchmark runner.""" + + batch_name: str = Field( + default="benchmark", description="Name for this batch of benchmarks" + ) + output_base_dir: str = Field( + default="aiperf_results", + description="Base directory for benchmark results", + ) + base_config: BaseConfig = Field( + ..., description="Base configuration applied to all benchmark runs" + ) + sweeps: Optional[Dict[str, List[Union[int, float, str]]]] = Field( + default=None, + description="Parameter sweeps. Key is the parameter to change, value is a list of values to use", + ) + + # TODO! Add validation for sweeps + + # @field_validator("sweeps") + # @classmethod + # def validate_sweeps( + # cls, v: Optional[Dict[str, List[Any]]] + # ) -> Optional[Dict[str, List[Any]]]: + # """Validate that sweep values are lists.""" + # if v is None: + # return v + + # for param_name, values in v.items(): + # if not isinstance(values, list): + # raise ValueError( + # f"Sweep parameter '{param_name}' must be a list, got {type(values)}" + # ) + # if len(values) == 0: + # raise ValueError(f"Sweep parameter '{param_name}' cannot be empty") + + # return v + + # @model_validator(mode="after") + # def validate_sweep_keys(self): + # """Validate that sweep keys exist in base_config.""" + # if self.sweeps is None: + # return self + + # # Get all valid field names from BaseConfig (using hyphenated versions) + # valid_keys = set() + # for field_name, field_info in BaseConfig.model_fields.items(): + # # Use alias if available, otherwise convert underscores to hyphens + # if field_info.alias: + # valid_keys.add(field_info.alias) + + # # Check each sweep parameter + # for param_name in self.sweeps.keys(): + # if param_name not in valid_keys: + # valid_fields = sorted(valid_keys) + # raise ValueError( + # f"Sweep parameter '{param_name}' is not a valid BaseConfig field. " + # f"Valid fields are: {', '.join(valid_fields)}" + # ) + + # return self + + def get_output_base_path(self) -> Path: + """Get the base output directory as a Path object.""" + return Path(self.output_base_dir) diff --git a/nemoguardrails/benchmark/aiperf/run_aiperf.py b/nemoguardrails/benchmark/aiperf/run_aiperf.py new file mode 100755 index 000000000..c174e844b --- /dev/null +++ b/nemoguardrails/benchmark/aiperf/run_aiperf.py @@ -0,0 +1,404 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
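Taken together, the models above are meant to be fed a parsed YAML dict, mirroring the `yaml.safe_load` + `AIPerfConfig(**config_data)` call in the runner that follows. A minimal sketch of that flow, assuming `aiperf_models` is importable by module name as in this patch and reusing values from the sample sweep config; the literal values are placeholders for illustration:

    from pydantic import ValidationError

    from aiperf_models import AIPerfConfig  # the package-qualified import path is introduced in a later patch

    raw = {
        "batch_name": "demo",
        "base_config": {
            "model": "meta/llama-3.3-70b-instruct",
            "url": "http://localhost:9000",
            "rampup_seconds": 10,
            "benchmark_seconds": 60,
            "concurrency": 16,
        },
        "sweeps": {"concurrency": [1, 10, 100]},
    }

    try:
        config = AIPerfConfig(**raw)
        # request_rate is filled in by the model validator as int(concurrency / rampup_seconds) == int(16 / 10) == 1
        print(config.base_config.request_rate)
        print(config.get_output_base_path())  # Path("aiperf_results"), the default
    except ValidationError as err:
        print(err)
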
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +AIPerf Benchmark Runner + +This script orchestrates multiple aiperf benchmark runs based on a YAML configuration file. +It supports parameter sweeps and organizes results in a structured directory hierarchy. +""" + +import itertools +import json +import logging +import subprocess +import sys +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List + +import typer +import yaml +from aiperf_models import AIPerfConfig +from pydantic import ValidationError +from tqdm import tqdm + +# 1. Get a logger instance +log = logging.getLogger(__name__) +log.setLevel(logging.DEBUG) # Set the lowest level to capture all messages + +# Set up formatter and direct it to the console +formatter = logging.Formatter( + "%(asctime)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S" +) +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.DEBUG) # DEBUG and higher will go to the console +console_handler.setFormatter(formatter) + +# Add the console handler for logging +log.addHandler(console_handler) + + +class AIPerfRunner: + """Manages execution of aiperf benchmark runs with configurable parameters.""" + + def __init__(self, config_path: Path): + """ + Initialize the runner with a configuration file. + + Args: + config_path: Path to the YAML configuration file + """ + self.config_path = config_path + self.config = self._load_config() + + def _load_config(self) -> AIPerfConfig: + """Load and validate the YAML configuration file using Pydantic.""" + try: + with open(self.config_path, "r", encoding="utf-8") as f: + config_data = yaml.safe_load(f) + + # Validate with Pydantic model + config = AIPerfConfig(**config_data) + return config + + except FileNotFoundError: + log.error("Configuration file not found: %s", self.config_path) + sys.exit(1) + except yaml.YAMLError as e: + log.error("Error parsing YAML configuration: %s", e) + sys.exit(1) + except ValidationError as e: + log.error("Configuration validation error:\n%s", e) + sys.exit(1) + except Exception as e: + log.error("Unexpected error loading configuration: %s", e) + sys.exit(1) + + def _get_sweep_combinations(self) -> List[Dict[str, Any]]: + """ + Generate all parameter combinations from sweep configurations. + + Returns: + List of dictionaries, each representing one parameter combination + """ + sweeps = self.config.sweeps + + if not sweeps: + # No sweeps, return single empty combination + return [{}] + + # Extract parameter names and their values + param_names = list(sweeps.keys()) + param_values = [sweeps[name] for name in param_names] + + # Generate all combinations + combinations = [] + for combo in itertools.product(*param_values): + combinations.append(dict(zip(param_names, combo))) + + return combinations + + def _build_command( + self, sweep_params: Dict[str, Any], output_dir: Path + ) -> List[str]: + """ + Build the aiperf command with given parameters. 
+ + Args: + sweep_params: Parameter overrides from sweep + output_dir: Directory to store output artifacts + + Returns: + Command as list of strings + """ + cmd = ["aiperf", "profile"] + + # Get base config as dictionary with hyphenated keys + base_params = self.config.base_config.model_dump() + # Merge base config with sweep params (sweep params override base) + params = {**base_params, **sweep_params} + + # Add output directory + params["output-artifact-dir"] = str(output_dir) + + # Convert parameters to command line arguments + for key, value in params.items(): + item_key = key + # Rampup seconds is used to derive `request_rate` in the BaseConfig model, don't pass + # it to the aiperf invocation + if key == "rampup_seconds": + continue + + # Convert the `benchmark_seconds` in config file to `benchmark_duration` key + if key == "benchmark_seconds": + item_key = "benchmark_duration" + + # Convert underscores to hyphens for CLI arguments + arg_name = item_key.replace("_", "-") + + # Handle different value types + if isinstance(value, bool): + if value: + cmd.append(f"--{arg_name}") + elif isinstance(value, list): + # For list values, add multiple arguments + for item in value: + cmd.extend([f"--{arg_name}", str(item)]) + elif value is not None: + cmd.extend([f"--{arg_name}", str(value)]) + + return cmd + + def _create_output_dir( + self, base_dir: Path, sweep_params: Dict[str, Any], run_index: int + ) -> Path: + """ + Create a descriptive output directory for this run. + + Args: + base_dir: Base output directory + sweep_params: Parameters for this run + run_index: Index of this run in the sequence + + Returns: + Path to the created output directory + """ + # Create descriptive directory name + if sweep_params: + # Create name from sweep parameters + param_parts = [] + for key, value in sorted(sweep_params.items()): + # Shorten common parameter names + short_key = key.replace("prompt-", "").replace("tokens-", "") + param_parts.append(f"{short_key}={value}") + dir_name = f"run_{run_index:03d}_" + "_".join(param_parts) + else: + dir_name = f"run_{run_index:03d}" + + output_dir = base_dir / dir_name + output_dir.mkdir(parents=True, exist_ok=True) + + return output_dir + + def _save_run_metadata( + self, + output_dir: Path, + sweep_params: Dict[str, Any], + command: List[str], + run_index: int, + ): + """ + Save metadata about this run for reference. + + Args: + output_dir: Directory where results are stored + sweep_params: Parameters for this run + command: Full command that was executed + run_index: Index of this run + """ + metadata = { + "run_index": run_index, + "timestamp": datetime.now().isoformat(), + "config_file": str(self.config_path), + "sweep_params": sweep_params, + "base_config": self.config.base_config.model_dump(), + "command": " ".join(command), + } + + metadata_file = output_dir / "run_metadata.json" + with open(metadata_file, "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2) + + def run(self, dry_run: bool = False, show_progress: bool = True) -> int: + """ + Execute all benchmark runs based on configuration. 
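The parameter-to-flag conversion inside `_build_command` above can be exercised on its own. The sketch below is a simplified copy of that loop for illustration (not the method itself), showing how `rampup_seconds` is dropped, `benchmark_seconds` is remapped to `--benchmark-duration`, and underscores become hyphens:

    def params_to_cli_args(params: dict) -> list:
        """Simplified version of the conversion loop in _build_command."""
        args = []
        for key, value in params.items():
            if key == "rampup_seconds":      # consumed when deriving request_rate, never passed to aiperf
                continue
            if key == "benchmark_seconds":   # aiperf expects --benchmark-duration
                key = "benchmark_duration"
            flag = "--" + key.replace("_", "-")
            if isinstance(value, bool):
                if value:
                    args.append(flag)
            elif isinstance(value, list):
                for item in value:
                    args.extend([flag, str(item)])
            elif value is not None:
                args.extend([flag, str(value)])
        return args

    print(params_to_cli_args({"concurrency": 10, "benchmark_seconds": 60, "random_seed": None}))
    # ['--concurrency', '10', '--benchmark-duration', '60']
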
+ + Args: + dry_run: If True, print commands without executing + show_progress: If True, show progress bar with tqdm + + Returns: + Exit code (0 for success, non-zero for failure) + """ + # Get base output directory + base_output_dir = self.config.get_output_base_path() + + # Create timestamped batch directory + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + batch_name = self.config.batch_name + batch_dir = base_output_dir / f"{batch_name}_{timestamp}" + + # Generate all sweep combinations + combinations = self._get_sweep_combinations() + + log.info("=" * 80) + log.info("AIPerf Benchmark Runner") + log.info("=" * 80) + log.info("Configuration: %s", self.config_path) + log.info("Batch directory: %s", batch_dir) + log.info("Number of runs: %s", len(combinations)) + log.info("=" * 80) + + if dry_run: + log.info("DRY RUN MODE - Commands will not be executed") + + # Execute each combination + failed_runs = [] + + # Setup progress bar + progress_bar = tqdm( + enumerate(combinations, start=1), + total=len(combinations), + desc="Benchmark Progress", + unit="run", + disable=not show_progress, + ncols=100, + ) + + for i, sweep_params in progress_bar: + # Update progress bar description with current run info + if show_progress: + params_desc = ( + ", ".join(f"{k}={v}" for k, v in sorted(sweep_params.items())) + if sweep_params + else "base config" + ) + progress_bar.set_description( + f"Run {i}/{len(combinations)}: {params_desc[:40]}" + ) + + # Create output directory for this run + run_output_dir = self._create_output_dir(batch_dir, sweep_params, i) + + # Build command + command = self._build_command(sweep_params, run_output_dir) + + # Save metadata + self._save_run_metadata(run_output_dir, sweep_params, command, i) + + log.info("Run %s/%s", i, len(combinations)) + log.info( + "Parameters: %s", sweep_params if sweep_params else "base config only" + ) + log.info("Output directory: %s", run_output_dir) + log.info("Command: %s", " ".join(command)) + + if not dry_run: + try: + # Execute the command + subprocess.run( + command, + check=True, + capture_output=False, # Let output stream to console + text=True, + ) + log.info("✓ Run %s completed successfully", i) + if show_progress: + progress_bar.set_postfix_str("✓ Success") + except subprocess.CalledProcessError as e: + log.error("✗ Run %s failed with exit code %s", i, e.returncode) + failed_runs.append((i, sweep_params)) + if show_progress: + progress_bar.set_postfix_str("✗ Failed") + except KeyboardInterrupt: + log.warning("Interrupted by user") + progress_bar.close() + return 130 + + # Close progress bar + progress_bar.close() + + # Log summary + log.info("=" * 80) + log.info("Benchmark Run Summary") + log.info("=" * 80) + log.info("Total runs: %s", len(combinations)) + log.info("Successful: %s", len(combinations) - len(failed_runs)) + log.info("Failed: %s", len(failed_runs)) + + if failed_runs: + log.warning("Failed runs:") + for run_index, params in failed_runs: + log.warning(" - Run %s: %s", run_index, params) + + log.info("Results stored in: %s", batch_dir) + log.info("=" * 80) + + return 0 if not failed_runs else 1 + + +# Create typer app +app = typer.Typer( + help="Run aiperf benchmarks with configurable parameters and sweeps", + add_completion=False, +) + + +@app.command() +def main( + config_file: Path = typer.Argument( + ..., + help="Path to YAML configuration file", + exists=True, + file_okay=True, + dir_okay=False, + readable=True, + ), + dry_run: bool = typer.Option( + False, + "--dry-run", + help="Print commands without executing 
them", + ), + no_progress: bool = typer.Option( + False, + "--no-progress", + help="Disable progress bar display", + ), +): + """ + Run aiperf benchmarks with configurable parameters and sweeps. + + Example configuration file (config.yaml): + + batch_name: my_benchmark + output_base_dir: ./benchmark_results + + base_config: + model-names: gpt-3.5-turbo + url: localhost:8000 + request-count: 100 + random-seed: 42 + prompt-input-tokens-mean: 100 + prompt-input-tokens-stddev: 10 + prompt-output-tokens-mean: 50 + prompt-output-tokens-stddev: 5 + + sweeps: + request-rate: [10, 20, 50] + concurrency: [1, 5, 10] + + This configuration will run 9 benchmark tests (3 request rates × 3 concurrency levels). + """ + # Create and run the benchmark runner + runner = AIPerfRunner(config_file) + exit_code = runner.run(dry_run=dry_run, show_progress=not no_progress) + + raise typer.Exit(code=exit_code) + + +if __name__ == "__main__": + app() diff --git a/nemoguardrails/llm/providers/huggingface/streamers.py b/nemoguardrails/llm/providers/huggingface/streamers.py index 14c406124..4afb44ac4 100644 --- a/nemoguardrails/llm/providers/huggingface/streamers.py +++ b/nemoguardrails/llm/providers/huggingface/streamers.py @@ -18,8 +18,8 @@ TRANSFORMERS_AVAILABLE = True try: - from transformers.generation.streamers import ( # type: ignore[import-untyped] - TextStreamer, + from transformers.generation.streamers import ( + TextStreamer, # type: ignore[import-untyped] ) except ImportError: # Fallback if transformers is not available From 436a804faba50bc3a8cd33103bb41b21ec69cbd4 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 13 Nov 2025 09:45:24 -0600 Subject: [PATCH 02/30] Remove the rampup_seconds calculation, use warmup request count instead --- .../aiperf_configs/concurrency_sweep.yaml | 10 +- .../benchmark/aiperf/aiperf_models.py | 15 +- nemoguardrails/benchmark/aiperf/run_aiperf.py | 189 ++++++------------ 3 files changed, 65 insertions(+), 149 deletions(-) diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/concurrency_sweep.yaml b/nemoguardrails/benchmark/aiperf/aiperf_configs/concurrency_sweep.yaml index 610103582..59b871e04 100644 --- a/nemoguardrails/benchmark/aiperf/aiperf_configs/concurrency_sweep.yaml +++ b/nemoguardrails/benchmark/aiperf/aiperf_configs/concurrency_sweep.yaml @@ -1,7 +1,7 @@ # Concurrency sweep. One-minute tests at log-spaced concurrencies # Name for this batch of benchmarks (will be part of output directory name) -batch_name: simple_regression_benchmark +batch_name: simple_regression # Base directory where all benchmark results will be stored. # Actual name is // @@ -17,11 +17,9 @@ base_config: endpoint_type: chat # Load generation settings. 
- # Ramp-up time = concurrency / request-rate (https://github.com/ai-dynamo/aiperf/blob/b91408d3df0df78236748017b46292cd74e65dc6/docs/tutorials/request-rate-concurrency.md) - # So request_rate = concurrency / rampup_seconds - rampup_seconds: 10 + warmup_request_count: 10 benchmark_seconds: 60 - concurrency: 16 + concurrency: 0 # Overridden by the concurrency sweep below request_rate_mode: "constant" # Synthetic data generation @@ -35,4 +33,4 @@ base_config: # The script will run all combinations (Cartesian product) sweeps: # Sweep over the following concurrency values - concurrency: [1, 10, 100, 1000] + concurrency: [1, 10, 100] diff --git a/nemoguardrails/benchmark/aiperf/aiperf_models.py b/nemoguardrails/benchmark/aiperf/aiperf_models.py index 23560ac90..7ce3ac767 100644 --- a/nemoguardrails/benchmark/aiperf/aiperf_models.py +++ b/nemoguardrails/benchmark/aiperf/aiperf_models.py @@ -38,13 +38,9 @@ class BaseConfig(BaseModel): ) # Load generation settings - rampup_seconds: int = Field(description="Ramp-up time in seconds") + warmup_request_count: int = Field(description="Number of warmup requests") benchmark_seconds: int = Field(description="Benchmark duration in seconds") concurrency: int = Field(description="Number of concurrent requests") - request_rate: Optional[float] = Field( - default=None, - description="Request rate (requests per second, auto-calculated if not provided)", - ) request_rate_mode: Optional[Literal["constant", "poisson"]] = Field( default="constant", description="Request rate mode (constant, poisson, etc.)", @@ -80,13 +76,6 @@ class BaseConfig(BaseModel): default="none", description="UI to use while running regression" ) - @model_validator(mode="after") - def calculate_request_rate(self): - """Calculate request_rate if not provided.""" - if self.request_rate is None: - self.request_rate = int(self.concurrency / self.rampup_seconds) - return self - class AIPerfConfig(BaseModel): """Main configuration model for AIPerf benchmark runner.""" @@ -101,7 +90,7 @@ class AIPerfConfig(BaseModel): base_config: BaseConfig = Field( ..., description="Base configuration applied to all benchmark runs" ) - sweeps: Optional[Dict[str, List[Union[int, float, str]]]] = Field( + sweeps: Optional[Dict[str, List[Union[int, str]]]] = Field( default=None, description="Parameter sweeps. Key is the parameter to change, value is a list of values to use", ) diff --git a/nemoguardrails/benchmark/aiperf/run_aiperf.py b/nemoguardrails/benchmark/aiperf/run_aiperf.py index c174e844b..ee2724f51 100755 --- a/nemoguardrails/benchmark/aiperf/run_aiperf.py +++ b/nemoguardrails/benchmark/aiperf/run_aiperf.py @@ -14,13 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -AIPerf Benchmark Runner - -This script orchestrates multiple aiperf benchmark runs based on a YAML configuration file. -It supports parameter sweeps and organizes results in a structured directory hierarchy. -""" - import itertools import json import logging @@ -28,19 +21,18 @@ import sys from datetime import datetime from pathlib import Path -from typing import Any, Dict, List +from subprocess import CompletedProcess +from typing import Any, Dict, List, Optional, Union import typer import yaml from aiperf_models import AIPerfConfig from pydantic import ValidationError -from tqdm import tqdm -# 1. 
Get a logger instance +# Set up logging log = logging.getLogger(__name__) -log.setLevel(logging.DEBUG) # Set the lowest level to capture all messages +log.setLevel(logging.INFO) # Set the lowest level to capture all messages -# Set up formatter and direct it to the console formatter = logging.Formatter( "%(asctime)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S" ) @@ -53,15 +45,9 @@ class AIPerfRunner: - """Manages execution of aiperf benchmark runs with configurable parameters.""" + """Run batches of AIPerf benchmarks using YAML config and optional parameter sweeps""" def __init__(self, config_path: Path): - """ - Initialize the runner with a configuration file. - - Args: - config_path: Path to the YAML configuration file - """ self.config_path = config_path self.config = self._load_config() @@ -89,48 +75,36 @@ def _load_config(self) -> AIPerfConfig: sys.exit(1) def _get_sweep_combinations(self) -> List[Dict[str, Any]]: - """ - Generate all parameter combinations from sweep configurations. + """Create cartesian-product of parameter sweep values for benchmarks""" - Returns: - List of dictionaries, each representing one parameter combination - """ - sweeps = self.config.sweeps - - if not sweeps: + if not self.config.sweeps: # No sweeps, return single empty combination return [{}] # Extract parameter names and their values - param_names = list(sweeps.keys()) - param_values = [sweeps[name] for name in param_names] + param_names = list(self.config.sweeps.keys()) + param_values = [self.config.sweeps[name] for name in param_names] # Generate all combinations combinations = [] - for combo in itertools.product(*param_values): - combinations.append(dict(zip(param_names, combo))) + for combination in itertools.product(*param_values): + combinations.append(dict(zip(param_names, combination))) return combinations def _build_command( - self, sweep_params: Dict[str, Any], output_dir: Path + self, sweep_params: Optional[Dict[str, Union[str, int]]], output_dir: Path ) -> List[str]: - """ - Build the aiperf command with given parameters. + """Build the aiperf command with given parameters.""" - Args: - sweep_params: Parameter overrides from sweep - output_dir: Directory to store output artifacts - - Returns: - Command as list of strings - """ + # Run aiperf in profile mode: `aiperf profile` cmd = ["aiperf", "profile"] - # Get base config as dictionary with hyphenated keys + # Get base config as dictionary base_params = self.config.base_config.model_dump() + # Merge base config with sweep params (sweep params override base) - params = {**base_params, **sweep_params} + params = base_params if not sweep_params else {**base_params, **sweep_params} # Add output directory params["output-artifact-dir"] = str(output_dir) @@ -140,8 +114,8 @@ def _build_command( item_key = key # Rampup seconds is used to derive `request_rate` in the BaseConfig model, don't pass # it to the aiperf invocation - if key == "rampup_seconds": - continue + # if key == "rampup_seconds": + # continue # Convert the `benchmark_seconds` in config file to `benchmark_duration` key if key == "benchmark_seconds": @@ -163,35 +137,22 @@ def _build_command( return cmd + @staticmethod def _create_output_dir( - self, base_dir: Path, sweep_params: Dict[str, Any], run_index: int + base_dir: Path, + sweep_params: Optional[Dict[str, Union[str, int]]], ) -> Path: - """ - Create a descriptive output directory for this run. 
+ """Create directory in which to place AIPerf outputs.""" - Args: - base_dir: Base output directory - sweep_params: Parameters for this run - run_index: Index of this run in the sequence + # Early-out if we're not sweeping anything + if not sweep_params: + return base_dir - Returns: - Path to the created output directory - """ - # Create descriptive directory name - if sweep_params: - # Create name from sweep parameters - param_parts = [] - for key, value in sorted(sweep_params.items()): - # Shorten common parameter names - short_key = key.replace("prompt-", "").replace("tokens-", "") - param_parts.append(f"{short_key}={value}") - dir_name = f"run_{run_index:03d}_" + "_".join(param_parts) - else: - dir_name = f"run_{run_index:03d}" - - output_dir = base_dir / dir_name - output_dir.mkdir(parents=True, exist_ok=True) + param_parts = [f"{key}{value}" for key, value in sorted(sweep_params.items())] + param_dir = "_".join(param_parts) + output_dir = base_dir / param_dir + output_dir.mkdir(parents=True, exist_ok=True) return output_dir def _save_run_metadata( @@ -223,16 +184,18 @@ def _save_run_metadata( with open(metadata_file, "w", encoding="utf-8") as f: json.dump(metadata, f, indent=2) - def run(self, dry_run: bool = False, show_progress: bool = True) -> int: - """ - Execute all benchmark runs based on configuration. + def _save_subprocess_result_json( + self, output_dir: Path, result: CompletedProcess + ) -> None: + """Save the subprocess result to the given filename""" - Args: - dry_run: If True, print commands without executing - show_progress: If True, show progress bar with tqdm + process_result_file = output_dir / "process_result.json" + with open(process_result_file, "w", encoding="utf-8") as f: + json.dump(result.__dict__, f, indent=2) - Returns: - Exit code (0 for success, non-zero for failure) + def run(self, dry_run: bool = False) -> int: + """ + Run benchmarks with AIPerf """ # Get base output directory base_output_dir = self.config.get_output_base_path() @@ -240,18 +203,15 @@ def run(self, dry_run: bool = False, show_progress: bool = True) -> int: # Create timestamped batch directory timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") batch_name = self.config.batch_name - batch_dir = base_output_dir / f"{batch_name}_{timestamp}" + batch_dir = base_output_dir / batch_name / timestamp # Generate all sweep combinations combinations = self._get_sweep_combinations() - log.info("=" * 80) - log.info("AIPerf Benchmark Runner") - log.info("=" * 80) - log.info("Configuration: %s", self.config_path) + log.info("Running AIPerf with configuration: %s", self.config_path) log.info("Batch directory: %s", batch_dir) + log.info("Sweep parameters: %s", combinations) log.info("Number of runs: %s", len(combinations)) - log.info("=" * 80) if dry_run: log.info("DRY RUN MODE - Commands will not be executed") @@ -259,30 +219,10 @@ def run(self, dry_run: bool = False, show_progress: bool = True) -> int: # Execute each combination failed_runs = [] - # Setup progress bar - progress_bar = tqdm( - enumerate(combinations, start=1), - total=len(combinations), - desc="Benchmark Progress", - unit="run", - disable=not show_progress, - ncols=100, - ) - - for i, sweep_params in progress_bar: - # Update progress bar description with current run info - if show_progress: - params_desc = ( - ", ".join(f"{k}={v}" for k, v in sorted(sweep_params.items())) - if sweep_params - else "base config" - ) - progress_bar.set_description( - f"Run {i}/{len(combinations)}: {params_desc[:40]}" - ) - + for i, sweep_params in 
enumerate(combinations): + run_num = i + 1 # Create output directory for this run - run_output_dir = self._create_output_dir(batch_dir, sweep_params, i) + run_output_dir = self._create_output_dir(batch_dir, sweep_params) # Build command command = self._build_command(sweep_params, run_output_dir) @@ -290,42 +230,32 @@ def run(self, dry_run: bool = False, show_progress: bool = True) -> int: # Save metadata self._save_run_metadata(run_output_dir, sweep_params, command, i) - log.info("Run %s/%s", i, len(combinations)) + log.info("Run %s/%s", run_num, len(combinations)) log.info( "Parameters: %s", sweep_params if sweep_params else "base config only" ) - log.info("Output directory: %s", run_output_dir) - log.info("Command: %s", " ".join(command)) + log.debug("Output directory: %s", run_output_dir) + log.debug("Command: %s", " ".join(command)) if not dry_run: try: # Execute the command - subprocess.run( - command, - check=True, - capture_output=False, # Let output stream to console - text=True, + result = subprocess.run( + command, check=True, capture_output=True, text=True ) - log.info("✓ Run %s completed successfully", i) - if show_progress: - progress_bar.set_postfix_str("✓ Success") + log.info("Run %s completed successfully", run_num) + + self._save_subprocess_result_json(run_output_dir, result) + except subprocess.CalledProcessError as e: - log.error("✗ Run %s failed with exit code %s", i, e.returncode) + log.error("Run %s failed with exit code %s", i, e.returncode) failed_runs.append((i, sweep_params)) - if show_progress: - progress_bar.set_postfix_str("✗ Failed") except KeyboardInterrupt: log.warning("Interrupted by user") - progress_bar.close() return 130 - # Close progress bar - progress_bar.close() - # Log summary - log.info("=" * 80) - log.info("Benchmark Run Summary") - log.info("=" * 80) + log.info("SUMMARY") log.info("Total runs: %s", len(combinations)) log.info("Successful: %s", len(combinations) - len(failed_runs)) log.info("Failed: %s", len(failed_runs)) @@ -336,9 +266,8 @@ def run(self, dry_run: bool = False, show_progress: bool = True) -> int: log.warning(" - Run %s: %s", run_index, params) log.info("Results stored in: %s", batch_dir) - log.info("=" * 80) - return 0 if not failed_runs else 1 + return 1 if failed_runs else 0 # Create typer app @@ -395,7 +324,7 @@ def main( """ # Create and run the benchmark runner runner = AIPerfRunner(config_file) - exit_code = runner.run(dry_run=dry_run, show_progress=not no_progress) + exit_code = runner.run(dry_run=dry_run) raise typer.Exit(code=exit_code) From 3304f365dd5b9eb2b33169a34b30d057795149be Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 13 Nov 2025 11:46:38 -0600 Subject: [PATCH 03/30] Add benchmark directory to pyright type-checking --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 03ebc905a..f3452a964 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -163,6 +163,7 @@ include = [ "nemoguardrails/tracing/**", "nemoguardrails/server/**", "tests/test_callbacks.py", + "nemoguardrails/benchmark/**" ] exclude = [ "nemoguardrails/llm/providers/trtllm/**", From 777bba247bd91302fe6fc07e48971655cefa6f94 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 13 Nov 2025 12:51:38 -0600 Subject: [PATCH 04/30] Add aiperf typer to the top-level nemoguardrails app --- nemoguardrails/benchmark/aiperf/__init__.py | 7 ---- nemoguardrails/benchmark/aiperf/run_aiperf.py | 41 +++---------------- 
nemoguardrails/cli/__init__.py | 1 + 3 files changed, 6 insertions(+), 43 deletions(-) diff --git a/nemoguardrails/benchmark/aiperf/__init__.py b/nemoguardrails/benchmark/aiperf/__init__.py index 3b074a794..3159bfe65 100644 --- a/nemoguardrails/benchmark/aiperf/__init__.py +++ b/nemoguardrails/benchmark/aiperf/__init__.py @@ -12,10 +12,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -"""AIPerf benchmark runner and configuration models.""" - -from .aiperf_config import AIPerfConfig, BaseConfig -from .aiperf_runner import AIPerfRunner - -__all__ = ["AIPerfConfig", "BaseConfig", "AIPerfRunner"] diff --git a/nemoguardrails/benchmark/aiperf/run_aiperf.py b/nemoguardrails/benchmark/aiperf/run_aiperf.py index ee2724f51..18265864f 100755 --- a/nemoguardrails/benchmark/aiperf/run_aiperf.py +++ b/nemoguardrails/benchmark/aiperf/run_aiperf.py @@ -26,9 +26,10 @@ import typer import yaml -from aiperf_models import AIPerfConfig from pydantic import ValidationError +from nemoguardrails.benchmark.aiperf.aiperf_models import AIPerfConfig + # Set up logging log = logging.getLogger(__name__) log.setLevel(logging.INFO) # Set the lowest level to capture all messages @@ -272,13 +273,13 @@ def run(self, dry_run: bool = False) -> int: # Create typer app app = typer.Typer( - help="Run aiperf benchmarks with configurable parameters and sweeps", + help="AIPerf application to run, analyze, and compare benchmarks", add_completion=False, ) @app.command() -def main( +def run( config_file: Path = typer.Argument( ..., help="Path to YAML configuration file", @@ -292,42 +293,10 @@ def main( "--dry-run", help="Print commands without executing them", ), - no_progress: bool = typer.Option( - False, - "--no-progress", - help="Disable progress bar display", - ), ): - """ - Run aiperf benchmarks with configurable parameters and sweeps. - - Example configuration file (config.yaml): - - batch_name: my_benchmark - output_base_dir: ./benchmark_results - - base_config: - model-names: gpt-3.5-turbo - url: localhost:8000 - request-count: 100 - random-seed: 42 - prompt-input-tokens-mean: 100 - prompt-input-tokens-stddev: 10 - prompt-output-tokens-mean: 50 - prompt-output-tokens-stddev: 5 - - sweeps: - request-rate: [10, 20, 50] - concurrency: [1, 5, 10] - - This configuration will run 9 benchmark tests (3 request rates × 3 concurrency levels). 
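The 3 × 3 = 9 expansion described in this example comes from the Cartesian product that `_get_sweep_combinations` builds with `itertools.product`. A minimal, self-contained sketch of that expansion using the sweep values from the example config above (the helper name here is illustrative, not part of the runner):

    import itertools
    from typing import Any, Dict, List

    def expand_sweeps(sweeps: Dict[str, List[Any]]) -> List[Dict[str, Any]]:
        """Return one parameter dict per combination of sweep values."""
        if not sweeps:
            return [{}]  # no sweeps -> a single run using only the base config
        names = list(sweeps.keys())
        values = [sweeps[name] for name in names]
        return [dict(zip(names, combo)) for combo in itertools.product(*values)]

    combos = expand_sweeps({"request-rate": [10, 20, 50], "concurrency": [1, 5, 10]})
    print(len(combos))   # 9
    print(combos[0])     # {'request-rate': 10, 'concurrency': 1}
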
- """ + """Run AIPerf benchmark using the provided YAML config file""" # Create and run the benchmark runner runner = AIPerfRunner(config_file) exit_code = runner.run(dry_run=dry_run) raise typer.Exit(code=exit_code) - - -if __name__ == "__main__": - app() diff --git a/nemoguardrails/cli/__init__.py b/nemoguardrails/cli/__init__.py index 97a8faed6..c40ecf7b4 100644 --- a/nemoguardrails/cli/__init__.py +++ b/nemoguardrails/cli/__init__.py @@ -25,6 +25,7 @@ from nemoguardrails import __version__ from nemoguardrails.actions_server import actions_server +from nemoguardrails.benchmark.aiperf.run_aiperf import app as aiperf_app from nemoguardrails.cli.chat import run_chat from nemoguardrails.cli.migration import migrate from nemoguardrails.cli.providers import _list_providers, select_provider_with_type From 05123b22c7e50b62c04c781499fe98c1cafbadce Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:13:57 -0600 Subject: [PATCH 05/30] Add a quick GET to the /v1/models endpoint before running any benchamrks --- nemoguardrails/benchmark/aiperf/run_aiperf.py | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/nemoguardrails/benchmark/aiperf/run_aiperf.py b/nemoguardrails/benchmark/aiperf/run_aiperf.py index 18265864f..81a1a8a36 100755 --- a/nemoguardrails/benchmark/aiperf/run_aiperf.py +++ b/nemoguardrails/benchmark/aiperf/run_aiperf.py @@ -19,11 +19,13 @@ import logging import subprocess import sys +import urllib.parse from datetime import datetime from pathlib import Path from subprocess import CompletedProcess from typing import Any, Dict, List, Optional, Union +import httpx import typer import yaml from pydantic import ValidationError @@ -96,7 +98,7 @@ def _get_sweep_combinations(self) -> List[Dict[str, Any]]: def _build_command( self, sweep_params: Optional[Dict[str, Union[str, int]]], output_dir: Path ) -> List[str]: - """Build the aiperf command with given parameters.""" + """Create a list of strings with the aiperf command and arguments to execute""" # Run aiperf in profile mode: `aiperf profile` cmd = ["aiperf", "profile"] @@ -113,10 +115,6 @@ def _build_command( # Convert parameters to command line arguments for key, value in params.items(): item_key = key - # Rampup seconds is used to derive `request_rate` in the BaseConfig model, don't pass - # it to the aiperf invocation - # if key == "rampup_seconds": - # continue # Convert the `benchmark_seconds` in config file to `benchmark_duration` key if key == "benchmark_seconds": @@ -143,7 +141,7 @@ def _create_output_dir( base_dir: Path, sweep_params: Optional[Dict[str, Union[str, int]]], ) -> Path: - """Create directory in which to place AIPerf outputs.""" + """Create directory in which to store AIPerf outputs.""" # Early-out if we're not sweeping anything if not sweep_params: @@ -163,15 +161,7 @@ def _save_run_metadata( command: List[str], run_index: int, ): - """ - Save metadata about this run for reference. 
- - Args: - output_dir: Directory where results are stored - sweep_params: Parameters for this run - command: Full command that was executed - run_index: Index of this run - """ + """Save metadata about the run for future reruns or analysis""" metadata = { "run_index": run_index, "timestamp": datetime.now().isoformat(), @@ -185,8 +175,9 @@ def _save_run_metadata( with open(metadata_file, "w", encoding="utf-8") as f: json.dump(metadata, f, indent=2) + @staticmethod def _save_subprocess_result_json( - self, output_dir: Path, result: CompletedProcess + output_dir: Path, result: CompletedProcess ) -> None: """Save the subprocess result to the given filename""" @@ -194,10 +185,26 @@ def _save_subprocess_result_json( with open(process_result_file, "w", encoding="utf-8") as f: json.dump(result.__dict__, f, indent=2) + @staticmethod + def _check_service_endpoint(url: str) -> None: + """Check if the service is up before we run the benchmarks""" + + try: + response = httpx.get(url, timeout=5) + except httpx.ConnectError as e: + raise RuntimeError(f"Can't connect to {url}: {e}") + + if response.status_code != 200: + raise RuntimeError(f"Can't access {url}: {response}") + def run(self, dry_run: bool = False) -> int: - """ - Run benchmarks with AIPerf - """ + """Run benchmarks with AIPerf""" + + # Check the service is up before running anything + service_url = urllib.parse.urljoin(self.config.base_config.url, "v1/models") + log.info("Checking service is up using %s", service_url) + self._check_service_endpoint(service_url) + # Get base output directory base_output_dir = self.config.get_output_base_path() @@ -280,8 +287,9 @@ def run(self, dry_run: bool = False) -> int: @app.command() def run( - config_file: Path = typer.Argument( + config_file: Path = typer.Option( ..., + "--config-file", help="Path to YAML configuration file", exists=True, file_okay=True, From dde5d89b8f83b25f40b5b983997f16f6b0e97257 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 13 Nov 2025 14:28:27 -0600 Subject: [PATCH 06/30] Add tests for aiperf Pydantic models --- .../benchmark/aiperf/aiperf_models.py | 100 ++--- tests/benchmark/test_aiperf_models.py | 377 ++++++++++++++++++ 2 files changed, 421 insertions(+), 56 deletions(-) create mode 100644 tests/benchmark/test_aiperf_models.py diff --git a/nemoguardrails/benchmark/aiperf/aiperf_models.py b/nemoguardrails/benchmark/aiperf/aiperf_models.py index 7ce3ac767..a926ecb4c 100644 --- a/nemoguardrails/benchmark/aiperf/aiperf_models.py +++ b/nemoguardrails/benchmark/aiperf/aiperf_models.py @@ -18,9 +18,9 @@ """ from pathlib import Path -from typing import Dict, List, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union -from pydantic import BaseModel, Field, model_validator +from pydantic import BaseModel, Field, field_validator, model_validator class BaseConfig(BaseModel): @@ -38,9 +38,15 @@ class BaseConfig(BaseModel): ) # Load generation settings - warmup_request_count: int = Field(description="Number of warmup requests") + warmup_request_count: int = Field( + description="Requests to send before beginning performance-test" + ) benchmark_seconds: int = Field(description="Benchmark duration in seconds") concurrency: int = Field(description="Number of concurrent requests") + request_rate: Optional[float] = Field( + default=None, + description="Request rate (requests per second, auto-calculated if not provided)", + ) request_rate_mode: Optional[Literal["constant", "poisson"]] = Field( 
default="constant", description="Request rate mode (constant, poisson, etc.)", @@ -67,15 +73,6 @@ class BaseConfig(BaseModel): description="Standard deviation of output tokens", ) - # TODO! Come up with better ways to set these - tokenizer: str = Field( - default="/Users/tgasser/projects/aiperf/tokenizers/llama-3.3-70b", - description="Path to tokenizer", - ) - ui_type: Literal["dashboard", "simple", "none"] = Field( - default="none", description="UI to use while running regression" - ) - class AIPerfConfig(BaseModel): """Main configuration model for AIPerf benchmark runner.""" @@ -95,50 +92,41 @@ class AIPerfConfig(BaseModel): description="Parameter sweeps. Key is the parameter to change, value is a list of values to use", ) - # TODO! Add validation for sweeps - - # @field_validator("sweeps") - # @classmethod - # def validate_sweeps( - # cls, v: Optional[Dict[str, List[Any]]] - # ) -> Optional[Dict[str, List[Any]]]: - # """Validate that sweep values are lists.""" - # if v is None: - # return v - - # for param_name, values in v.items(): - # if not isinstance(values, list): - # raise ValueError( - # f"Sweep parameter '{param_name}' must be a list, got {type(values)}" - # ) - # if len(values) == 0: - # raise ValueError(f"Sweep parameter '{param_name}' cannot be empty") - - # return v - - # @model_validator(mode="after") - # def validate_sweep_keys(self): - # """Validate that sweep keys exist in base_config.""" - # if self.sweeps is None: - # return self - - # # Get all valid field names from BaseConfig (using hyphenated versions) - # valid_keys = set() - # for field_name, field_info in BaseConfig.model_fields.items(): - # # Use alias if available, otherwise convert underscores to hyphens - # if field_info.alias: - # valid_keys.add(field_info.alias) - - # # Check each sweep parameter - # for param_name in self.sweeps.keys(): - # if param_name not in valid_keys: - # valid_fields = sorted(valid_keys) - # raise ValueError( - # f"Sweep parameter '{param_name}' is not a valid BaseConfig field. " - # f"Valid fields are: {', '.join(valid_fields)}" - # ) - - # return self + @field_validator("sweeps") + @classmethod + def validate_sweeps( + cls, v: Optional[Dict[str, List[Any]]] + ) -> Optional[Dict[str, List[Any]]]: + """Validate that sweep values are lists of ints or strings.""" + if v is None: + return v + + for param_name, values in v.items(): + if len(values) == 0: + raise ValueError(f"Sweep parameter '{param_name}' cannot be empty") + + return v + + @model_validator(mode="after") + def validate_sweep_keys(self): + """Validate that sweep keys exist in base_config.""" + sweeps = self.sweeps + if sweeps is None: + return self + + # Get all valid field names from BaseConfig + valid_keys = set(BaseConfig.model_fields.keys()) + + # Check each sweep parameter + for param_name in sweeps: + if param_name not in valid_keys: + valid_fields = sorted(valid_keys) + raise ValueError( + f"Sweep parameter '{param_name}' is not a valid BaseConfig field. " + f"Valid fields are: {', '.join(valid_fields)}" + ) + + return self def get_output_base_path(self) -> Path: """Get the base output directory as a Path object.""" diff --git a/tests/benchmark/test_aiperf_models.py b/tests/benchmark/test_aiperf_models.py new file mode 100644 index 000000000..82b3698df --- /dev/null +++ b/tests/benchmark/test_aiperf_models.py @@ -0,0 +1,377 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests for AIPerf configuration models. +""" + +from pathlib import Path + +import pytest +from pydantic import ValidationError + +from nemoguardrails.benchmark.aiperf.aiperf_models import AIPerfConfig, BaseConfig + + +class TestBaseConfig: + """Test the BaseConfig model.""" + + def test_base_config_minimal_valid(self): + """Test creating BaseConfig with minimal required fields.""" + config = BaseConfig( + model="test-model", + url="http://localhost:8000", + warmup_request_count=10, + benchmark_seconds=60, + concurrency=5, + ) + assert config.model == "test-model" + assert config.url == "http://localhost:8000" + assert config.endpoint == "/v1/chat/completions" # Default + assert config.endpoint_type == "chat" # Default + assert config.warmup_request_count == 10 + assert config.benchmark_seconds == 60 + assert config.concurrency == 5 + assert config.request_rate_mode == "constant" # Default + + def test_base_config_with_all_fields(self): + """Test creating BaseConfig with all fields specified.""" + config = BaseConfig( + model="test-model", + url="http://localhost:8000", + endpoint="/v1/completions", + endpoint_type="completions", + warmup_request_count=10, + benchmark_seconds=60, + concurrency=5, + request_rate=2.5, + request_rate_mode="poisson", + random_seed=42, + prompt_input_tokens_mean=100, + prompt_input_tokens_stddev=10, + prompt_output_tokens_mean=50, + prompt_output_tokens_stddev=5, + ) + assert config.model == "test-model" + assert config.endpoint == "/v1/completions" + assert config.endpoint_type == "completions" + assert config.request_rate == 2.5 + assert config.request_rate_mode == "poisson" + assert config.random_seed == 42 + assert config.prompt_input_tokens_mean == 100 + assert config.prompt_input_tokens_stddev == 10 + assert config.prompt_output_tokens_mean == 50 + assert config.prompt_output_tokens_stddev == 5 + + def test_base_config_missing_required_fields(self): + """Test that missing required fields raise validation error.""" + with pytest.raises(ValidationError) as exc_info: + BaseConfig( + model="test-model", + url="http://localhost:8000", + # Missing warmup_request_count, benchmark_seconds, concurrency + ) + errors = exc_info.value.errors() + error_fields = {err["loc"][0] for err in errors} + assert "warmup_request_count" in error_fields + assert "benchmark_seconds" in error_fields + assert "concurrency" in error_fields + + def test_base_config_invalid_endpoint_type(self): + """Test that invalid endpoint_type raises validation error.""" + with pytest.raises(ValidationError) as exc_info: + BaseConfig( + model="test-model", + url="http://localhost:8000", + endpoint_type="invalid", # Must be "chat" or "completions" + warmup_request_count=10, + benchmark_seconds=60, + concurrency=5, + ) + errors = exc_info.value.errors() + assert any("endpoint_type" in str(err["loc"]) for err in errors) + + def test_base_config_invalid_request_rate_mode(self): + """Test that invalid 
request_rate_mode raises validation error.""" + with pytest.raises(ValidationError) as exc_info: + BaseConfig( + model="test-model", + url="http://localhost:8000", + request_rate_mode="invalid", # Must be "constant" or "poisson" + warmup_request_count=10, + benchmark_seconds=60, + concurrency=5, + ) + errors = exc_info.value.errors() + assert any("request_rate_mode" in str(err["loc"]) for err in errors) + + +class TestAIPerfConfig: + """Test the AIPerfConfig model.""" + + @pytest.fixture(autouse=True) + def valid_base_config(self) -> BaseConfig: + """Helper to get a valid base config dictionary.""" + return BaseConfig( + model="test-model", + url="http://localhost:8000", + warmup_request_count=10, + benchmark_seconds=60, + concurrency=5, + ) + + def test_aiperf_config_minimal_valid(self, valid_base_config): + """Test creating AIPerfConfig with minimal required fields.""" + config = AIPerfConfig(base_config=valid_base_config) + + assert config.batch_name == "benchmark" # Default + assert config.output_base_dir == "aiperf_results" # Default + assert config.base_config.model == "test-model" + assert config.sweeps is None + + def test_aiperf_config_with_custom_fields(self, valid_base_config): + """Test creating AIPerfConfig with custom batch_name and output_dir.""" + config = AIPerfConfig( + batch_name="my_benchmark", + output_base_dir="custom_results", + base_config=valid_base_config, + ) + assert config.batch_name == "my_benchmark" + assert config.output_base_dir == "custom_results" + assert config.base_config.model == "test-model" + + def test_aiperf_config_with_valid_sweeps_int(self, valid_base_config): + """Test creating AIPerfConfig with valid integer sweeps.""" + + sweeps: dict[str, list[int]] = { + "concurrency": [10, 20, 30], + "warmup_request_count": [5, 10, 15], + } + config = AIPerfConfig( + base_config=valid_base_config, + sweeps=sweeps, + ) + assert config.sweeps == sweeps + + def test_aiperf_config_with_valid_sweeps_str(self, valid_base_config): + """Test creating AIPerfConfig with valid string sweeps.""" + config = AIPerfConfig( + base_config=valid_base_config, + sweeps={ + "model": ["model-a", "model-b", "model-c"], + "endpoint": ["/v1/chat", "/v1/completions"], + }, + ) + assert config.sweeps == { + "model": ["model-a", "model-b", "model-c"], + "endpoint": ["/v1/chat", "/v1/completions"], + } + + def test_aiperf_config_with_valid_sweeps_mixed(self, valid_base_config): + """Test creating AIPerfConfig with mixed int and string sweeps.""" + sweeps = { + "concurrency": [10, 20], + "model": ["model-a", "model-b"], + } + config = AIPerfConfig( + base_config=valid_base_config, + sweeps=sweeps, + ) + assert config.sweeps == sweeps + + def test_aiperf_config_sweep_invalid_key(self, valid_base_config): + """Test that invalid sweep key raises validation error.""" + with pytest.raises(ValidationError) as exc_info: + AIPerfConfig( + base_config=valid_base_config, + sweeps={ + "invalid_field": [1, 2, 3], + }, + ) + error_msg = str(exc_info.value) + assert "invalid_field" in error_msg + assert "not a valid BaseConfig field" in error_msg + + def test_aiperf_config_sweep_invalid_value_type_float(self, valid_base_config): + """Test that float values in sweeps raise validation error.""" + with pytest.raises(ValidationError) as exc_info: + AIPerfConfig( + base_config=valid_base_config, + sweeps={ + "concurrency": [10, 20.5, 30], # Float not allowed + }, + ) + error_msg = str(exc_info.value) + # Pydantic catches this during type validation + assert "sweeps.concurrency" in error_msg + assert "must 
be int or str" in error_msg or "int_from_float" in error_msg + + def test_aiperf_config_sweep_invalid_value_type_dict(self, valid_base_config): + """Test that dict values in sweeps raise validation error.""" + with pytest.raises(ValidationError) as exc_info: + AIPerfConfig( + base_config=valid_base_config, + sweeps={ + "concurrency": [10, {"value": 20}, 30], # Dict not allowed + }, + ) + error_msg = str(exc_info.value) + # Pydantic catches this during type validation + assert "sweeps.concurrency" in error_msg + assert ( + "must be int or str" in error_msg + or "int_type" in error_msg + or "string_type" in error_msg + ) + + def test_aiperf_config_sweep_invalid_value_type_list(self, valid_base_config): + """Test that list values in sweeps raise validation error.""" + with pytest.raises(ValidationError) as exc_info: + AIPerfConfig( + base_config=valid_base_config, + sweeps={ + "concurrency": [10, [20, 30]], # List not allowed + }, + ) + error_msg = str(exc_info.value) + # Pydantic catches this during type validation + assert "sweeps.concurrency" in error_msg + assert ( + "must be int or str" in error_msg + or "int_type" in error_msg + or "string_type" in error_msg + ) + + def test_aiperf_config_sweep_empty_list(self, valid_base_config): + """Test that empty sweep list raises validation error.""" + with pytest.raises(ValidationError) as exc_info: + AIPerfConfig( + base_config=valid_base_config, + sweeps={ + "concurrency": [], + }, + ) + error_msg = str(exc_info.value) + assert "cannot be empty" in error_msg + + def test_aiperf_config_sweep_not_list(self, valid_base_config): + """Test that non-list sweep value raises validation error.""" + with pytest.raises(ValidationError) as exc_info: + AIPerfConfig( + base_config=valid_base_config, + sweeps={ + "concurrency": 10, # Should be a list + }, + ) + error_msg = str(exc_info.value) + # Pydantic catches this during type validation + assert "sweeps.concurrency" in error_msg + assert "must be a list" in error_msg or "list_type" in error_msg + + def test_aiperf_config_multiple_invalid_sweep_keys(self, valid_base_config): + """Test that multiple invalid sweep keys are all reported.""" + with pytest.raises(ValidationError) as exc_info: + AIPerfConfig( + base_config=valid_base_config, + sweeps={ + "invalid_field_1": [1, 2], + "invalid_field_2": [3, 4], + }, + ) + error_msg = str(exc_info.value) + # At least one of the invalid fields should be mentioned + assert "invalid_field" in error_msg + + def test_aiperf_config_get_output_base_path(self, valid_base_config): + """Test get_output_base_path method.""" + config = AIPerfConfig( + output_base_dir="custom_results", base_config=valid_base_config + ) + path = config.get_output_base_path() + assert isinstance(path, Path) + assert str(path) == "custom_results" + + def test_aiperf_config_get_output_base_path_default(self, valid_base_config): + """Test get_output_base_path method with default output_base_dir.""" + config = AIPerfConfig(base_config=valid_base_config) + path = config.get_output_base_path() + assert isinstance(path, Path) + assert str(path) == "aiperf_results" + + def test_aiperf_config_missing_base_config(self): + """Test that missing base_config raises validation error.""" + with pytest.raises(ValidationError) as exc_info: + AIPerfConfig() # Missing required base_config + errors = exc_info.value.errors() + assert any("base_config" in str(err["loc"]) for err in errors) + + def test_aiperf_config_invalid_base_config(self): + """Test that invalid base_config raises validation error.""" + with 
pytest.raises(ValidationError): + AIPerfConfig( + base_config={ + "model": "test-model", + # Missing required fields + }, + ) + + def test_aiperf_config_all_valid_sweep_keys(self, valid_base_config): + """Test sweeps with all valid BaseConfig field names.""" + config = AIPerfConfig( + base_config=valid_base_config, + sweeps={ + "model": ["model-a", "model-b"], + "url": ["http://localhost:8000", "http://localhost:8001"], + "endpoint": ["/v1/chat", "/v1/completions"], + "endpoint_type": ["chat", "completions"], + "warmup_request_count": [5, 10], + "benchmark_seconds": [30, 60], + "concurrency": [5, 10], + "request_rate_mode": ["constant", "poisson"], + "random_seed": [42, 123], + "prompt_input_tokens_mean": [100, 200], + "prompt_input_tokens_stddev": [10, 20], + "prompt_output_tokens_mean": [50, 100], + "prompt_output_tokens_stddev": [5, 10], + }, + ) + # All sweeps should be accepted + assert len(config.sweeps) == 13 + + def test_sweeps_none(self, valid_base_config): + """Test sweeps set to None don't raise Exception.""" + config = AIPerfConfig( + base_config=valid_base_config, + sweeps=None, + ) + + def test_sweeps_not_list_raises(self, valid_base_config): + """Test sweeps set to None don't raise Exception.""" + with pytest.raises(ValueError, match="Input should be a valid list"): + config = AIPerfConfig( + base_config=valid_base_config, + sweeps={"benchmark_seconds": 1}, + ) + + def test_sweeps_empty_list_raises(self, valid_base_config): + """Test sweeps set to None don't raise Exception.""" + with pytest.raises( + ValueError, + match="Sweep parameter 'concurrency' cannot be empty", + ): + config = AIPerfConfig( + base_config=valid_base_config, + sweeps={"concurrency": []}, + ) From c68bdcb9b0ddac9ab4f669a1236bb7c9b103a99d Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 13 Nov 2025 20:25:27 -0600 Subject: [PATCH 07/30] Rename benchmark_seconds to benchmark_duration, add tokenizer optional field --- .../aiperf_configs/single_concurrency.yaml | 30 +++++++++++++++++++ ...ency_sweep.yaml => sweep_concurrency.yaml} | 0 .../benchmark/aiperf/aiperf_models.py | 6 +++- tests/benchmark/test_aiperf_models.py | 24 ++++++++------- 4 files changed, 49 insertions(+), 11 deletions(-) create mode 100644 nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml rename nemoguardrails/benchmark/aiperf/aiperf_configs/{concurrency_sweep.yaml => sweep_concurrency.yaml} (100%) diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml b/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml new file mode 100644 index 000000000..4098eab44 --- /dev/null +++ b/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml @@ -0,0 +1,30 @@ +# Concurrency sweep. One-minute tests at log-spaced concurrencies + +# Name for this batch of benchmarks (will be part of output directory name) +batch_name: simple_regression + +# Base directory where all benchmark results will be stored. +# Actual name is // +output_base_dir: aiperf_results + +# Base configuration applied to all benchmark runs +# These parameters can be overridden by sweep parameters +base_config: + # Model details + model: meta-llama/Llama-3.3-70B-Instruct + url: "http://localhost:9000" + endpoint: "/v1/chat/completions" + endpoint_type: chat + + # Load generation settings. 
+ warmup_request_count: 10 + benchmark_seconds: 60 + concurrency: 100 + request_rate_mode: "constant" + + # Synthetic data generation + random_seed: 12345 + prompt_input_tokens_mean: 100 + prompt_input_tokens_stddev: 10 + prompt_output_tokens_mean: 50 + prompt_output_tokens_stddev: 5 diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/concurrency_sweep.yaml b/nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml similarity index 100% rename from nemoguardrails/benchmark/aiperf/aiperf_configs/concurrency_sweep.yaml rename to nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml diff --git a/nemoguardrails/benchmark/aiperf/aiperf_models.py b/nemoguardrails/benchmark/aiperf/aiperf_models.py index a926ecb4c..68d99f6a4 100644 --- a/nemoguardrails/benchmark/aiperf/aiperf_models.py +++ b/nemoguardrails/benchmark/aiperf/aiperf_models.py @@ -28,6 +28,10 @@ class BaseConfig(BaseModel): # Model details model: str = Field(..., description="Model name") + tokenizer: Optional[str] = Field( + default=None, + description="Optional tokenizer Huggingface name, or local directory", + ) url: str = Field(..., description="Model base URL") endpoint: str = Field( default="/v1/chat/completions", description="API endpoint path" @@ -41,7 +45,7 @@ class BaseConfig(BaseModel): warmup_request_count: int = Field( description="Requests to send before beginning performance-test" ) - benchmark_seconds: int = Field(description="Benchmark duration in seconds") + benchmark_duration: int = Field(description="Benchmark duration in seconds") concurrency: int = Field(description="Number of concurrent requests") request_rate: Optional[float] = Field( default=None, diff --git a/tests/benchmark/test_aiperf_models.py b/tests/benchmark/test_aiperf_models.py index 82b3698df..429ab774b 100644 --- a/tests/benchmark/test_aiperf_models.py +++ b/tests/benchmark/test_aiperf_models.py @@ -32,17 +32,19 @@ def test_base_config_minimal_valid(self): """Test creating BaseConfig with minimal required fields.""" config = BaseConfig( model="test-model", + tokenizer="test-tokenizer", url="http://localhost:8000", warmup_request_count=10, - benchmark_seconds=60, + benchmark_duration=60, concurrency=5, ) assert config.model == "test-model" + assert config.tokenizer == "test-tokenizer" assert config.url == "http://localhost:8000" assert config.endpoint == "/v1/chat/completions" # Default assert config.endpoint_type == "chat" # Default assert config.warmup_request_count == 10 - assert config.benchmark_seconds == 60 + assert config.benchmark_duration == 60 assert config.concurrency == 5 assert config.request_rate_mode == "constant" # Default @@ -50,11 +52,12 @@ def test_base_config_with_all_fields(self): """Test creating BaseConfig with all fields specified.""" config = BaseConfig( model="test-model", + tokenizer="test-tokenizer", url="http://localhost:8000", endpoint="/v1/completions", endpoint_type="completions", warmup_request_count=10, - benchmark_seconds=60, + benchmark_duration=60, concurrency=5, request_rate=2.5, request_rate_mode="poisson", @@ -65,6 +68,7 @@ def test_base_config_with_all_fields(self): prompt_output_tokens_stddev=5, ) assert config.model == "test-model" + assert config.tokenizer == "test-tokenizer" assert config.endpoint == "/v1/completions" assert config.endpoint_type == "completions" assert config.request_rate == 2.5 @@ -81,12 +85,12 @@ def test_base_config_missing_required_fields(self): BaseConfig( model="test-model", url="http://localhost:8000", - # Missing warmup_request_count, 
benchmark_seconds, concurrency + # Missing warmup_request_count, benchmark_duration, concurrency ) errors = exc_info.value.errors() error_fields = {err["loc"][0] for err in errors} assert "warmup_request_count" in error_fields - assert "benchmark_seconds" in error_fields + assert "benchmark_duration" in error_fields assert "concurrency" in error_fields def test_base_config_invalid_endpoint_type(self): @@ -97,7 +101,7 @@ def test_base_config_invalid_endpoint_type(self): url="http://localhost:8000", endpoint_type="invalid", # Must be "chat" or "completions" warmup_request_count=10, - benchmark_seconds=60, + benchmark_duration=60, concurrency=5, ) errors = exc_info.value.errors() @@ -111,7 +115,7 @@ def test_base_config_invalid_request_rate_mode(self): url="http://localhost:8000", request_rate_mode="invalid", # Must be "constant" or "poisson" warmup_request_count=10, - benchmark_seconds=60, + benchmark_duration=60, concurrency=5, ) errors = exc_info.value.errors() @@ -128,7 +132,7 @@ def valid_base_config(self) -> BaseConfig: model="test-model", url="http://localhost:8000", warmup_request_count=10, - benchmark_seconds=60, + benchmark_duration=60, concurrency=5, ) @@ -337,7 +341,7 @@ def test_aiperf_config_all_valid_sweep_keys(self, valid_base_config): "endpoint": ["/v1/chat", "/v1/completions"], "endpoint_type": ["chat", "completions"], "warmup_request_count": [5, 10], - "benchmark_seconds": [30, 60], + "benchmark_duration": [30, 60], "concurrency": [5, 10], "request_rate_mode": ["constant", "poisson"], "random_seed": [42, 123], @@ -362,7 +366,7 @@ def test_sweeps_not_list_raises(self, valid_base_config): with pytest.raises(ValueError, match="Input should be a valid list"): config = AIPerfConfig( base_config=valid_base_config, - sweeps={"benchmark_seconds": 1}, + sweeps={"benchmark_duration": 1}, ) def test_sweeps_empty_list_raises(self, valid_base_config): From 76a2828d2742e03fa2f18974e0e563b14ebea1d0 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 13 Nov 2025 20:26:17 -0600 Subject: [PATCH 08/30] Add single-concurrency config, rename both --- .../aiperf/aiperf_configs/single_concurrency.yaml | 9 +++++---- .../aiperf/aiperf_configs/sweep_concurrency.yaml | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml b/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml index 4098eab44..47126a8c3 100644 --- a/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml +++ b/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml @@ -11,15 +11,16 @@ output_base_dir: aiperf_results # These parameters can be overridden by sweep parameters base_config: # Model details - model: meta-llama/Llama-3.3-70B-Instruct + model: meta/llama-3.3-70b-instruct + tokenizer: meta-llama/Llama-3.3-70B-Instruct url: "http://localhost:9000" endpoint: "/v1/chat/completions" endpoint_type: chat # Load generation settings. 
- warmup_request_count: 10 - benchmark_seconds: 60 - concurrency: 100 + warmup_request_count: 20 + benchmark_duration: 60 + concurrency: 10 request_rate_mode: "constant" # Synthetic data generation diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml b/nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml index 59b871e04..abeaea85a 100644 --- a/nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml +++ b/nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml @@ -12,13 +12,14 @@ output_base_dir: aiperf_results base_config: # Model details model: meta/llama-3.3-70b-instruct + tokenizer: meta-llama/Llama-3.3-70B-Instruct url: "http://localhost:9000" endpoint: "/v1/chat/completions" endpoint_type: chat # Load generation settings. warmup_request_count: 10 - benchmark_seconds: 60 + benchmark_duration: 60 concurrency: 0 # Overridden by the concurrency sweep below request_rate_mode: "constant" From 0ee8b3400466864231725b02cea72303a4e98861 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 14 Nov 2025 10:21:56 -0600 Subject: [PATCH 09/30] Change configs to use NVCF hosted Llama 3.3 70B model --- .../aiperf/aiperf_configs/single_concurrency.yaml | 8 +++++--- .../aiperf/aiperf_configs/sweep_concurrency.yaml | 9 +++++---- nemoguardrails/benchmark/aiperf/aiperf_models.py | 4 ++++ tests/benchmark/test_aiperf_models.py | 2 ++ 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml b/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml index 47126a8c3..7b7c8fd08 100644 --- a/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml +++ b/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml @@ -1,7 +1,7 @@ # Concurrency sweep. One-minute tests at log-spaced concurrencies # Name for this batch of benchmarks (will be part of output directory name) -batch_name: simple_regression +batch_name: single_concurrency # Base directory where all benchmark results will be stored. # Actual name is // @@ -13,14 +13,16 @@ base_config: # Model details model: meta/llama-3.3-70b-instruct tokenizer: meta-llama/Llama-3.3-70B-Instruct - url: "http://localhost:9000" + url: "https://integrate.api.nvidia.com" endpoint: "/v1/chat/completions" endpoint_type: chat + api_key_env_var: NVIDIA_API_KEY + streaming: True # Load generation settings. warmup_request_count: 20 benchmark_duration: 60 - concurrency: 10 + concurrency: 1 request_rate_mode: "constant" # Synthetic data generation diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml b/nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml index abeaea85a..35e2cbc53 100644 --- a/nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml +++ b/nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml @@ -1,10 +1,10 @@ # Concurrency sweep. One-minute tests at log-spaced concurrencies # Name for this batch of benchmarks (will be part of output directory name) -batch_name: simple_regression +batch_name: sweep_concurrency # Base directory where all benchmark results will be stored. 
-# Actual name is // +# Actual name is // for sweeps output_base_dir: aiperf_results # Base configuration applied to all benchmark runs @@ -13,9 +13,10 @@ base_config: # Model details model: meta/llama-3.3-70b-instruct tokenizer: meta-llama/Llama-3.3-70B-Instruct - url: "http://localhost:9000" + url: "https://integrate.api.nvidia.com" endpoint: "/v1/chat/completions" endpoint_type: chat + api_key_env_var: NVIDIA_API_KEY # Load generation settings. warmup_request_count: 10 @@ -34,4 +35,4 @@ base_config: # The script will run all combinations (Cartesian product) sweeps: # Sweep over the following concurrency values - concurrency: [1, 10, 100] + concurrency: [1, 2, 4] diff --git a/nemoguardrails/benchmark/aiperf/aiperf_models.py b/nemoguardrails/benchmark/aiperf/aiperf_models.py index 68d99f6a4..be436a491 100644 --- a/nemoguardrails/benchmark/aiperf/aiperf_models.py +++ b/nemoguardrails/benchmark/aiperf/aiperf_models.py @@ -40,6 +40,10 @@ class BaseConfig(BaseModel): default="chat", description="Type of endpoint (chat or completions)", ) + api_key_env_var: Optional[str] = Field( + default=None, description="API key environment variable" + ) + streaming: Optional[bool] = Field(default=False, description="Streaming mode") # Load generation settings warmup_request_count: int = Field( diff --git a/tests/benchmark/test_aiperf_models.py b/tests/benchmark/test_aiperf_models.py index 429ab774b..32eb35c86 100644 --- a/tests/benchmark/test_aiperf_models.py +++ b/tests/benchmark/test_aiperf_models.py @@ -56,6 +56,7 @@ def test_base_config_with_all_fields(self): url="http://localhost:8000", endpoint="/v1/completions", endpoint_type="completions", + api_key_env_var="AIPERF_API_KEY", warmup_request_count=10, benchmark_duration=60, concurrency=5, @@ -71,6 +72,7 @@ def test_base_config_with_all_fields(self): assert config.tokenizer == "test-tokenizer" assert config.endpoint == "/v1/completions" assert config.endpoint_type == "completions" + assert config.api_key_env_var == "AIPERF_API_KEY" assert config.request_rate == 2.5 assert config.request_rate_mode == "poisson" assert config.random_seed == 42 From eb679c591c7f224199205839e61f58e597c8b60b Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 14 Nov 2025 10:27:39 -0600 Subject: [PATCH 10/30] Refactor single and sweep benchmark runs, add API key env var logic to get environment variable --- nemoguardrails/benchmark/aiperf/run_aiperf.py | 260 +++++++++++++----- 1 file changed, 197 insertions(+), 63 deletions(-) diff --git a/nemoguardrails/benchmark/aiperf/run_aiperf.py b/nemoguardrails/benchmark/aiperf/run_aiperf.py index 81a1a8a36..3cdd80314 100755 --- a/nemoguardrails/benchmark/aiperf/run_aiperf.py +++ b/nemoguardrails/benchmark/aiperf/run_aiperf.py @@ -17,13 +17,15 @@ import itertools import json import logging +import os import subprocess import sys import urllib.parse +from dataclasses import dataclass from datetime import datetime from pathlib import Path from subprocess import CompletedProcess -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union import httpx import typer @@ -47,6 +49,13 @@ log.addHandler(console_handler) +@dataclass +class AIPerfSummary: + total: int + completed: int + failed: int + + class AIPerfRunner: """Run batches of AIPerf benchmarks using YAML config and optional parameter sweeps""" @@ -77,12 +86,12 @@ def _load_config(self) -> AIPerfConfig: log.error("Unexpected error loading configuration: %s", e) sys.exit(1) - def 
_get_sweep_combinations(self) -> List[Dict[str, Any]]: + def _get_sweep_combinations(self) -> Optional[List[Dict[str, Union[int, str]]]]: """Create cartesian-product of parameter sweep values for benchmarks""" if not self.config.sweeps: # No sweeps, return single empty combination - return [{}] + return None # Extract parameter names and their values param_names = list(self.config.sweeps.keys()) @@ -108,20 +117,33 @@ def _build_command( # Merge base config with sweep params (sweep params override base) params = base_params if not sweep_params else {**base_params, **sweep_params} + log.debug("Building command-line with params: %s", params) # Add output directory params["output-artifact-dir"] = str(output_dir) + # Use the --verbose CLI option (which changes log.level to debug) to enable more debugging + params["ui_type"] = "simple" if log.level == logging.DEBUG else "none" + # Convert parameters to command line arguments for key, value in params.items(): - item_key = key - - # Convert the `benchmark_seconds` in config file to `benchmark_duration` key - if key == "benchmark_seconds": - item_key = "benchmark_duration" + # If an optional field isn't provided, don't pass that argument to aiperf + if value is None: + continue + + # If `api_key_env_var` is provided, get the value of the env var and add it + # to the command + if key == "api_key_env_var": + api_key = os.environ.get(value) + if not api_key: + raise RuntimeError( + f"Environment variable {value} not set. Please store the API Key in {value}" + ) + cmd.extend([f"--api-key", str(api_key)]) + continue # Convert underscores to hyphens for CLI arguments - arg_name = item_key.replace("_", "-") + arg_name = key.replace("_", "-") # Handle different value types if isinstance(value, bool): @@ -134,6 +156,7 @@ def _build_command( elif value is not None: cmd.extend([f"--{arg_name}", str(value)]) + log.debug("Final command-line: %s", cmd) return cmd @staticmethod @@ -145,6 +168,7 @@ def _create_output_dir( # Early-out if we're not sweeping anything if not sweep_params: + base_dir.mkdir(parents=True, exist_ok=True) return base_dir param_parts = [f"{key}{value}" for key, value in sorted(sweep_params.items())] @@ -157,7 +181,7 @@ def _create_output_dir( def _save_run_metadata( self, output_dir: Path, - sweep_params: Dict[str, Any], + sweep_params: Optional[Dict[str, Any]], command: List[str], run_index: int, ): @@ -182,12 +206,28 @@ def _save_subprocess_result_json( """Save the subprocess result to the given filename""" process_result_file = output_dir / "process_result.json" - with open(process_result_file, "w", encoding="utf-8") as f: - json.dump(result.__dict__, f, indent=2) + save_data = result.__dict__ - @staticmethod - def _check_service_endpoint(url: str) -> None: + try: + with open(process_result_file, "w", encoding="utf-8") as f: + json.dump(save_data, f, indent=2) + + except (IOError, OSError) as e: + log.error( + f"Could not write %s to file %s: %s", save_data, process_result_file, e + ) + raise + + except TypeError as e: + log.error( + f"Couldn't serialize %s to %s: %s", save_data, process_result_file, e + ) + raise + + def _check_service(self, endpoint: Optional[str] = "/v1/models") -> None: """Check if the service is up before we run the benchmarks""" + url = urllib.parse.urljoin(self.config.base_config.url, endpoint) + log.debug("Checking service is up using endpoint %s", url) try: response = httpx.get(url, timeout=5) @@ -201,81 +241,166 @@ def run(self, dry_run: bool = False) -> int: """Run benchmarks with AIPerf""" # Check the 
service is up before running anything - service_url = urllib.parse.urljoin(self.config.base_config.url, "v1/models") - log.info("Checking service is up using %s", service_url) - self._check_service_endpoint(service_url) + self._check_service() + + # Get the directory under which all benchmarks will store results + batch_dir = self._get_batch_dir() + + log.info("Running AIPerf with configuration: %s", self.config_path) + log.info("Results root directory: %s", batch_dir) + log.info("Sweeping parameters: %s", self.config.sweeps) + benchmark_result: AIPerfSummary = ( + self.run_batch_benchmarks(batch_dir, dry_run) + if self.config.sweeps + else self.run_single_benchmark(batch_dir, dry_run) + ) + + # Log summary + log.info("SUMMARY") + log.info("Total runs : %s", benchmark_result.total) + log.info("Completed : %s", benchmark_result.completed) + log.info("Failed : %s", benchmark_result.failed) + + return 1 if benchmark_result.failed > 0 else 0 + + def _get_batch_dir(self) -> Path: # Get base output directory base_output_dir = self.config.get_output_base_path() + batch_name = self.config.batch_name # Create timestamped batch directory timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - batch_name = self.config.batch_name batch_dir = base_output_dir / batch_name / timestamp + return batch_dir + + def run_single_benchmark( + self, + run_directory: Path, + dry_run: bool, + ) -> AIPerfSummary: + """Run a single benchmark. Return OS exit code.""" + + run_output_dir = self._create_output_dir(run_directory, sweep_params=None) + + log.info("Running AIPerf with configuration: %s", self.config_path) + log.info("Output directory: %s", run_output_dir) + + # Build command + command = self._build_command(sweep_params=None, output_dir=run_output_dir) + + # Save metadata + self._save_run_metadata(run_output_dir, None, command, 0) + + log.info("Single Run") + log.debug("Output directory: %s", run_output_dir) + log.debug("Command: %s", " ".join(command)) + if dry_run: + log.info("Dry-run mode. Commands will not be executed") + return AIPerfSummary(total=0, completed=0, failed=0) + + try: + capture_output = log.level != logging.DEBUG + # Execute the command + result = subprocess.run( + command, + check=True, + capture_output=capture_output, + text=True, + ) + log.info("Run completed successfully") + self._save_subprocess_result_json(run_output_dir, result) + run_completed = 1 if result.returncode == 0 else 0 + return AIPerfSummary( + total=1, completed=run_completed, failed=1 - run_completed + ) + + except subprocess.CalledProcessError as e: + log.error("Run failed with exit code %s", e.returncode) + return AIPerfSummary(total=1, completed=0, failed=1) + + except KeyboardInterrupt: + log.warning("Interrupted by user") + raise + + def run_batch_benchmarks( + self, + run_directory: Path, + dry_run: bool, + ) -> AIPerfSummary: + """Run a batch of benchmarks using sweeps values. 
Return OS exit code.""" # Generate all sweep combinations combinations = self._get_sweep_combinations() + if not combinations: + raise RuntimeError( + f"Can't generate sweep combinations from {self.config.sweeps}" + ) - log.info("Running AIPerf with configuration: %s", self.config_path) - log.info("Batch directory: %s", batch_dir) - log.info("Sweep parameters: %s", combinations) - log.info("Number of runs: %s", len(combinations)) + num_combinations = len(combinations) + log.info("Running %s benchmarks", num_combinations) + # Early-out if it's a dry-run if dry_run: - log.info("DRY RUN MODE - Commands will not be executed") + log.info("Dry-run mode. Commands will not be executed") + return AIPerfSummary(total=0, completed=0, failed=0) + + # If logging isn't set to DEBUG, we'll capture the AIPerf stdout and stderr to a file + capture_output = log.level != logging.DEBUG # Execute each combination - failed_runs = [] + failed_runs = 0 + # Iterate over the sweep combinations, saving out results in separate directories for i, sweep_params in enumerate(combinations): - run_num = i + 1 + run_num = i + 1 # 1-indexed for run status printouts + # Create output directory for this run - run_output_dir = self._create_output_dir(batch_dir, sweep_params) + run_output_dir = self._create_output_dir(run_directory, sweep_params) - # Build command + # Create the command-line for this sweep param command = self._build_command(sweep_params, run_output_dir) - # Save metadata + # Save metadata to reproduce benchmark results later if needed self._save_run_metadata(run_output_dir, sweep_params, command, i) - log.info("Run %s/%s", run_num, len(combinations)) - log.info( - "Parameters: %s", sweep_params if sweep_params else "base config only" - ) + log.info("Run %s/%s", run_num, num_combinations) + log.info("Sweep parameters: %s", sweep_params) log.debug("Output directory: %s", run_output_dir) log.debug("Command: %s", " ".join(command)) - if not dry_run: - try: - # Execute the command - result = subprocess.run( - command, check=True, capture_output=True, text=True - ) - log.info("Run %s completed successfully", run_num) - - self._save_subprocess_result_json(run_output_dir, result) - - except subprocess.CalledProcessError as e: - log.error("Run %s failed with exit code %s", i, e.returncode) - failed_runs.append((i, sweep_params)) - except KeyboardInterrupt: - log.warning("Interrupted by user") - return 130 - - # Log summary - log.info("SUMMARY") - log.info("Total runs: %s", len(combinations)) - log.info("Successful: %s", len(combinations) - len(failed_runs)) - log.info("Failed: %s", len(failed_runs)) - - if failed_runs: - log.warning("Failed runs:") - for run_index, params in failed_runs: - log.warning(" - Run %s: %s", run_index, params) - - log.info("Results stored in: %s", batch_dir) - - return 1 if failed_runs else 0 + try: + # Execute the command + result = subprocess.run( + command, + check=True, + capture_output=capture_output, + text=True, + ) + log.info("Run %s completed successfully", run_num) + + self._save_subprocess_result_json(run_output_dir, result) + if result.returncode != 0: + failed_runs += 1 + + except subprocess.CalledProcessError as e: + log.error( + "Run %s with sweep params %s failed with exit code %s", + i, + sweep_params, + e.returncode, + ) + failed_runs += 1 + + except KeyboardInterrupt: + log.warning("Interrupted by user") + raise + + return AIPerfSummary( + total=num_combinations, + completed=num_combinations - failed_runs, + failed=failed_runs, + ) # Create typer app @@ -301,8 +426,17 @@ def 
run( "--dry-run", help="Print commands without executing them", ), + verbose: bool = typer.Option( + False, + "--verbose", + help="Print additional debugging information during run", + ), ): """Run AIPerf benchmark using the provided YAML config file""" + + if verbose: + log.setLevel(logging.DEBUG) + # Create and run the benchmark runner runner = AIPerfRunner(config_file) exit_code = runner.run(dry_run=dry_run) From c41c09c90696bbf781c05319a170f7f9f8ef7426 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 14 Nov 2025 12:27:40 -0600 Subject: [PATCH 11/30] Add tests for run_aiperf.py --- tests/benchmark/test_run_aiperf.py | 1010 ++++++++++++++++++++++++++++ 1 file changed, 1010 insertions(+) create mode 100644 tests/benchmark/test_run_aiperf.py diff --git a/tests/benchmark/test_run_aiperf.py b/tests/benchmark/test_run_aiperf.py new file mode 100644 index 000000000..22077e590 --- /dev/null +++ b/tests/benchmark/test_run_aiperf.py @@ -0,0 +1,1010 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests for AIPerf run_aiperf module. +""" + +import json +import logging +import subprocess +from pathlib import Path +from typing import Any, Dict, Optional +from unittest.mock import Mock, patch + +import httpx +import pytest +import yaml +from typer.testing import CliRunner + +from nemoguardrails.benchmark.aiperf.aiperf_models import AIPerfConfig, BaseConfig +from nemoguardrails.benchmark.aiperf.run_aiperf import AIPerfRunner, AIPerfSummary + + +@pytest.fixture +def create_config_data(): + """Returns a function with sample basic config, and allows mutation of fields to cover + more cases or add extra fields""" + + def _create_config( + batch_name="test_batch", + output_base_dir="test_output", + model="test-model", + tokenizer="test-tokenizer", + url="http://localhost:8000", + warmup_request_count=10, + benchmark_duration=60, + concurrency=5, + sweeps=None, + **extra_base_config, + ): + base_config = { + "model": model, + "tokenizer": tokenizer, + "url": url, + "warmup_request_count": warmup_request_count, + "benchmark_duration": benchmark_duration, + "concurrency": concurrency, + } + + config_data = { + "batch_name": batch_name, + "output_base_dir": output_base_dir, + "base_config": base_config, + } + + # Add sweeps if provided + if sweeps: + config_data["sweeps"] = sweeps + + # Merge any extra base_config parameters + if extra_base_config: + base_config.update(extra_base_config) + + return config_data + + return _create_config + + +@pytest.fixture +def create_config_file(tmp_path, create_config_data): + """Fixture to write config data to a file and return the path.""" + + def _write_config_file( + extra_base_config: Optional[Dict[str, Any]] = None, + filename: Optional[str] = "config.yml", + sweeps: Optional[Dict[str, Any]] = None, + ) -> Path: + """Apply extra base config to config data, write to file 
and return the path.""" + + # Unpack extra_base_config as kwargs if provided + if extra_base_config: + config_data = create_config_data(sweeps=sweeps, **extra_base_config) + else: + config_data = create_config_data(sweeps=sweeps) + + config_file = tmp_path / filename + config_file.write_text(yaml.dump(config_data)) + return config_file + + return _write_config_file + + +class TestAIPerfSummary: + """Test the AIPerfSummary dataclass.""" + + def test_aiperf_summary_creation(self): + """Test creating an AIPerfSummary instance.""" + summary = AIPerfSummary(total=10, completed=8, failed=2) + assert summary.total == 10 + assert summary.completed == 8 + assert summary.failed == 2 + + +class TestAIPerfRunnerInit: + """Test AIPerfRunner initialization and config loading.""" + + def test_init_with_valid_config(self, create_config_file): + """Test initialization with a valid config file.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + + assert runner.config_path == config_file + assert isinstance(runner.config, AIPerfConfig) + assert runner.config.batch_name == "test_batch" + assert runner.config.output_base_dir == "test_output" + assert runner.config.base_config.model == "test-model" + assert runner.config.base_config.tokenizer == "test-tokenizer" + assert runner.config.base_config.url == "http://localhost:8000" + assert runner.config.base_config.warmup_request_count == 10 + assert runner.config.base_config.benchmark_duration == 60 + assert runner.config.base_config.concurrency == 5 + assert runner.config.sweeps is None + + def test_init_with_nonexistent_config(self, tmp_path): + """Test initialization with a nonexistent config file.""" + config_file = tmp_path / "nonexistent.yaml" + + with pytest.raises(SystemExit): + AIPerfRunner(config_file) + + def test_init_with_invalid_yaml(self, tmp_path): + """Test initialization with invalid YAML syntax.""" + config_file = tmp_path / "invalid.yaml" + config_file.write_text("invalid: yaml: syntax: [") + + with pytest.raises(SystemExit): + AIPerfRunner(config_file) + + def test_init_with_validation_error(self, tmp_path): + """Test initialization with config that fails Pydantic validation.""" + + config_file = tmp_path / "invalid.yaml" + config_file.write_text( + yaml.dump( + { + "batch_name": "test_batch", + "base_config": { + "model": "test-model", + # Missing required fields + }, + } + ) + ) + + with pytest.raises(SystemExit): + AIPerfRunner(config_file) + + def test_init_with_unexpected_error(self, create_config_file): + """Test initialization with an unexpected error.""" + config_file = create_config_file() + + # Mock yaml.safe_load to raise an unexpected exception + with patch("yaml.safe_load", side_effect=RuntimeError("Unexpected error")): + with pytest.raises(SystemExit): + AIPerfRunner(config_file) + + +class TestGetSweepCombinations: + """Test the _get_sweep_combinations method.""" + + def test_no_sweeps_returns_none(self, create_config_file): + """Test that no sweeps returns None.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + combinations = runner._get_sweep_combinations() + + assert combinations is None + + def test_single_sweep_parameter(self, create_config_file): + """Test sweep with a single parameter.""" + config_file = create_config_file(sweeps={"concurrency": [1, 2, 4]}) + + runner = AIPerfRunner(config_file) + combinations = runner._get_sweep_combinations() + + assert len(combinations) == 3 + assert combinations == [ + {"concurrency": 1}, + {"concurrency": 2}, + 
{"concurrency": 4}, + ] + + def test_multiple_sweep_parameters(self, create_config_file): + """Test sweep with multiple parameters (Cartesian product).""" + config_file = create_config_file( + sweeps={ + "concurrency": [1, 2], + "benchmark_duration": [30, 60], + } + ) + + runner = AIPerfRunner(config_file) + combinations = runner._get_sweep_combinations() + + assert len(combinations) == 4 + assert {"concurrency": 1, "benchmark_duration": 30} in combinations + assert {"concurrency": 1, "benchmark_duration": 60} in combinations + assert {"concurrency": 2, "benchmark_duration": 30} in combinations + assert {"concurrency": 2, "benchmark_duration": 60} in combinations + + +class TestBuildCommand: + """Test the _build_command method.""" + + def test_build_command_basic(self, create_config_file, tmp_path): + """Test building a basic command.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + cmd = runner._build_command(None, output_dir) + + assert cmd[0] == "aiperf" + assert cmd[1] == "profile" + assert "--model" in cmd + assert "test-model" in cmd + assert "--url" in cmd + assert "http://localhost:8000" in cmd + assert "--output-artifact-dir" in cmd + assert str(output_dir) in cmd + + def test_build_command_with_sweep_params(self, create_config_file, tmp_path): + """Test building command with sweep parameters that override base config.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + sweep_params = {"concurrency": 10, "benchmark_duration": 30} + cmd = runner._build_command(sweep_params, output_dir) + + assert "--concurrency" in cmd + concurrency_idx = cmd.index("--concurrency") + assert cmd[concurrency_idx + 1] == "10" + + assert "--benchmark-duration" in cmd + duration_idx = cmd.index("--benchmark-duration") + assert cmd[duration_idx + 1] == "30" + + def test_build_command_with_api_key_env_var( + self, create_config_file, tmp_path, monkeypatch + ): + """Test building command with API key from environment variable.""" + config_file = create_config_file( + extra_base_config={"api_key_env_var": "TEST_API_KEY"} + ) + + # Set the environment variable + monkeypatch.setenv("TEST_API_KEY", "secret-key-123") + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + cmd = runner._build_command(None, output_dir) + + assert "--api-key" in cmd + api_key_idx = cmd.index("--api-key") + assert cmd[api_key_idx + 1] == "secret-key-123" + + def test_build_command_with_missing_api_key_env_var( + self, create_config_file, tmp_path + ): + """Test building command when API key environment variable is not set.""" + config_file = create_config_file( + extra_base_config={"api_key_env_var": "MISSING_API_KEY"} + ) + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + + with pytest.raises( + RuntimeError, match="Environment variable MISSING_API_KEY not set" + ): + runner._build_command(None, output_dir) + + def test_build_command_with_streaming_true(self, create_config_file, tmp_path): + """Test building command with streaming enabled""" + config_file = create_config_file(extra_base_config={"streaming": True}) + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + cmd = runner._build_command(None, output_dir) + + assert "--streaming" in cmd + + def test_build_command_with_streaming_false(self, create_config_file, tmp_path): + """Test building command with boolean False value (should not be in command).""" + config_file = 
create_config_file(extra_base_config={"streaming": False}) + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + cmd = runner._build_command(None, output_dir) + + assert "--streaming" not in cmd + + def test_build_command_default_streaming(self, create_config_file, tmp_path): + """Test building command with streaming default of False""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + cmd = runner._build_command(None, output_dir) + + assert "--streaming" not in cmd + + def test_build_command_default_api_key(self, create_config_file, tmp_path): + """Test building command with None values (should be skipped).""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + cmd = runner._build_command(None, output_dir) + + # Optional fields with None should not appear + assert "--api-key-env-var" not in cmd + + def test_build_command_ui_type_debug(self, create_config_file, tmp_path): + """Test that ui_type is 'simple' when log level is DEBUG.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + + # Patch log.level to be DEBUG + with patch( + "nemoguardrails.benchmark.aiperf.run_aiperf.log.level", logging.DEBUG + ): + cmd = runner._build_command(None, output_dir) + + assert "--ui-type" in cmd + ui_type_idx = cmd.index("--ui-type") + assert cmd[ui_type_idx + 1] == "simple" + + def test_build_command_ui_type_non_debug(self, create_config_file, tmp_path): + """Test that ui_type is 'none' when log level is not DEBUG.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + + # Patch log.level to be INFO + with patch( + "nemoguardrails.benchmark.aiperf.run_aiperf.log.level", logging.INFO + ): + cmd = runner._build_command(None, output_dir) + + assert "--ui-type" in cmd + ui_type_idx = cmd.index("--ui-type") + assert cmd[ui_type_idx + 1] == "none" + + def test_build_command_with_list_in_sweep_params( + self, create_config_file, tmp_path + ): + """Test building command when sweep params contain list values.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + + # Patch model_dump method at the class level to return a list value + original_model_dump = BaseConfig.model_dump + + def mock_model_dump(self): + result = original_model_dump(self) + result["extra_param"] = ["value1", "value2"] + return result + + with patch.object(BaseConfig, "model_dump", mock_model_dump): + cmd = runner._build_command(None, output_dir) + + # List values should appear multiple times in the command + assert "--extra-param" in cmd + assert cmd.count("--extra-param") == 2 + value1_idx = cmd.index("value1") + value2_idx = cmd.index("value2") + assert value1_idx > 0 + assert value2_idx > 0 + + +class TestCreateOutputDir: + """Test the _create_output_dir static method.""" + + def test_create_output_dir_no_sweep(self, tmp_path): + """Test creating output directory without sweep parameters.""" + base_dir = tmp_path / "output" + result = AIPerfRunner._create_output_dir(base_dir, None) + + assert result == base_dir + assert result.exists() + assert result.is_dir() + + def test_create_output_dir_with_sweep(self, tmp_path): + """Test creating output directory with sweep parameters.""" + base_dir = tmp_path / "output" + sweep_params = {"concurrency": 10, "benchmark_duration": 30} + result = 
AIPerfRunner._create_output_dir(base_dir, sweep_params) + + # Directory should contain sweep parameter values + assert str(result) == f"{base_dir}/benchmark_duration30_concurrency10" + assert result.exists() + assert result.is_dir() + + def test_create_output_dir_creates_parent(self, tmp_path): + """Test that parent directories are created if they don't exist.""" + base_dir = tmp_path / "parent" / "child" / "output" + result = AIPerfRunner._create_output_dir(base_dir, None) + + assert result.exists() + assert result.is_dir() + + +class TestSaveRunMetadata: + """Test the _save_run_metadata method.""" + + def test_save_run_metadata_without_sweep(self, create_config_file, tmp_path): + """Test saving run metadata without sweep parameters.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + output_dir.mkdir() + + command = ["aiperf", "profile", "--model", "test-model"] + runner._save_run_metadata(output_dir, None, command, 0) + + metadata_file = output_dir / "run_metadata.json" + assert metadata_file.exists() + + with open(metadata_file) as f: + metadata = json.load(f) + + assert metadata["run_index"] == 0 + assert metadata["config_file"] == str(config_file) + assert metadata["sweep_params"] is None + assert metadata["command"] == " ".join(command) + assert "timestamp" in metadata + assert "base_config" in metadata + + def test_save_run_metadata_with_sweep(self, create_config_file, tmp_path): + """Test saving run metadata with sweep parameters.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + output_dir = tmp_path / "output" + output_dir.mkdir() + + sweep_params = {"concurrency": 10} + command = ["aiperf", "profile", "--concurrency", "10"] + runner._save_run_metadata(output_dir, sweep_params, command, 1) + + metadata_file = output_dir / "run_metadata.json" + assert metadata_file.exists() + + with open(metadata_file) as f: + metadata = json.load(f) + + assert metadata["run_index"] == 1 + assert metadata["sweep_params"] == sweep_params + + +class TestSaveSubprocessResultJson: + """Test the _save_subprocess_result_json static method.""" + + def test_save_subprocess_result_success(self, tmp_path): + """Test saving successful subprocess result.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + + # Create a mock CompletedProcess + result = subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=0, + stdout="Success output", + stderr="", + ) + + AIPerfRunner._save_subprocess_result_json(output_dir, result) + + process_result_file = output_dir / "process_result.json" + assert process_result_file.exists() + + with open(process_result_file) as f: + saved_data = json.load(f) + + assert saved_data["returncode"] == 0 + assert saved_data["stdout"] == "Success output" + + def test_save_subprocess_result_failure(self, tmp_path): + """Test saving failed subprocess result.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + + result = subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=1, + stdout="", + stderr="Error message", + ) + + AIPerfRunner._save_subprocess_result_json(output_dir, result) + + process_result_file = output_dir / "process_result.json" + assert process_result_file.exists() + + with open(process_result_file) as f: + saved_data = json.load(f) + + assert saved_data["returncode"] == 1 + assert saved_data["stderr"] == "Error message" + + def test_save_subprocess_result_io_error(self, tmp_path): + """Test saving subprocess result when IOError 
occurs.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + + result = subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=0, + stdout="Success", + stderr="", + ) + + # Mock open to raise IOError + with patch("builtins.open", side_effect=IOError("Disk full")): + with pytest.raises(IOError): + AIPerfRunner._save_subprocess_result_json(output_dir, result) + + def test_save_subprocess_result_type_error(self, tmp_path): + """Test saving subprocess result when TypeError occurs during serialization.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + + result = subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=0, + stdout="Success", + stderr="", + ) + + # Mock json.dump to raise TypeError + with patch("json.dump", side_effect=TypeError("Cannot serialize")): + with pytest.raises(TypeError): + AIPerfRunner._save_subprocess_result_json(output_dir, result) + + +class TestCheckService: + """Test the _check_service method.""" + + def test_check_service_success(self, create_config_file): + """Test checking service when it's available.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + + # Mock httpx.get to return success + with patch("httpx.get") as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_get.return_value = mock_response + + # Should not raise any exception + runner._check_service() + + def test_check_service_connect_error(self, create_config_file): + """Test checking service when connection fails.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + + # Mock httpx.get to raise ConnectError + with patch("httpx.get", side_effect=httpx.ConnectError("Connection refused")): + with pytest.raises(RuntimeError, match="Can't connect to"): + runner._check_service() + + def test_check_service_non_200_response(self, create_config_file): + """Test checking service when it returns non-200 status.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + + # Mock httpx.get to return 404 + with patch("httpx.get") as mock_get: + mock_response = Mock() + mock_response.status_code = 404 + mock_get.return_value = mock_response + + with pytest.raises(RuntimeError, match="Can't access"): + runner._check_service() + + def test_check_service_custom_endpoint(self, create_config_file): + """Test checking service with custom endpoint.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + + # Mock httpx.get + with patch("httpx.get") as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_get.return_value = mock_response + + runner._check_service("/custom/endpoint") + + # Verify the URL was constructed correctly + mock_get.assert_called_once() + call_args = mock_get.call_args + assert "/custom/endpoint" in call_args[0][0] + + +class TestGetBatchDir: + """Test the _get_batch_dir method.""" + + def test_get_batch_dir(self, create_config_file, tmp_path): + """Test getting the batch directory with timestamp.""" + config_file = create_config_file( + extra_base_config={ + "batch_name": "test_batch", + "output_base_dir": str(tmp_path / "output"), + } + ) + + runner = AIPerfRunner(config_file) + batch_dir = runner._get_batch_dir() + + # Check that the path contains the expected components + assert "test_batch" in str(batch_dir) + assert str(tmp_path / "output") in str(batch_dir) + # Check that there's a timestamp-like pattern (YYYYMMDD_HHMMSS) + assert len(batch_dir.name) == 15 # Timestamp format + + +class 
TestRunSingleBenchmark: + """Test the run_single_benchmark method.""" + + def test_run_single_benchmark_success(self, create_config_file, tmp_path): + """Test running a single benchmark successfully.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + run_directory = tmp_path / "runs" + + # Mock subprocess.run to return success + with patch("subprocess.run") as mock_run: + mock_result = subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=0, + stdout="Success", + stderr="", + ) + mock_run.return_value = mock_result + + summary = runner.run_single_benchmark(run_directory, dry_run=False) + + assert summary.total == 1 + assert summary.completed == 1 + assert summary.failed == 0 + + def test_run_single_benchmark_dry_run(self, create_config_file, tmp_path): + """Test running a single benchmark in dry-run mode.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + run_directory = tmp_path / "runs" + + summary = runner.run_single_benchmark(run_directory, dry_run=True) + + assert summary.total == 0 + assert summary.completed == 0 + assert summary.failed == 0 + + def test_run_single_benchmark_failure(self, create_config_file, tmp_path): + """Test running a single benchmark that fails.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + run_directory = tmp_path / "runs" + + # Mock subprocess.run to raise CalledProcessError + with patch( + "subprocess.run", side_effect=subprocess.CalledProcessError(1, "aiperf") + ): + summary = runner.run_single_benchmark(run_directory, dry_run=False) + + assert summary.total == 1 + assert summary.completed == 0 + assert summary.failed == 1 + + def test_run_single_benchmark_keyboard_interrupt( + self, create_config_file, tmp_path + ): + """Test that KeyboardInterrupt is re-raised.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + run_directory = tmp_path / "runs" + + # Mock subprocess.run to raise KeyboardInterrupt + with patch("subprocess.run", side_effect=KeyboardInterrupt): + with pytest.raises(KeyboardInterrupt): + runner.run_single_benchmark(run_directory, dry_run=False) + + +class TestRunBatchBenchmarks: + """Test the run_batch_benchmarks method.""" + + def test_run_batch_benchmarks_success(self, create_config_file, tmp_path): + """Test running batch benchmarks successfully.""" + config_file = create_config_file(sweeps={"concurrency": [1, 2]}) + + runner = AIPerfRunner(config_file) + run_directory = tmp_path / "runs" + + # Mock subprocess.run to return success first, then failure + with patch("subprocess.run") as mock_run: + mock_run.side_effect = [ + subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=0, + stdout="Success", + stderr="", + ), + subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=1, + stdout="", + stderr="Error", + ), + ] + + summary = runner.run_batch_benchmarks(run_directory, dry_run=False) + + assert summary.total == 2 + assert summary.completed == 1 + assert summary.failed == 1 + assert mock_run.call_count == 2 + + def test_run_batch_benchmarks_dry_run(self, create_config_file, tmp_path): + """Test running batch benchmarks in dry-run mode.""" + config_file = create_config_file(sweeps={"concurrency": [1, 2]}) + + runner = AIPerfRunner(config_file) + run_directory = tmp_path / "runs" + + summary = runner.run_batch_benchmarks(run_directory, dry_run=True) + + assert summary.total == 0 + assert summary.completed == 0 + assert summary.failed == 0 + + def 
test_run_batch_benchmarks_partial_failure(self, create_config_file, tmp_path): + """Test running batch benchmarks with some failures.""" + config_file = create_config_file(sweeps={"concurrency": [1, 2, 4]}) + + runner = AIPerfRunner(config_file) + run_directory = tmp_path / "runs" + + # Mock subprocess.run to fail on second call + call_count = 0 + + def side_effect(*args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 2: + raise subprocess.CalledProcessError(1, "aiperf") + return subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=0, + stdout="Success", + stderr="", + ) + + with patch("subprocess.run", side_effect=side_effect): + summary = runner.run_batch_benchmarks(run_directory, dry_run=False) + + assert summary.total == 3 + assert summary.completed == 2 + assert summary.failed == 1 + + def test_run_batch_benchmarks_no_combinations(self, create_config_file, tmp_path): + """Test running batch benchmarks with no sweep combinations raises error.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + # Manually set sweeps to empty dict to trigger error + runner.config.sweeps = {} + + run_directory = tmp_path / "runs" + + with pytest.raises(RuntimeError, match="Can't generate sweep combinations"): + runner.run_batch_benchmarks(run_directory, dry_run=False) + + def test_run_batch_benchmarks_keyboard_interrupt( + self, create_config_file, tmp_path + ): + """Test that KeyboardInterrupt is re-raised in batch benchmarks.""" + config_file = create_config_file(sweeps={"concurrency": [1, 2]}) + + runner = AIPerfRunner(config_file) + run_directory = tmp_path / "runs" + + # Mock subprocess.run to raise KeyboardInterrupt on first call + with patch("subprocess.run", side_effect=KeyboardInterrupt): + with pytest.raises(KeyboardInterrupt): + runner.run_batch_benchmarks(run_directory, dry_run=False) + + def test_run_batch_benchmarks_non_zero_returncode( + self, create_config_file, tmp_path + ): + """Test running batch benchmarks when subprocess returns non-zero but doesn't raise.""" + config_file = create_config_file(sweeps={"concurrency": [1, 2]}) + + runner = AIPerfRunner(config_file) + run_directory = tmp_path / "runs" + + # Mock subprocess.run to return non-zero returncode without raising + with patch("subprocess.run") as mock_run: + mock_result = subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=1, # Non-zero return code + stdout="", + stderr="Error", + ) + mock_run.return_value = mock_result + + summary = runner.run_batch_benchmarks(run_directory, dry_run=False) + + assert summary.total == 2 + assert summary.completed == 0 + assert summary.failed == 2 + + +class TestRun: + """Test the main run method.""" + + def test_run_single_benchmark(self, create_config_file): + """Test main run method with single benchmark (no sweeps).""" + config_file = create_config_file() + runner = AIPerfRunner(config_file) + + # Mock _check_service and subprocess.run + with patch.object(runner, "_check_service"): + with patch("subprocess.run") as mock_run: + mock_result = subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=0, + stdout="Success", + stderr="", + ) + mock_run.return_value = mock_result + + exit_code = runner.run(dry_run=False) + + assert exit_code == 0 + + def test_run_batch_benchmarks(self, create_config_file): + """Test main run method with batch benchmarks (with sweeps).""" + config_file = create_config_file(sweeps={"concurrency": [1, 2]}) + runner = AIPerfRunner(config_file) + + # Mock 
_check_service and subprocess.run + with patch.object(runner, "_check_service"): + with patch("subprocess.run") as mock_run: + mock_result = subprocess.CompletedProcess( + args=["aiperf", "profile"], + returncode=0, + stdout="Success", + stderr="", + ) + mock_run.return_value = mock_result + + exit_code = runner.run(dry_run=False) + + assert exit_code == 0 + assert mock_run.call_count == 2 + + def test_run_with_failures(self, create_config_file): + """Test main run method returns non-zero exit code on failures.""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + + # Mock _check_service and subprocess.run to fail + with patch.object(runner, "_check_service"): + with patch( + "subprocess.run", side_effect=subprocess.CalledProcessError(1, "aiperf") + ): + exit_code = runner.run(dry_run=False) + assert exit_code == 1 + + def test_run_service_check_failure(self, create_config_file): + """Test that service check failure raises error.""" + config_file = create_config_file() + runner = AIPerfRunner(config_file) + + # Mock _check_service to raise error + with patch.object( + runner, "_check_service", side_effect=RuntimeError("Service unavailable") + ): + with pytest.raises(RuntimeError, match="Service unavailable"): + runner.run(dry_run=False) + + +class TestCLICommand: + """Test the CLI command function.""" + + def test_cli_run_command_basic(self, create_config_file): + """Test CLI run command with basic options.""" + config_file = create_config_file() + runner = CliRunner() + + from nemoguardrails.benchmark.aiperf.run_aiperf import app + + # Mock the runner and service check + with patch( + "nemoguardrails.benchmark.aiperf.run_aiperf.AIPerfRunner" + ) as mock_runner_class: + mock_runner = Mock() + mock_runner.run.return_value = 0 + mock_runner_class.return_value = mock_runner + + result = runner.invoke(app, ["--config-file", str(config_file)]) + + assert result.exit_code == 0 + mock_runner.run.assert_called_once_with(dry_run=False) + + def test_cli_run_command_with_verbose(self, create_config_file): + """Test CLI run command with verbose flag.""" + config_file = create_config_file() + runner = CliRunner() + + from nemoguardrails.benchmark.aiperf.run_aiperf import app + + # Mock the runner and service check + with patch( + "nemoguardrails.benchmark.aiperf.run_aiperf.AIPerfRunner" + ) as mock_runner_class: + mock_runner = Mock() + mock_runner.run.return_value = 0 + mock_runner_class.return_value = mock_runner + + result = runner.invoke( + app, ["--config-file", str(config_file), "--verbose"] + ) + + assert result.exit_code == 0 + mock_runner.run.assert_called_once_with(dry_run=False) + + def test_cli_run_command_with_dry_run(self, create_config_file): + """Test CLI run command with dry-run flag.""" + config_file = create_config_file() + runner = CliRunner() + + from nemoguardrails.benchmark.aiperf.run_aiperf import app + + # Mock the runner and service check + with patch( + "nemoguardrails.benchmark.aiperf.run_aiperf.AIPerfRunner" + ) as mock_runner_class: + mock_runner = Mock() + mock_runner.run.return_value = 0 + mock_runner_class.return_value = mock_runner + + result = runner.invoke( + app, ["--config-file", str(config_file), "--dry-run"] + ) + + assert result.exit_code == 0 + mock_runner.run.assert_called_once_with(dry_run=True) + + def test_cli_run_command_with_failure(self, create_config_file): + """Test CLI run command when benchmark fails.""" + config_file = create_config_file() + runner = CliRunner() + + from nemoguardrails.benchmark.aiperf.run_aiperf import 
app + + # Mock the runner to return failure + with patch( + "nemoguardrails.benchmark.aiperf.run_aiperf.AIPerfRunner" + ) as mock_runner_class: + mock_runner = Mock() + mock_runner.run.return_value = 1 # Failure + mock_runner_class.return_value = mock_runner + + result = runner.invoke(app, ["--config-file", str(config_file)]) + + assert result.exit_code == 1 + mock_runner.run.assert_called_once_with(dry_run=False) From 5b1a00377d443cc948f30cd5a414cd86e9226ba9 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 14 Nov 2025 13:36:33 -0600 Subject: [PATCH 12/30] Revert changes to llm/providers/huggingface/streamers.py --- nemoguardrails/llm/providers/huggingface/streamers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemoguardrails/llm/providers/huggingface/streamers.py b/nemoguardrails/llm/providers/huggingface/streamers.py index 4afb44ac4..14c406124 100644 --- a/nemoguardrails/llm/providers/huggingface/streamers.py +++ b/nemoguardrails/llm/providers/huggingface/streamers.py @@ -18,8 +18,8 @@ TRANSFORMERS_AVAILABLE = True try: - from transformers.generation.streamers import ( - TextStreamer, # type: ignore[import-untyped] + from transformers.generation.streamers import ( # type: ignore[import-untyped] + TextStreamer, ) except ImportError: # Fallback if transformers is not available From b45e3e8cc8ac25c39f2593b52daa086e09881e48 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 14 Nov 2025 13:39:55 -0600 Subject: [PATCH 13/30] Address greptile feedback --- .../benchmark/aiperf/aiperf_configs/single_concurrency.yaml | 2 +- nemoguardrails/benchmark/aiperf/run_aiperf.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml b/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml index 7b7c8fd08..ce8227795 100644 --- a/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml +++ b/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml @@ -1,4 +1,4 @@ -# Concurrency sweep. 
One-minute tests at log-spaced concurrencies +# Single-run AIPerf benchmark configuration # Name for this batch of benchmarks (will be part of output directory name) batch_name: single_concurrency diff --git a/nemoguardrails/benchmark/aiperf/run_aiperf.py b/nemoguardrails/benchmark/aiperf/run_aiperf.py index 3cdd80314..3ca11bea0 100755 --- a/nemoguardrails/benchmark/aiperf/run_aiperf.py +++ b/nemoguardrails/benchmark/aiperf/run_aiperf.py @@ -214,13 +214,13 @@ def _save_subprocess_result_json( except (IOError, OSError) as e: log.error( - f"Could not write %s to file %s: %s", save_data, process_result_file, e + "Could not write %s to file %s: %s", save_data, process_result_file, e ) raise except TypeError as e: log.error( - f"Couldn't serialize %s to %s: %s", save_data, process_result_file, e + "Couldn't serialize %s to %s: %s", save_data, process_result_file, e ) raise From d0460260f044a09520885ecc325049b4d32522d0 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 14 Nov 2025 14:56:30 -0600 Subject: [PATCH 14/30] Add README for AIPerf scripts --- nemoguardrails/benchmark/aiperf/README.md | 367 ++++++++++++++++++++++ 1 file changed, 367 insertions(+) create mode 100644 nemoguardrails/benchmark/aiperf/README.md diff --git a/nemoguardrails/benchmark/aiperf/README.md b/nemoguardrails/benchmark/aiperf/README.md new file mode 100644 index 000000000..338b0432d --- /dev/null +++ b/nemoguardrails/benchmark/aiperf/README.md @@ -0,0 +1,367 @@ +# AIPerf Benchmarking for NeMo Guardrails + +## Introduction + +[AIPerf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) is NVIDIA's latest benchmarking tool for LLMs. It supports any OpenAI-compatible inference service and generates synthetic data loads, benchmarks, and all the metrics needed for performance comparison and analysis. + +The [`run_aiperf.py`](run_aiperf.py) script enhances AIPerf's capabilities by providing: + +- **Batch Execution**: Run multiple benchmarks in sequence with a single command +- **Parameter Sweeps**: Automatically generate and run benchmarks across different parameter combinations (e.g., sweeping concurrency levels, token counts, etc.) +- **Organized Results**: Automatically organizes benchmark results in timestamped directories with clear naming conventions +- **YAML Configuration**: Simple, declarative configuration files for reproducible benchmark runs +- **Run Metadata**: Saves complete metadata about each run (configuration, command, timestamp) for future analysis and reproduction +- **Service Health Checks**: Validates that the target service is available before starting benchmarks + +Instead of manually running AIPerf multiple times with different parameters, you can define a sweep in a YAML file and let the script handle the rest. + +## Getting Started + +### Prerequisites + +These steps have been tested with Python 3.11.11. + +1. **Install NeMo Guardrails with developer tooling:** + + ```bash + poetry install --with dev + ``` + +2. **Install AIPerf and NVIDIA AI Endpoints:** + + ```bash + poetry run pip install aiperf langchain-nvidia-ai-endpoints + ``` + +3. **[Optional] Install/upgrade Hugging Face Hub:** + + AIPerf needs a tokenizer to run and will download one from Hugging Face if available. If you have the tokenizer locally, you can point to that directory and not log into Huggingface. + + ```bash + pip install --upgrade huggingface_hub + ``` + +4. **[Optional] Login to Hugging Face:** + + ```bash + huggingface-cli login + ``` + +5. 
**[Optional] Set NVIDIA API Key:** + + To use models hosted on [build.nvidia.com](https://build.nvidia.com/), set your API key: + + ```bash + export NVIDIA_API_KEY="your-api-key-here" + ``` + +## Running Benchmarks + +Each benchmark is configured using the `AIPerfConfig` Pydantic model in [aiperf_models.py](aiperf_models.py). +The configs are stored in YAML files, and converted to an `AIPerfConfig` object. +There are two example configs included which can be extended for your use-cases. These both use Nvidia-hosted models, : + +- [`single_concurrency.yaml`](aiperf_configs/single_concurrency.yaml): Example single-run benchmark with a single concurrency value. +- [`sweep_concurrency.yaml`](aiperf_configs/sweep_concurrency.yaml): Example multiple-run benchmark to sweep concurency values and run a new benchmark for each. + +To run a benchmark, use the following command: + +```bash +poetry run nemoguardrails aiperf run --config-file +``` + +### Running a Single Benchmark + +To run a single benchmark with fixed parameters, use the `single_concurrency.yaml` configuration: + +```bash +poetry run nemoguardrails aiperf run --config-file nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml +``` + +**Example output:** + +```text +2025-11-14 13:58:21 INFO: Running AIPerf with configuration: nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml +2025-11-14 13:58:21 INFO: Results root directory: aiperf_results/single_concurrency/20251114_135821 +2025-11-14 13:58:21 INFO: Sweeping parameters: None +2025-11-14 13:58:21 INFO: Running AIPerf with configuration: nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml +2025-11-14 13:58:21 INFO: Output directory: aiperf_results/single_concurrency/20251114_135821 +2025-11-14 13:58:21 INFO: Single Run +2025-11-14 13:59:58 INFO: Run completed successfully +2025-11-14 13:59:58 INFO: SUMMARY +2025-11-14 13:59:58 INFO: Total runs : 1 +2025-11-14 13:59:58 INFO: Completed : 1 +2025-11-14 13:59:58 INFO: Failed : 0 +``` + +### Running a Concurrency Sweep + +To run multiple benchmarks with different concurrency levels, use the `sweep_concurrency.yaml` configuration: + +```bash +poetry run nemoguardrails aiperf run --config-file nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml +``` + +**Example output:** + +```text +2025-11-14 14:02:54 INFO: Running AIPerf with configuration: nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml +2025-11-14 14:02:54 INFO: Results root directory: aiperf_results/sweep_concurrency/20251114_140254 +2025-11-14 14:02:54 INFO: Sweeping parameters: {'concurrency': [1, 2, 4]} +2025-11-14 14:02:54 INFO: Running 3 benchmarks +2025-11-14 14:02:54 INFO: Run 1/3 +2025-11-14 14:02:54 INFO: Sweep parameters: {'concurrency': 1} +2025-11-14 14:04:12 INFO: Run 1 completed successfully +2025-11-14 14:04:12 INFO: Run 2/3 +2025-11-14 14:04:12 INFO: Sweep parameters: {'concurrency': 2} +2025-11-14 14:05:25 INFO: Run 2 completed successfully +2025-11-14 14:05:25 INFO: Run 3/3 +2025-11-14 14:05:25 INFO: Sweep parameters: {'concurrency': 4} +2025-11-14 14:06:38 INFO: Run 3 completed successfully +2025-11-14 14:06:38 INFO: SUMMARY +2025-11-14 14:06:38 INFO: Total runs : 3 +2025-11-14 14:06:38 INFO: Completed : 3 +2025-11-14 14:06:38 INFO: Failed : 0 +``` + +## Additional Options + +### AIPerf run options + +The `--dry-run` option allows you to preview all benchmark commands without executing them. 
This is useful for: + +- Validating your configuration file +- Checking which parameter combinations will be generated +- Estimating total execution time before committing to a long-running sweep +- Debugging configuration issues + +```bash +poetry run nemoguardrails aiperf run --config-file --dry-run +``` + +When in dry-run mode, the script will: + +- Load and validate your configuration +- Check service connectivity +- Generate all sweep combinations +- Display what would be executed +- Exit without running any benchmarks + +### Verbose Mode + +The `--verbose` option outputs more detailed debugging information to understand each step of the benchmarking process. + +```bash +poetry run nemoguardrails aiperf run --config-file --verbose +``` + +Verbose mode provides: + +- Complete command-line arguments passed to AIPerf +- Detailed parameter merging logic (base config + sweep params) +- Output directory creation details +- Real-time AIPerf output (normally captured to files) +- Full stack traces for errors + +**Tip:** Use verbose mode when debugging configuration issues or when you want to see live progress of the benchmark execution. + +## Configuration Files + +Configuration files are YAML files located in [aiperf_configs](aiperf_configs). The configuration is validated using Pydantic models to catch errors early. + +### Top-Level Configuration Fields + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `batch_name` | string | Yes | Name for this batch of benchmarks. Used in output directory naming (e.g., `aiperf_results/batch_name/timestamp/`) | +| `output_base_dir` | string | Yes | Base directory where all benchmark results will be stored | +| `base_config` | object | Yes | Base configuration parameters applied to all benchmark runs (see below) | +| `sweeps` | object | No | Optional parameter sweeps for running multiple benchmarks with different values | + +### Base Configuration Parameters + +The `base_config` section contains parameters that are passed to AIPerf. Any of these can be overridden by sweep parameters. + +#### Model and Service Configuration + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | Model identifier (e.g., `meta/llama-3.3-70b-instruct`) | +| `tokenizer` | string | No | Tokenizer name from Hugging Face or local path. If not provided, AIPerf will attempt to use the model name | +| `url` | string | Yes | Base URL of the inference service (e.g., `https://integrate.api.nvidia.com`) | +| `endpoint` | string | No | API endpoint path (default: `/v1/chat/completions`) | +| `endpoint_type` | string | No | Type of endpoint: `chat` or `completions` (default: `chat`) | +| `api_key_env_var` | string | No | Name of environment variable containing API key (e.g., `NVIDIA_API_KEY`) | +| `streaming` | boolean | No | Whether to use streaming mode (default: `false`) | + +#### Load Generation Settings + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `warmup_request_count` | integer | Yes | Number of warmup requests to send before starting the benchmark | +| `benchmark_duration` | integer | Yes | Duration of the benchmark in seconds | +| `concurrency` | integer | Yes | Number of concurrent requests to maintain during the benchmark | +| `request_rate` | float | No | Target request rate in requests/second. 
If not provided, calculated from concurrency |
+| `request_rate_mode` | string | No | Distribution mode: `constant` or `poisson` (default: `constant`) |
+
+#### Synthetic Data Generation
+
+These parameters control the generation of synthetic prompts for benchmarking:
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `random_seed` | integer | No | Random seed for reproducible synthetic data generation |
+| `prompt_input_tokens_mean` | integer | No | Mean number of input tokens per prompt |
+| `prompt_input_tokens_stddev` | integer | No | Standard deviation of input token count |
+| `prompt_output_tokens_mean` | integer | No | Mean number of expected output tokens |
+| `prompt_output_tokens_stddev` | integer | No | Standard deviation of output token count |
+
+### Parameter Sweeps
+
+The `sweeps` section allows you to run multiple benchmarks with different parameter values. The script generates a **Cartesian product** of all sweep values, running a separate benchmark for each combination.
+
+#### Basic Sweep Example
+
+```yaml
+sweeps:
+  concurrency: [1, 2, 4, 8, 16]
+```
+
+This will run 5 benchmarks, one for each concurrency level.
+
+#### Multi-Parameter Sweep Example
+
+```yaml
+sweeps:
+  concurrency: [1, 4, 16]
+  prompt_input_tokens_mean: [100, 500, 1000]
+```
+
+This will run **9 benchmarks**, one for each combination of `concurrency` and `prompt_input_tokens_mean`.
+
+Each sweep combination creates a subdirectory named with the parameter values:
+
+```text
+aiperf_results/
+└── my_benchmark/
+    └── 20251114_140254/
+        ├── concurrency1_prompt_input_tokens_mean100/
+        ├── concurrency1_prompt_input_tokens_mean500/
+        ├── concurrency4_prompt_input_tokens_mean100/
+        └── ...
+```
+
+### Complete Configuration Example
+
+```yaml
+# Name for this batch of benchmarks
+batch_name: my_benchmark
+
+# Base directory where all benchmark results will be stored
+output_base_dir: aiperf_results
+
+# Base configuration applied to all benchmark runs
+base_config:
+  # Model and service configuration
+  model: meta/llama-3.3-70b-instruct
+  tokenizer: meta-llama/Llama-3.3-70B-Instruct
+  url: "https://integrate.api.nvidia.com"
+  endpoint: "/v1/chat/completions"
+  endpoint_type: chat
+  api_key_env_var: NVIDIA_API_KEY
+  streaming: true
+
+  # Load generation settings
+  warmup_request_count: 20
+  benchmark_duration: 60
+  concurrency: 1
+  request_rate_mode: "constant"
+
+  # Synthetic data generation
+  random_seed: 12345
+  prompt_input_tokens_mean: 100
+  prompt_input_tokens_stddev: 10
+  prompt_output_tokens_mean: 50
+  prompt_output_tokens_stddev: 5
+
+# Optional: parameter sweeps (Cartesian product)
+sweeps:
+  concurrency: [1, 2, 4, 8, 16]
+  prompt_input_tokens_mean: [100, 500, 1000]
+```
+
+### Common Sweep Patterns
+
+#### Concurrency Scaling Test
+
+```yaml
+sweeps:
+  concurrency: [1, 2, 4, 8, 16, 32, 64]
+```
+
+Useful for finding optimal concurrency levels and throughput limits.
+
+#### Token Length Impact Test
+
+```yaml
+sweeps:
+  prompt_input_tokens_mean: [50, 100, 500, 1000, 2000]
+  prompt_output_tokens_mean: [50, 100, 500, 1000]
+```
+
+Useful for understanding how token counts affect latency and throughput.
+
+#### Request Rate Comparison
+
+```yaml
+sweeps:
+  request_rate_mode: ["constant", "poisson"]
+  concurrency: [4, 8, 16]
+```
+
+Useful for comparing different load patterns.
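
If you want to check how many runs a sweep will expand to before launching it, the expansion described under "Parameter Sweeps" above is just a Cartesian product over the `sweeps` lists. The snippet below is a minimal, illustrative sketch of that expansion, not the actual `AIPerfRunner` code; it assumes only the YAML layout documented in this README and that PyYAML is installed.

```python
# Illustrative sketch only: expand the `sweeps` section of a config file
# into one parameter-override dict per benchmark run (Cartesian product).
import itertools
from pathlib import Path

import yaml


def expand_sweeps(config_path: str) -> list[dict]:
    """Return the parameter overrides for each run defined by `sweeps`."""
    config = yaml.safe_load(Path(config_path).read_text())
    sweeps = config.get("sweeps") or {}
    if not sweeps:
        return [{}]  # no sweeps: a single run using only base_config

    names = sorted(sweeps)
    value_lists = (sweeps[name] for name in names)
    return [dict(zip(names, values)) for values in itertools.product(*value_lists)]


if __name__ == "__main__":
    runs = expand_sweeps(
        "nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml"
    )
    print(f"{len(runs)} runs: {runs}")  # e.g. 3 runs: [{'concurrency': 1}, ...]
```

The `--dry-run` flag gives you the same information (plus the full AIPerf command lines) without writing any code.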
+ +## Output Structure + +Results are organized in timestamped directories: + +```text +aiperf_results/ +├── / +│ └── / +│ ├── run_metadata.json # Single run +│ ├── process_result.json +│ └── +│ # OR for sweeps: +│ ├── concurrency1/ +│ │ ├── run_metadata.json +│ │ ├── process_result.json +│ │ └── +│ ├── concurrency2/ +│ │ └── ... +│ └── concurrency4/ +│ └── ... +``` + +### Output Files + +Each run directory contains multiple files with benchmark results and metadata. A summary of these is shown below: + +#### Benchmark runner files + +- **`run_metadata.json`**: Contains complete metadata about the benchmark run for reproducibility. +- **`process_result.json`**: Contains the subprocess execution results. + +#### Files Generated by AIPerf + +- **`inputs.json`**: Synthetic prompt data generated for the benchmark. +- **`profile_export_aiperf.json`**: Main metrics file in JSON format containing aggregated statistics. +- **`profile_export_aiperf.csv`**: Same metrics as the JSON file, but in CSV format for easy import into spreadsheet tools or data analysis libraries. +- **`profile_export.jsonl`**: JSON Lines format file containing per-request metrics. Each line is a complete JSON object for one request with: +- **`logs/aiperf.log`**: Detailed log file from AIPerf execution containing: + +## Resources + +- [AIPerf GitHub Repository](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) +- [AIPerf Documentation](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/client/src/c%2B%2B/perf_analyzer/genai-perf/README.html) +- [NVIDIA API Catalog](https://build.nvidia.com/) From 3c4fe596f7a01c022af4f8c9c607352afb2dacb8 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 17 Nov 2025 08:42:02 -0600 Subject: [PATCH 15/30] Fix hard-coded forward-slash in path name --- tests/benchmark/test_run_aiperf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/test_run_aiperf.py b/tests/benchmark/test_run_aiperf.py index 22077e590..adce85466 100644 --- a/tests/benchmark/test_run_aiperf.py +++ b/tests/benchmark/test_run_aiperf.py @@ -418,7 +418,7 @@ def test_create_output_dir_with_sweep(self, tmp_path): result = AIPerfRunner._create_output_dir(base_dir, sweep_params) # Directory should contain sweep parameter values - assert str(result) == f"{base_dir}/benchmark_duration30_concurrency10" + assert str(result) == base_dir / "benchmark_duration30_concurrency10" assert result.exists() assert result.is_dir() From dab9b27711d3ec53489e1a6a8537d1b95bbdab48 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 17 Nov 2025 08:42:02 -0600 Subject: [PATCH 16/30] Fix hard-coded forward-slash in path name --- tests/benchmark/test_run_aiperf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/test_run_aiperf.py b/tests/benchmark/test_run_aiperf.py index adce85466..b4518d186 100644 --- a/tests/benchmark/test_run_aiperf.py +++ b/tests/benchmark/test_run_aiperf.py @@ -418,7 +418,7 @@ def test_create_output_dir_with_sweep(self, tmp_path): result = AIPerfRunner._create_output_dir(base_dir, sweep_params) # Directory should contain sweep parameter values - assert str(result) == base_dir / "benchmark_duration30_concurrency10" + assert result == base_dir / "benchmark_duration30_concurrency10" assert result.exists() assert result.is_dir() From 253c96dd10c6229cf1929193d143cc343b2a7f93 Mon Sep 17 00:00:00 2001 From: tgasser-nv 
<200644301+tgasser-nv@users.noreply.github.com> Date: Tue, 18 Nov 2025 19:59:12 -0600 Subject: [PATCH 17/30] Fix type: ignore line in huggingface streamers --- nemoguardrails/benchmark/aiperf/run_aiperf.py | 37 ++++- .../content_safety_colang2/config.yml | 18 +++ .../content_safety_colang2/main.co | 5 + .../content_safety_colang2/prompts.yml | 104 ++++++++++++++ .../content_safety_colang2/rails.co | 12 ++ .../llm/providers/huggingface/streamers.py | 4 +- tests/benchmark/test_run_aiperf.py | 128 ++++++++++++++++++ 7 files changed, 303 insertions(+), 5 deletions(-) create mode 100644 nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/config.yml create mode 100644 nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/main.co create mode 100644 nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/prompts.yml create mode 100644 nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/rails.co diff --git a/nemoguardrails/benchmark/aiperf/run_aiperf.py b/nemoguardrails/benchmark/aiperf/run_aiperf.py index 3ca11bea0..245bddfc1 100755 --- a/nemoguardrails/benchmark/aiperf/run_aiperf.py +++ b/nemoguardrails/benchmark/aiperf/run_aiperf.py @@ -104,6 +104,37 @@ def _get_sweep_combinations(self) -> Optional[List[Dict[str, Union[int, str]]]]: return combinations + @staticmethod + def _sanitize_command_for_logging(cmd: List[str]) -> str: + """Convert command list to string with API key redacted. + + Args: + cmd: List of command-line arguments + + Returns: + String representation with --api-key value replaced with + """ + last_n_chars = 6 # Show the last 6 characters + + sanitized = [] + i = 0 + while i < len(cmd): + current = cmd[i] + sanitized.append(current) + + # If this is --api-key, replace the next value with + if current == "--api-key" and i + 1 < len(cmd): + api_key = cmd[i + 1] + len_api_key = len(api_key) + sanitized_api_key = "*" * (len_api_key - last_n_chars) + sanitized_api_key += api_key[-last_n_chars:] + sanitized.append(sanitized_api_key) + i += 2 # Skip the actual API key value + else: + i += 1 + + return " ".join(sanitized) + def _build_command( self, sweep_params: Optional[Dict[str, Union[str, int]]], output_dir: Path ) -> List[str]: @@ -156,7 +187,7 @@ def _build_command( elif value is not None: cmd.extend([f"--{arg_name}", str(value)]) - log.debug("Final command-line: %s", cmd) + log.debug("Final command-line: %s", self._sanitize_command_for_logging(cmd)) return cmd @staticmethod @@ -192,7 +223,7 @@ def _save_run_metadata( "config_file": str(self.config_path), "sweep_params": sweep_params, "base_config": self.config.base_config.model_dump(), - "command": " ".join(command), + "command": self._sanitize_command_for_logging(command), } metadata_file = output_dir / "run_metadata.json" @@ -294,7 +325,7 @@ def run_single_benchmark( log.info("Single Run") log.debug("Output directory: %s", run_output_dir) - log.debug("Command: %s", " ".join(command)) + log.debug("Command: %s", self._sanitize_command_for_logging(command)) if dry_run: log.info("Dry-run mode. 
Commands will not be executed") return AIPerfSummary(total=0, completed=0, failed=0) diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/config.yml b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/config.yml new file mode 100644 index 000000000..cbac1ef01 --- /dev/null +++ b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/config.yml @@ -0,0 +1,18 @@ +colang_version: 2.x +models: + - type: main + engine: nim + model: meta/llama-3.3-70b-instruct + - type: content_safety + engine: nim + model: nvidia/llama-3.1-nemoguard-8b-content-safety + - type: topic_control + engine: nim + model: nvidia/llama-3.1-nemoguard-8b-topic-control + +rails: + config: + jailbreak_detection: + nim_base_url: "https://ai.api.nvidia.com" + nim_server_endpoint: "/v1/security/nvidia/nemoguard-jailbreak-detect" + api_key_env_var: NVIDIA_API_KEY diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/main.co b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/main.co new file mode 100644 index 000000000..e95376eab --- /dev/null +++ b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/main.co @@ -0,0 +1,5 @@ +import core +import llm + +flow main + activate llm continuation diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/prompts.yml b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/prompts.yml new file mode 100644 index 000000000..96c952806 --- /dev/null +++ b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/prompts.yml @@ -0,0 +1,104 @@ +prompts: + - task: topic_safety_check_input $model=topic_control + content: | + You are to act as a customer service agent, providing users with factual information in accordance to the knowledge base. Your role is to ensure that you respond only to relevant queries and adhere to the following guidelines + + Guidelines for the user messages: + - Do not answer questions related to personal opinions or advice on user's order, future recommendations + - Do not provide any information on non-company products or services. + - Do not answer enquiries unrelated to the company policies. + - Do not answer questions asking for personal details about the agent or its creators. + - Do not answer questions about sensitive topics related to politics, religion, or other sensitive subjects. + - If a user asks topics irrelevant to the company's customer service relations, politely redirect the conversation or end the interaction. + - Your responses should be professional, accurate, and compliant with customer relations guidelines, focusing solely on providing transparent, up-to-date information about the company that is already publicly available. + - allow user comments that are related to small talk and chit-chat. + + - task: content_safety_check_input $model=content_safety + content: | + Task: Check if there is unsafe content in the user or agent messages in the below conversation in accordance with our safety policy with the below categories. Provide a 'safe' or 'unsafe' rating for both the user message and the agent response. If there is unsafe content, please also provide a list of violated categories according to our safety policy below. + + + S1: Violence. + S2: Sexual. + S3: Criminal Planning/Confessions. + S4: Guns and Illegal Weapons. + S5: Controlled/Regulated Substances. + S6: Suicide and Self Harm. + S7: Sexual (minor). 
+ S8: Hate/Identity Hate. + S9: PII/Privacy. + S10: Harassment. + S11: Threat. + S12: Profanity. + S13: Needs Caution. + S14: Other. + S15: Manipulation. + S16: Fraud/Deception. + S17: Malware. + S18: High Risk Gov Decision Making. + S19: Political/Misinformation/Conspiracy. + S20: Copyright/Trademark/Plagiarism. + S21: Unauthorized Advice. + S22: Illegal Activity. + S23: Immoral/Unethical. + + + + + user: {{ user_input }} + + + + Provide your safety assessment for the above conversation in the below output JSON format: + {"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} + + Do not include anything other than the output JSON in your response. + Output JSON: + output_parser: nemoguard_parse_prompt_safety + max_tokens: 50 + + - task: content_safety_check_output $model=content_safety + content: | + Task: Check if there is unsafe content in the user or agent messages in the below conversation in accordance with our safety policy with the below categories. Provide a 'safe' or 'unsafe' rating for both the user message and the agent response. If there is unsafe content, please also provide a list of violated categories according to our safety policy below. + + + S1: Violence. + S2: Sexual. + S3: Criminal Planning/Confessions. + S4: Guns and Illegal Weapons. + S5: Controlled/Regulated Substances. + S6: Suicide and Self Harm. + S7: Sexual (minor). + S8: Hate/Identity Hate. + S9: PII/Privacy. + S10: Harassment. + S11: Threat. + S12: Profanity. + S13: Needs Caution. + S14: Other. + S15: Manipulation. + S16: Fraud/Deception. + S17: Malware. + S18: High Risk Gov Decision Making. + S19: Political/Misinformation/Conspiracy. + S20: Copyright/Trademark/Plagiarism. + S21: Unauthorized Advice. + S22: Illegal Activity. + S23: Immoral/Unethical. + + + + + user: {{ user_input }} + + response: agent: {{ bot_response }} + + + + Provide your safety assessment for the above conversation in the below output JSON format: + {"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} + + Do not include anything other than the output JSON in your response. 
+ Output JSON: + output_parser: nemoguard_parse_response_safety + max_tokens: 50 diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/rails.co b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/rails.co new file mode 100644 index 000000000..fb0194958 --- /dev/null +++ b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/rails.co @@ -0,0 +1,12 @@ +import guardrails +import nemoguardrails.library.content_safety +import nemoguardrails.library.topic_safety +import nemoguardrails.library.jailbreak_detection + +flow input rails $input_text + content safety check input $model="content_safety" + topic safety check input $model="topic_control" + jailbreak detection model + +flow output rails $output_text + content safety check output $model="content_safety" diff --git a/nemoguardrails/llm/providers/huggingface/streamers.py b/nemoguardrails/llm/providers/huggingface/streamers.py index 14c406124..4afb44ac4 100644 --- a/nemoguardrails/llm/providers/huggingface/streamers.py +++ b/nemoguardrails/llm/providers/huggingface/streamers.py @@ -18,8 +18,8 @@ TRANSFORMERS_AVAILABLE = True try: - from transformers.generation.streamers import ( # type: ignore[import-untyped] - TextStreamer, + from transformers.generation.streamers import ( + TextStreamer, # type: ignore[import-untyped] ) except ImportError: # Fallback if transformers is not available diff --git a/tests/benchmark/test_run_aiperf.py b/tests/benchmark/test_run_aiperf.py index b4518d186..13fe65297 100644 --- a/tests/benchmark/test_run_aiperf.py +++ b/tests/benchmark/test_run_aiperf.py @@ -223,6 +223,134 @@ def test_multiple_sweep_parameters(self, create_config_file): assert {"concurrency": 2, "benchmark_duration": 60} in combinations +class TestSanitizeCommandForLogging: + """Test the _sanitize_command_for_logging static method.""" + + def test_sanitize_command_with_api_key(self): + """Test sanitizing command with API key showing last 6 chars.""" + cmd = [ + "aiperf", + "profile", + "--model", + "test-model", + "--api-key", + "secret-key-123", + "--url", + "http://localhost:8000", + ] + result = AIPerfRunner._sanitize_command_for_logging(cmd) + + # "secret-key-123" has 14 chars, so 8 asterisks + last 6 chars "ey-123" + assert result == ( + "aiperf profile --model test-model --api-key ********ey-123 " + "--url http://localhost:8000" + ) + + def test_sanitize_command_without_api_key(self): + """Test sanitizing command without API key.""" + cmd = [ + "aiperf", + "profile", + "--model", + "test-model", + "--url", + "http://localhost:8000", + ] + result = AIPerfRunner._sanitize_command_for_logging(cmd) + + assert result == " ".join(cmd) + + def test_sanitize_command_api_key_at_end_no_value(self): + """Test sanitizing command where --api-key is at the end with no value.""" + cmd = ["aiperf", "profile", "--model", "test-model", "--api-key"] + result = AIPerfRunner._sanitize_command_for_logging(cmd) + + # Should just include --api-key without sanitizing since there's no value + assert result == "aiperf profile --model test-model --api-key" + + def test_sanitize_command_empty_list(self): + """Test sanitizing an empty command list.""" + cmd = [] + result = AIPerfRunner._sanitize_command_for_logging(cmd) + + assert result == "" + + def test_sanitize_command_single_element(self): + """Test sanitizing command with a single element.""" + cmd = ["aiperf"] + result = AIPerfRunner._sanitize_command_for_logging(cmd) + + assert result == "aiperf" + + def 
test_sanitize_command_multiple_api_keys(self): + """Test sanitizing command with multiple API key occurrences.""" + cmd = [ + "aiperf", + "profile", + "--api-key", + "first-key", + "--model", + "test-model", + "--api-key", + "second-key", + ] + result = AIPerfRunner._sanitize_command_for_logging(cmd) + assert result == ( + "aiperf profile --api-key ***st-key --model test-model --api-key ****nd-key" + ) + + def test_sanitize_command_preserves_other_values(self): + """Test that other command values are preserved exactly.""" + cmd = [ + "aiperf", + "profile", + "--api-key", + "my-secret-key", + "--concurrency", + "10", + "--benchmark-duration", + "60", + "--streaming", + ] + result = AIPerfRunner._sanitize_command_for_logging(cmd) + + # "my-secret-key" has 13 chars, so 7 asterisks + "et-key" (last 6 chars) + assert result == ( + "aiperf profile --api-key *******et-key --concurrency 10 " + "--benchmark-duration 60 --streaming" + ) + + def test_sanitize_command_short_api_key(self): + """Test sanitizing command with API key shorter than or equal to 6 chars.""" + cmd = ["aiperf", "profile", "--api-key", "abc123"] + result = AIPerfRunner._sanitize_command_for_logging(cmd) + + # "abc123" has exactly 6 chars, so 0 asterisks + all 6 chars + assert result == "aiperf profile --api-key abc123" + + def test_sanitize_command_very_short_api_key(self): + """Test sanitizing command with API key shorter than 6 chars.""" + cmd = ["aiperf", "profile", "--api-key", "abc"] + result = AIPerfRunner._sanitize_command_for_logging(cmd) + + # "abc" has 3 chars, so shows all of them (no asterisks due to negative masking) + assert result == "aiperf profile --api-key abc" + + def test_sanitize_command_long_api_key(self): + """Test sanitizing command with a long API key.""" + cmd = [ + "aiperf", + "profile", + "--api-key", + "sk-proj-1234567890abcdefghijklmnopqrstuvwxyz", + ] + result = AIPerfRunner._sanitize_command_for_logging(cmd) + + # API key has 44 chars, so 38 asterisks + last 6 chars "uvwxyz" + expected_masked = "*" * 38 + "uvwxyz" + assert result == f"aiperf profile --api-key {expected_masked}" + + class TestBuildCommand: """Test the _build_command method.""" From c98066ba6ce324a97a4a2df8255fa6ec969505d8 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Tue, 18 Nov 2025 20:02:45 -0600 Subject: [PATCH 18/30] Remove content_safety_colang2 Guardrail config from benchmark --- .../content_safety_colang2/config.yml | 18 --- .../content_safety_colang2/main.co | 5 - .../content_safety_colang2/prompts.yml | 104 ------------------ .../content_safety_colang2/rails.co | 12 -- 4 files changed, 139 deletions(-) delete mode 100644 nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/config.yml delete mode 100644 nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/main.co delete mode 100644 nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/prompts.yml delete mode 100644 nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/rails.co diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/config.yml b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/config.yml deleted file mode 100644 index cbac1ef01..000000000 --- a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/config.yml +++ /dev/null @@ -1,18 +0,0 @@ -colang_version: 2.x -models: - - type: main - engine: nim - model: meta/llama-3.3-70b-instruct - - type: 
content_safety - engine: nim - model: nvidia/llama-3.1-nemoguard-8b-content-safety - - type: topic_control - engine: nim - model: nvidia/llama-3.1-nemoguard-8b-topic-control - -rails: - config: - jailbreak_detection: - nim_base_url: "https://ai.api.nvidia.com" - nim_server_endpoint: "/v1/security/nvidia/nemoguard-jailbreak-detect" - api_key_env_var: NVIDIA_API_KEY diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/main.co b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/main.co deleted file mode 100644 index e95376eab..000000000 --- a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/main.co +++ /dev/null @@ -1,5 +0,0 @@ -import core -import llm - -flow main - activate llm continuation diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/prompts.yml b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/prompts.yml deleted file mode 100644 index 96c952806..000000000 --- a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/prompts.yml +++ /dev/null @@ -1,104 +0,0 @@ -prompts: - - task: topic_safety_check_input $model=topic_control - content: | - You are to act as a customer service agent, providing users with factual information in accordance to the knowledge base. Your role is to ensure that you respond only to relevant queries and adhere to the following guidelines - - Guidelines for the user messages: - - Do not answer questions related to personal opinions or advice on user's order, future recommendations - - Do not provide any information on non-company products or services. - - Do not answer enquiries unrelated to the company policies. - - Do not answer questions asking for personal details about the agent or its creators. - - Do not answer questions about sensitive topics related to politics, religion, or other sensitive subjects. - - If a user asks topics irrelevant to the company's customer service relations, politely redirect the conversation or end the interaction. - - Your responses should be professional, accurate, and compliant with customer relations guidelines, focusing solely on providing transparent, up-to-date information about the company that is already publicly available. - - allow user comments that are related to small talk and chit-chat. - - - task: content_safety_check_input $model=content_safety - content: | - Task: Check if there is unsafe content in the user or agent messages in the below conversation in accordance with our safety policy with the below categories. Provide a 'safe' or 'unsafe' rating for both the user message and the agent response. If there is unsafe content, please also provide a list of violated categories according to our safety policy below. - - - S1: Violence. - S2: Sexual. - S3: Criminal Planning/Confessions. - S4: Guns and Illegal Weapons. - S5: Controlled/Regulated Substances. - S6: Suicide and Self Harm. - S7: Sexual (minor). - S8: Hate/Identity Hate. - S9: PII/Privacy. - S10: Harassment. - S11: Threat. - S12: Profanity. - S13: Needs Caution. - S14: Other. - S15: Manipulation. - S16: Fraud/Deception. - S17: Malware. - S18: High Risk Gov Decision Making. - S19: Political/Misinformation/Conspiracy. - S20: Copyright/Trademark/Plagiarism. - S21: Unauthorized Advice. - S22: Illegal Activity. - S23: Immoral/Unethical. 
- - - - - user: {{ user_input }} - - - - Provide your safety assessment for the above conversation in the below output JSON format: - {"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} - - Do not include anything other than the output JSON in your response. - Output JSON: - output_parser: nemoguard_parse_prompt_safety - max_tokens: 50 - - - task: content_safety_check_output $model=content_safety - content: | - Task: Check if there is unsafe content in the user or agent messages in the below conversation in accordance with our safety policy with the below categories. Provide a 'safe' or 'unsafe' rating for both the user message and the agent response. If there is unsafe content, please also provide a list of violated categories according to our safety policy below. - - - S1: Violence. - S2: Sexual. - S3: Criminal Planning/Confessions. - S4: Guns and Illegal Weapons. - S5: Controlled/Regulated Substances. - S6: Suicide and Self Harm. - S7: Sexual (minor). - S8: Hate/Identity Hate. - S9: PII/Privacy. - S10: Harassment. - S11: Threat. - S12: Profanity. - S13: Needs Caution. - S14: Other. - S15: Manipulation. - S16: Fraud/Deception. - S17: Malware. - S18: High Risk Gov Decision Making. - S19: Political/Misinformation/Conspiracy. - S20: Copyright/Trademark/Plagiarism. - S21: Unauthorized Advice. - S22: Illegal Activity. - S23: Immoral/Unethical. - - - - - user: {{ user_input }} - - response: agent: {{ bot_response }} - - - - Provide your safety assessment for the above conversation in the below output JSON format: - {"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} - - Do not include anything other than the output JSON in your response. 
- Output JSON: - output_parser: nemoguard_parse_response_safety - max_tokens: 50 diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/rails.co b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/rails.co deleted file mode 100644 index fb0194958..000000000 --- a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang2/rails.co +++ /dev/null @@ -1,12 +0,0 @@ -import guardrails -import nemoguardrails.library.content_safety -import nemoguardrails.library.topic_safety -import nemoguardrails.library.jailbreak_detection - -flow input rails $input_text - content safety check input $model="content_safety" - topic safety check input $model="topic_control" - jailbreak detection model - -flow output rails $output_text - content safety check output $model="content_safety" From ba933cb77f962cbd252aac603c9e175958c1e5c7 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Tue, 18 Nov 2025 20:09:57 -0600 Subject: [PATCH 19/30] Fix TextStreamer import Pyright waiver --- nemoguardrails/llm/providers/huggingface/streamers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemoguardrails/llm/providers/huggingface/streamers.py b/nemoguardrails/llm/providers/huggingface/streamers.py index 4afb44ac4..e726b6e11 100644 --- a/nemoguardrails/llm/providers/huggingface/streamers.py +++ b/nemoguardrails/llm/providers/huggingface/streamers.py @@ -18,9 +18,7 @@ TRANSFORMERS_AVAILABLE = True try: - from transformers.generation.streamers import ( - TextStreamer, # type: ignore[import-untyped] - ) + from transformers.generation.streamers import TextStreamer # type: ignore except ImportError: # Fallback if transformers is not available TRANSFORMERS_AVAILABLE = False From b8d34c4d793299fc0affe80ac8f9452cdf7cc2b0 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 19 Nov 2025 09:09:35 -0600 Subject: [PATCH 20/30] Revert changes to server to give OpenAI-compliant responses --- nemoguardrails/server/api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemoguardrails/server/api.py b/nemoguardrails/server/api.py index 658cffd01..864f156df 100644 --- a/nemoguardrails/server/api.py +++ b/nemoguardrails/server/api.py @@ -475,7 +475,9 @@ async def chat_completion(body: RequestBody, request: Request): except Exception as ex: log.exception(ex) - return ResponseBody(messages=[{"role": "assistant", "content": "Internal server error."}]) + return ResponseBody( + messages=[{"role": "assistant", "content": "Internal server error."}] + ) # By default, there are no challenges From 0c0db94a36ee5e637df3a906b5b15c1b9b6e425b Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 20 Nov 2025 08:55:21 -0600 Subject: [PATCH 21/30] Add API key to /v1/models check, adjust description of AIPerf in CLI --- nemoguardrails/benchmark/aiperf/run_aiperf.py | 11 +- nemoguardrails/server/api.py | 270 ++++++++++++++++-- tests/benchmark/test_run_aiperf.py | 67 +++++ 3 files changed, 317 insertions(+), 31 deletions(-) diff --git a/nemoguardrails/benchmark/aiperf/run_aiperf.py b/nemoguardrails/benchmark/aiperf/run_aiperf.py index 245bddfc1..8ec749940 100755 --- a/nemoguardrails/benchmark/aiperf/run_aiperf.py +++ b/nemoguardrails/benchmark/aiperf/run_aiperf.py @@ -112,7 +112,7 @@ def _sanitize_command_for_logging(cmd: List[str]) -> str: cmd: List of command-line arguments Returns: - String representation with 
--api-key value replaced with + String with --api-key value replaced with * apart from last N chars """ last_n_chars = 6 # Show the last 6 characters @@ -260,8 +260,15 @@ def _check_service(self, endpoint: Optional[str] = "/v1/models") -> None: url = urllib.parse.urljoin(self.config.base_config.url, endpoint) log.debug("Checking service is up using endpoint %s", url) + # If the user has an API Key stored in an env var, use that in the /v1/models call + api_key_env_var = self.config.base_config.api_key_env_var + api_key = None + if api_key_env_var: + api_key = os.environ.get(api_key_env_var) + headers = {"Authorization": f"Bearer {api_key}"} if api_key else None + try: - response = httpx.get(url, timeout=5) + response = httpx.get(url, timeout=5, headers=headers) except httpx.ConnectError as e: raise RuntimeError(f"Can't connect to {url}: {e}") diff --git a/nemoguardrails/server/api.py b/nemoguardrails/server/api.py index 864f156df..568a2de0a 100644 --- a/nemoguardrails/server/api.py +++ b/nemoguardrails/server/api.py @@ -20,11 +20,14 @@ import os.path import re import time + +# For generating unique IDs +import uuid import warnings from contextlib import asynccontextmanager -from typing import Any, Callable, List, Optional +from typing import Any, Callable, List, Optional, cast -from fastapi import FastAPI, Request +from fastapi import FastAPI, HTTPException, Request from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel, Field, root_validator, validator from starlette.responses import StreamingResponse @@ -201,6 +204,10 @@ class RequestBody(BaseModel): # alias="guardrails", validate_default=True, ) + model: Optional[str] = Field( + default=None, + description="Model name (unused)", + ) thread_id: Optional[str] = Field( default=None, min_length=16, @@ -265,6 +272,73 @@ class ResponseBody(BaseModel): ) +# OpenAI-compatible response models +class ModelObject(BaseModel): + """Represents an OpenAI model object.""" + + id: str = Field(description="The model identifier") + object: str = Field(default="model", description="The object type, always 'model'") + created: int = Field(description="Unix timestamp of when the model was created") + owned_by: str = Field( + default="nemo-guardrails", description="Organization that owns the model" + ) + + +class ModelsListResponse(BaseModel): + """Response for listing available models.""" + + object: str = Field(default="list", description="The object type, always 'list'") + data: List[ModelObject] = Field(description="List of model objects") + + +class ChatCompletionMessage(BaseModel): + """A message in a chat completion.""" + + role: str = Field(description="The role of the message author") + content: Optional[str] = Field( + default=None, description="The content of the message" + ) + + +class ChatCompletionChoice(BaseModel): + """A choice in a chat completion response.""" + + index: int = Field(description="The index of the choice") + message: ChatCompletionMessage = Field( + description="The message generated by the model" + ) + finish_reason: Optional[str] = Field( + default="stop", description="The reason the model stopped generating" + ) + + +class UsageInfo(BaseModel): + """Token usage information.""" + + prompt_tokens: int = Field(default=0, description="Number of tokens in the prompt") + completion_tokens: int = Field( + default=0, description="Number of tokens in the completion" + ) + total_tokens: int = Field(default=0, description="Total number of tokens used") + + +class ChatCompletionResponse(BaseModel): + 
"""OpenAI-compatible chat completion response.""" + + id: str = Field(description="A unique identifier for the chat completion") + object: str = Field(default="chat.completion", description="The object type") + created: int = Field( + description="Unix timestamp of when the completion was created" + ) + model: str = Field(description="The model used for the chat completion") + choices: List[ChatCompletionChoice] = Field( + description="List of completion choices" + ) + usage: Optional[UsageInfo] = Field( + default=None, description="Token usage information" + ) + + @app.get( "/v1/rails/configs", summary="Get List of available rails configurations.", @@ -294,6 +368,64 @@ async def get_rails_configs(): return [{"id": config_id} for config_id in config_ids] +@app.get( + "/v1/models", + response_model=ModelsListResponse, + summary="List available models (OpenAI-compatible endpoint).", +) +async def list_models(): + """Returns the list of available models in OpenAI-compatible format. + + Each guardrails configuration is exposed as a separate model. + """ + # Get the list of configs (which we'll expose as models) + configs = await get_rails_configs() + + # Convert configs to OpenAI model format + models = [] + current_timestamp = int(time.time()) + + for config in configs: + # Config always has an 'id' field from get_rails_configs + config_id = str(config["id"]) + model = ModelObject( + id=config_id, + object="model", + created=current_timestamp, + owned_by="nemo-guardrails", + ) + models.append(model) + + return ModelsListResponse(object="list", data=models) + + +@app.get( + "/v1/models/{model}", + response_model=ModelObject, + summary="Retrieve a model instance (OpenAI-compatible endpoint).", +) +async def retrieve_model(model: str): + """Returns details about a specific model in OpenAI-compatible format. + + Each guardrails configuration is exposed as a separate model. + """ + # Get the list of configs to check if the model exists + configs = await get_rails_configs() + config_ids = [str(config["id"]) for config in configs] + + if model not in config_ids: + raise HTTPException(status_code=404, detail=f"Model '{model}' not found") + + # Return the model object + current_timestamp = int(time.time()) + return ModelObject( + id=model, + object="model", + created=current_timestamp, + owned_by="nemo-guardrails", + ) + + # One instance of LLMRails per config id llm_rails_instances: dict[str, LLMRails] = {} llm_rails_events_history_cache: dict[str, dict] = {} @@ -357,13 +489,12 @@ def _get_rails(config_ids: List[str]) -> LLMRails: @app.post( "/v1/chat/completions", - response_model=ResponseBody, response_model_exclude_none=True, ) async def chat_completion(body: RequestBody, request: Request): """Chat completion for the provided conversation. - TODO: add support for explicit state object. + Supports both legacy and OpenAI-compatible response formats. """ log.info("Got request for config %s", body.config_id) for logger in registered_loggers: @@ -385,14 +516,23 @@ async def chat_completion(body: RequestBody, request: Request): llm_rails = _get_rails(config_ids) except ValueError as ex: log.exception(ex) - return ResponseBody( - messages=[ - { - "role": "assistant", - "content": f"Could not load the {config_ids} guardrails configuration. 
" - f"An internal error has occurred.", - } - ] + # Return OpenAI-compatible error response + return ChatCompletionResponse( + id=f"chatcmpl-{uuid.uuid4().hex[:24]}", + object="chat.completion", + created=int(time.time()), + model="Unknown", + choices=[ + ChatCompletionChoice( + index=0, + message=ChatCompletionMessage( + role="assistant", + content=f"Could not load the {config_ids} guardrails configuration. " + f"An internal error has occurred.", + ), + finish_reason="stop", + ) + ], ) try: @@ -409,13 +549,34 @@ async def chat_completion(body: RequestBody, request: Request): # We make sure the `thread_id` meets the minimum complexity requirement. if len(body.thread_id) < 16: - return ResponseBody( - messages=[ - { - "role": "assistant", - "content": "The `thread_id` must have a minimum length of 16 characters.", - } - ] + error_message = ( + "The `thread_id` must have a minimum length of 16 characters." + ) + main_models: list[str | None] = [ + model.model + for model in llm_rails.config.models + if model.type == "main" + ] + + model_name: str = cast( + str, "unknown" if len(main_models) == 0 else main_models[0] + ) + + return ChatCompletionResponse( + id=f"chatcmpl-{uuid.uuid4().hex[:24]}", + object="chat.completion", + created=int(time.time()), + model=model_name, + choices=[ + ChatCompletionChoice( + index=0, + message=ChatCompletionMessage( + role="assistant", + content=error_message, + ), + finish_reason="stop", + ) + ], ) # Fetch the existing thread messages. For easier management, we prepend @@ -462,21 +623,72 @@ async def chat_completion(body: RequestBody, request: Request): if body.thread_id and datastore is not None and datastore_key is not None: await datastore.set(datastore_key, json.dumps(messages + [bot_message])) - result = ResponseBody(messages=[bot_message]) + # Return OpenAI-compatible format if requested + completion_id = f"chatcmpl-{uuid.uuid4().hex[:24]}" + created_timestamp = int(time.time()) - # If we have additional GenerationResponse fields, we return as well - if isinstance(res, GenerationResponse): - result.llm_output = res.llm_output - result.output_data = res.output_data - result.log = res.log - result.state = res.state + main_models: list[str | None] = [ + model.model for model in llm_rails.config.models if model.type == "main" + ] + + model_name: str = cast( + str, "unknown" if len(main_models) == 0 else main_models[0] + ) - return result + response = ChatCompletionResponse( + id=completion_id, + object="chat.completion", + created=created_timestamp, + model=model_name, + choices=[ + ChatCompletionChoice( + index=0, + message=ChatCompletionMessage( + role=bot_message.get("role", "assistant"), + content=bot_message.get("content", ""), + ), + finish_reason="stop", + ) + ], + ) + + # Add usage information if available in the log + if isinstance(res, GenerationResponse) and res.log: + # Try to extract token usage from log if available + # This is a best-effort extraction + response.usage = UsageInfo( + prompt_tokens=0, + completion_tokens=0, + total_tokens=0, + ) + + return response except Exception as ex: + main_models: list[str | None] = [ + model.model for model in llm_rails.config.models if model.type == "main" + ] + + model_name: str = cast( + str, "unknown" if len(main_models) == 0 else main_models[0] + ) + log.exception(ex) - return ResponseBody( - messages=[{"role": "assistant", "content": "Internal server error."}] + return ChatCompletionResponse( + id=f"chatcmpl-{uuid.uuid4().hex[:24]}", + object="chat.completion", + created=int(time.time()), + 
model=model_name, + choices=[ + ChatCompletionChoice( + index=0, + message=ChatCompletionMessage( + role="assistant", + content="Internal server error.", + ), + finish_reason="stop", + ) + ], ) diff --git a/tests/benchmark/test_run_aiperf.py b/tests/benchmark/test_run_aiperf.py index 13fe65297..711379356 100644 --- a/tests/benchmark/test_run_aiperf.py +++ b/tests/benchmark/test_run_aiperf.py @@ -756,6 +756,73 @@ def test_check_service_custom_endpoint(self, create_config_file): call_args = mock_get.call_args assert "/custom/endpoint" in call_args[0][0] + def test_check_service_no_api_key_env_var(self, create_config_file): + """Test checking service when api_key_env_var is not configured (None).""" + config_file = create_config_file() + + runner = AIPerfRunner(config_file) + + # Mock httpx.get + with patch("httpx.get") as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_get.return_value = mock_response + + runner._check_service() + + # Verify headers=None was passed + mock_get.assert_called_once() + call_args = mock_get.call_args + assert call_args[1]["headers"] is None + + def test_check_service_api_key_env_var_not_set(self, create_config_file): + """Test checking service when api_key_env_var is configured but env var doesn't exist.""" + config_file = create_config_file( + extra_base_config={"api_key_env_var": "NONEXISTENT_API_KEY"} + ) + + runner = AIPerfRunner(config_file) + + # Mock httpx.get + with patch("httpx.get") as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_get.return_value = mock_response + + runner._check_service() + + # Verify headers=None was passed (since env var doesn't exist) + mock_get.assert_called_once() + call_args = mock_get.call_args + assert call_args[1]["headers"] is None + + def test_check_service_api_key_env_var_set(self, create_config_file, monkeypatch): + """Test checking service when api_key_env_var is configured and env var exists.""" + config_file = create_config_file( + extra_base_config={"api_key_env_var": "TEST_API_KEY"} + ) + + # Set the environment variable + monkeypatch.setenv("TEST_API_KEY", "test-secret-key-123") + + runner = AIPerfRunner(config_file) + + # Mock httpx.get + with patch("httpx.get") as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_get.return_value = mock_response + + runner._check_service() + + # Verify headers with Authorization Bearer token was passed + mock_get.assert_called_once() + call_args = mock_get.call_args + assert call_args[1]["headers"] is not None + assert ( + call_args[1]["headers"]["Authorization"] == "Bearer test-secret-key-123" + ) + class TestGetBatchDir: """Test the _get_batch_dir method.""" From 46f6d896d7d6f2f5ae1476f6368179aa99db4462 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 20 Nov 2025 08:58:44 -0600 Subject: [PATCH 22/30] Revert server changes --- nemoguardrails/server/api.py | 270 ++++------------------------------- 1 file changed, 29 insertions(+), 241 deletions(-) diff --git a/nemoguardrails/server/api.py b/nemoguardrails/server/api.py index 568a2de0a..864f156df 100644 --- a/nemoguardrails/server/api.py +++ b/nemoguardrails/server/api.py @@ -20,14 +20,11 @@ import os.path import re import time - -# For generating unique IDs -import uuid import warnings from contextlib import asynccontextmanager -from typing import Any, Callable, List, Optional, cast +from typing import Any, Callable, List, Optional -from fastapi import FastAPI, HTTPException, Request +from fastapi 
import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel, Field, root_validator, validator from starlette.responses import StreamingResponse @@ -204,10 +201,6 @@ class RequestBody(BaseModel): # alias="guardrails", validate_default=True, ) - model: Optional[str] = Field( - default=None, - description="Model name (unused)", - ) thread_id: Optional[str] = Field( default=None, min_length=16, @@ -272,73 +265,6 @@ class ResponseBody(BaseModel): ) -# OpenAI-compatible response models -class ModelObject(BaseModel): - """Represents an OpenAI model object.""" - - id: str = Field(description="The model identifier") - object: str = Field(default="model", description="The object type, always 'model'") - created: int = Field(description="Unix timestamp of when the model was created") - owned_by: str = Field( - default="nemo-guardrails", description="Organization that owns the model" - ) - - -class ModelsListResponse(BaseModel): - """Response for listing available models.""" - - object: str = Field(default="list", description="The object type, always 'list'") - data: List[ModelObject] = Field(description="List of model objects") - - -class ChatCompletionMessage(BaseModel): - """A message in a chat completion.""" - - role: str = Field(description="The role of the message author") - content: Optional[str] = Field( - default=None, description="The content of the message" - ) - - -class ChatCompletionChoice(BaseModel): - """A choice in a chat completion response.""" - - index: int = Field(description="The index of the choice") - message: ChatCompletionMessage = Field( - description="The message generated by the model" - ) - finish_reason: Optional[str] = Field( - default="stop", description="The reason the model stopped generating" - ) - - -class UsageInfo(BaseModel): - """Token usage information.""" - - prompt_tokens: int = Field(default=0, description="Number of tokens in the prompt") - completion_tokens: int = Field( - default=0, description="Number of tokens in the completion" - ) - total_tokens: int = Field(default=0, description="Total number of tokens used") - - -class ChatCompletionResponse(BaseModel): - """OpenAI-compatible chat completion response.""" - - id: str = Field(description="A unique identifier for the chat completion") - object: str = Field(default="chat.completion", description="The object type") - created: int = Field( - description="Unix timestamp of when the completion was created" - ) - model: str = Field(description="The model used for the chat completion") - choices: List[ChatCompletionChoice] = Field( - description="List of completion choices" - ) - usage: Optional[UsageInfo] = Field( - default=None, description="Token usage information" - ) - - @app.get( "/v1/rails/configs", summary="Get List of available rails configurations.", @@ -368,64 +294,6 @@ async def get_rails_configs(): return [{"id": config_id} for config_id in config_ids] -@app.get( - "/v1/models", - response_model=ModelsListResponse, - summary="List available models (OpenAI-compatible endpoint).", -) -async def list_models(): - """Returns the list of available models in OpenAI-compatible format. - - Each guardrails configuration is exposed as a separate model. 
- """ - # Get the list of configs (which we'll expose as models) - configs = await get_rails_configs() - - # Convert configs to OpenAI model format - models = [] - current_timestamp = int(time.time()) - - for config in configs: - # Config always has an 'id' field from get_rails_configs - config_id = str(config["id"]) - model = ModelObject( - id=config_id, - object="model", - created=current_timestamp, - owned_by="nemo-guardrails", - ) - models.append(model) - - return ModelsListResponse(object="list", data=models) - - -@app.get( - "/v1/models/{model}", - response_model=ModelObject, - summary="Retrieve a model instance (OpenAI-compatible endpoint).", -) -async def retrieve_model(model: str): - """Returns details about a specific model in OpenAI-compatible format. - - Each guardrails configuration is exposed as a separate model. - """ - # Get the list of configs to check if the model exists - configs = await get_rails_configs() - config_ids = [str(config["id"]) for config in configs] - - if model not in config_ids: - raise HTTPException(status_code=404, detail=f"Model '{model}' not found") - - # Return the model object - current_timestamp = int(time.time()) - return ModelObject( - id=model, - object="model", - created=current_timestamp, - owned_by="nemo-guardrails", - ) - - # One instance of LLMRails per config id llm_rails_instances: dict[str, LLMRails] = {} llm_rails_events_history_cache: dict[str, dict] = {} @@ -489,12 +357,13 @@ def _get_rails(config_ids: List[str]) -> LLMRails: @app.post( "/v1/chat/completions", + response_model=ResponseBody, response_model_exclude_none=True, ) async def chat_completion(body: RequestBody, request: Request): """Chat completion for the provided conversation. - Supports both legacy and OpenAI-compatible response formats. + TODO: add support for explicit state object. """ log.info("Got request for config %s", body.config_id) for logger in registered_loggers: @@ -516,23 +385,14 @@ async def chat_completion(body: RequestBody, request: Request): llm_rails = _get_rails(config_ids) except ValueError as ex: log.exception(ex) - # Return OpenAI-compatible error response - return ChatCompletionResponse( - id=f"chatcmpl-{uuid.uuid4().hex[:24]}", - object="chat.completion", - created=int(time.time()), - model="Unknown", - choices=[ - ChatCompletionChoice( - index=0, - message=ChatCompletionMessage( - role="assistant", - content=f"Could not load the {config_ids} guardrails configuration. " - f"An internal error has occurred.", - ), - finish_reason="stop", - ) - ], + return ResponseBody( + messages=[ + { + "role": "assistant", + "content": f"Could not load the {config_ids} guardrails configuration. " + f"An internal error has occurred.", + } + ] ) try: @@ -549,34 +409,13 @@ async def chat_completion(body: RequestBody, request: Request): # We make sure the `thread_id` meets the minimum complexity requirement. if len(body.thread_id) < 16: - error_message = ( - "The `thread_id` must have a minimum length of 16 characters." 
- ) - main_models: list[str | None] = [ - model.model - for model in llm_rails.config.models - if model.type == "main" - ] - - model_name: str = cast( - str, "unknown" if len(main_models) == 0 else main_models[0] - ) - - return ChatCompletionResponse( - id=f"chatcmpl-{uuid.uuid4().hex[:24]}", - object="chat.completion", - created=int(time.time()), - model=model_name, - choices=[ - ChatCompletionChoice( - index=0, - message=ChatCompletionMessage( - role="assistant", - content=error_message, - ), - finish_reason="stop", - ) - ], + return ResponseBody( + messages=[ + { + "role": "assistant", + "content": "The `thread_id` must have a minimum length of 16 characters.", + } + ] ) # Fetch the existing thread messages. For easier management, we prepend @@ -623,72 +462,21 @@ async def chat_completion(body: RequestBody, request: Request): if body.thread_id and datastore is not None and datastore_key is not None: await datastore.set(datastore_key, json.dumps(messages + [bot_message])) - # Return OpenAI-compatible format if requested - completion_id = f"chatcmpl-{uuid.uuid4().hex[:24]}" - created_timestamp = int(time.time()) - - main_models: list[str | None] = [ - model.model for model in llm_rails.config.models if model.type == "main" - ] - - model_name: str = cast( - str, "unknown" if len(main_models) == 0 else main_models[0] - ) + result = ResponseBody(messages=[bot_message]) - response = ChatCompletionResponse( - id=completion_id, - object="chat.completion", - created=created_timestamp, - model=model_name, - choices=[ - ChatCompletionChoice( - index=0, - message=ChatCompletionMessage( - role=bot_message.get("role", "assistant"), - content=bot_message.get("content", ""), - ), - finish_reason="stop", - ) - ], - ) - - # Add usage information if available in the log - if isinstance(res, GenerationResponse) and res.log: - # Try to extract token usage from log if available - # This is a best-effort extraction - response.usage = UsageInfo( - prompt_tokens=0, - completion_tokens=0, - total_tokens=0, - ) + # If we have additional GenerationResponse fields, we return as well + if isinstance(res, GenerationResponse): + result.llm_output = res.llm_output + result.output_data = res.output_data + result.log = res.log + result.state = res.state - return response + return result except Exception as ex: - main_models: list[str | None] = [ - model.model for model in llm_rails.config.models if model.type == "main" - ] - - model_name: str = cast( - str, "unknown" if len(main_models) == 0 else main_models[0] - ) - log.exception(ex) - return ChatCompletionResponse( - id=f"chatcmpl-{uuid.uuid4().hex[:24]}", - object="chat.completion", - created=int(time.time()), - model=model_name, - choices=[ - ChatCompletionChoice( - index=0, - message=ChatCompletionMessage( - role="assistant", - content="Internal server error.", - ), - finish_reason="stop", - ) - ], + return ResponseBody( + messages=[{"role": "assistant", "content": "Internal server error."}] ) From bc79ebd5e14958ca15b04c11365ee4d671265ade Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 1 Dec 2025 09:34:00 -0600 Subject: [PATCH 23/30] Address PR feedback --- nemoguardrails/benchmark/aiperf/README.md | 13 ++++++++----- nemoguardrails/benchmark/aiperf/run_aiperf.py | 18 ++++++++++++------ tests/benchmark/test_run_aiperf.py | 18 +++++++++++++++++- 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/nemoguardrails/benchmark/aiperf/README.md b/nemoguardrails/benchmark/aiperf/README.md index 
338b0432d..437bd361a 100644 --- a/nemoguardrails/benchmark/aiperf/README.md +++ b/nemoguardrails/benchmark/aiperf/README.md @@ -2,7 +2,7 @@ ## Introduction -[AIPerf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) is NVIDIA's latest benchmarking tool for LLMs. It supports any OpenAI-compatible inference service and generates synthetic data loads, benchmarks, and all the metrics needed for performance comparison and analysis. +[AIPerf](https://github.com/ai-dynamo/aiperf) is NVIDIA's latest benchmarking tool for LLMs. It supports any OpenAI-compatible inference service and generates synthetic data loads, benchmarks, and all the metrics needed for performance comparison and analysis. The [`run_aiperf.py`](run_aiperf.py) script enhances AIPerf's capabilities by providing: @@ -20,6 +20,9 @@ Instead of manually running AIPerf multiple times with different parameters, you ### Prerequisites These steps have been tested with Python 3.11.11. +To use the provided configurations, you need to create accounts at https://build.nvidia.com/ and [Huggingface](https://huggingface.co/). +The provided configurations use models hosted at https://build.nvidia.com/, you'll need to create a Personal API Key to access the models. +AIperf requires the [Meta Llama 3.3 70B Instruct tokenizer](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) to calculate token-counts. 1. **Install NeMo Guardrails with developer tooling:** @@ -33,21 +36,21 @@ These steps have been tested with Python 3.11.11. poetry run pip install aiperf langchain-nvidia-ai-endpoints ``` -3. **[Optional] Install/upgrade Hugging Face Hub:** +3. ** Install/upgrade Hugging Face Hub:** AIPerf needs a tokenizer to run and will download one from Hugging Face if available. If you have the tokenizer locally, you can point to that directory and not log into Huggingface. ```bash - pip install --upgrade huggingface_hub + poetry run pip install --upgrade huggingface_hub ``` -4. **[Optional] Login to Hugging Face:** +4. ** Login to Hugging Face:** ```bash huggingface-cli login ``` -5. **[Optional] Set NVIDIA API Key:** +5. 
** Set NVIDIA API Key:** To use models hosted on [build.nvidia.com](https://build.nvidia.com/), set your API key: diff --git a/nemoguardrails/benchmark/aiperf/run_aiperf.py b/nemoguardrails/benchmark/aiperf/run_aiperf.py index 8ec749940..4ce5491dc 100755 --- a/nemoguardrails/benchmark/aiperf/run_aiperf.py +++ b/nemoguardrails/benchmark/aiperf/run_aiperf.py @@ -34,18 +34,16 @@ from nemoguardrails.benchmark.aiperf.aiperf_models import AIPerfConfig -# Set up logging log = logging.getLogger(__name__) -log.setLevel(logging.INFO) # Set the lowest level to capture all messages +log.setLevel(logging.INFO) formatter = logging.Formatter( "%(asctime)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S" ) console_handler = logging.StreamHandler() -console_handler.setLevel(logging.DEBUG) # DEBUG and higher will go to the console +console_handler.setLevel(logging.DEBUG) console_handler.setFormatter(formatter) -# Add the console handler for logging log.addHandler(console_handler) @@ -97,6 +95,14 @@ def _get_sweep_combinations(self) -> Optional[List[Dict[str, Union[int, str]]]]: param_names = list(self.config.sweeps.keys()) param_values = [self.config.sweeps[name] for name in param_names] + num_runs = 1 + for _, sweep_values in self.config.sweeps.items(): + num_runs *= len(sweep_values) + + max_runs = 100 + if num_runs > max_runs: + raise RuntimeError(f"Requested {num_runs} runs, max is {max_runs}") + # Generate all combinations combinations = [] for combination in itertools.product(*param_values): @@ -168,9 +174,9 @@ def _build_command( api_key = os.environ.get(value) if not api_key: raise RuntimeError( - f"Environment variable {value} not set. Please store the API Key in {value}" + f"Environment variable '{value}' is not set. Please set it: export {value}='your-api-key'" ) - cmd.extend([f"--api-key", str(api_key)]) + cmd.extend(["--api-key", str(api_key)]) continue # Convert underscores to hyphens for CLI arguments diff --git a/tests/benchmark/test_run_aiperf.py b/tests/benchmark/test_run_aiperf.py index 711379356..3efd54d78 100644 --- a/tests/benchmark/test_run_aiperf.py +++ b/tests/benchmark/test_run_aiperf.py @@ -222,6 +222,22 @@ def test_multiple_sweep_parameters(self, create_config_file): assert {"concurrency": 2, "benchmark_duration": 30} in combinations assert {"concurrency": 2, "benchmark_duration": 60} in combinations + def test_too_many_runs_raises(self, create_config_file): + """Test sweeps with more than 100 runs to make sure Excpetion is raised""" + + # Create a config with two parameter sweeps of 100 each + # This has a total of 10,000, greater than 100 limit + config_file = create_config_file( + sweeps={ + "concurrency": list(range(100)), + "benchmark_duration": list(range(100)), + } + ) + + runner = AIPerfRunner(config_file) + with pytest.raises(RuntimeError, match="Requested 10000 runs, max is 100"): + _ = runner._get_sweep_combinations() + class TestSanitizeCommandForLogging: """Test the _sanitize_command_for_logging static method.""" @@ -419,7 +435,7 @@ def test_build_command_with_missing_api_key_env_var( output_dir = tmp_path / "output" with pytest.raises( - RuntimeError, match="Environment variable MISSING_API_KEY not set" + RuntimeError, match="Environment variable 'MISSING_API_KEY' is not set. 
Please set it: export MISSING_API_KEY='your-api-key'" ): runner._build_command(None, output_dir) From b5aad83108d9fce9acd51b1b7d33adf64e3d9731 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 1 Dec 2025 09:49:47 -0600 Subject: [PATCH 24/30] Move aiperf code to top-level --- {nemoguardrails/benchmark/aiperf => aiperf}/README.md | 0 {nemoguardrails/benchmark/aiperf => aiperf}/__init__.py | 0 .../aiperf => aiperf}/aiperf_configs/single_concurrency.yaml | 0 .../aiperf => aiperf}/aiperf_configs/sweep_concurrency.yaml | 0 {nemoguardrails/benchmark/aiperf => aiperf}/aiperf_models.py | 0 {nemoguardrails/benchmark/aiperf => aiperf}/run_aiperf.py | 2 +- nemoguardrails/llm/providers/huggingface/streamers.py | 4 +++- 7 files changed, 4 insertions(+), 2 deletions(-) rename {nemoguardrails/benchmark/aiperf => aiperf}/README.md (100%) rename {nemoguardrails/benchmark/aiperf => aiperf}/__init__.py (100%) rename {nemoguardrails/benchmark/aiperf => aiperf}/aiperf_configs/single_concurrency.yaml (100%) rename {nemoguardrails/benchmark/aiperf => aiperf}/aiperf_configs/sweep_concurrency.yaml (100%) rename {nemoguardrails/benchmark/aiperf => aiperf}/aiperf_models.py (100%) rename {nemoguardrails/benchmark/aiperf => aiperf}/run_aiperf.py (99%) diff --git a/nemoguardrails/benchmark/aiperf/README.md b/aiperf/README.md similarity index 100% rename from nemoguardrails/benchmark/aiperf/README.md rename to aiperf/README.md diff --git a/nemoguardrails/benchmark/aiperf/__init__.py b/aiperf/__init__.py similarity index 100% rename from nemoguardrails/benchmark/aiperf/__init__.py rename to aiperf/__init__.py diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml b/aiperf/aiperf_configs/single_concurrency.yaml similarity index 100% rename from nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml rename to aiperf/aiperf_configs/single_concurrency.yaml diff --git a/nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml b/aiperf/aiperf_configs/sweep_concurrency.yaml similarity index 100% rename from nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml rename to aiperf/aiperf_configs/sweep_concurrency.yaml diff --git a/nemoguardrails/benchmark/aiperf/aiperf_models.py b/aiperf/aiperf_models.py similarity index 100% rename from nemoguardrails/benchmark/aiperf/aiperf_models.py rename to aiperf/aiperf_models.py diff --git a/nemoguardrails/benchmark/aiperf/run_aiperf.py b/aiperf/run_aiperf.py similarity index 99% rename from nemoguardrails/benchmark/aiperf/run_aiperf.py rename to aiperf/run_aiperf.py index 4ce5491dc..00ec09fbe 100755 --- a/nemoguardrails/benchmark/aiperf/run_aiperf.py +++ b/aiperf/run_aiperf.py @@ -32,7 +32,7 @@ import yaml from pydantic import ValidationError -from nemoguardrails.benchmark.aiperf.aiperf_models import AIPerfConfig +from aiperf.aiperf_models import AIPerfConfig log = logging.getLogger(__name__) log.setLevel(logging.INFO) diff --git a/nemoguardrails/llm/providers/huggingface/streamers.py b/nemoguardrails/llm/providers/huggingface/streamers.py index e726b6e11..14c406124 100644 --- a/nemoguardrails/llm/providers/huggingface/streamers.py +++ b/nemoguardrails/llm/providers/huggingface/streamers.py @@ -18,7 +18,9 @@ TRANSFORMERS_AVAILABLE = True try: - from transformers.generation.streamers import TextStreamer # type: ignore + from transformers.generation.streamers import ( # type: ignore[import-untyped] + TextStreamer, + ) except ImportError: # Fallback if transformers is not 
available TRANSFORMERS_AVAILABLE = False From 139f124d16e03b1639d7510a430aa4dfb11b0c98 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 1 Dec 2025 09:52:05 -0600 Subject: [PATCH 25/30] Update tests for new aiperf location --- tests/benchmark/test_aiperf_models.py | 2 +- tests/benchmark/test_run_aiperf.py | 39 ++++++++++----------------- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/tests/benchmark/test_aiperf_models.py b/tests/benchmark/test_aiperf_models.py index 32eb35c86..991304f62 100644 --- a/tests/benchmark/test_aiperf_models.py +++ b/tests/benchmark/test_aiperf_models.py @@ -22,7 +22,7 @@ import pytest from pydantic import ValidationError -from nemoguardrails.benchmark.aiperf.aiperf_models import AIPerfConfig, BaseConfig +from aiperf.aiperf_models import AIPerfConfig, BaseConfig class TestBaseConfig: diff --git a/tests/benchmark/test_run_aiperf.py b/tests/benchmark/test_run_aiperf.py index 3efd54d78..9c1045143 100644 --- a/tests/benchmark/test_run_aiperf.py +++ b/tests/benchmark/test_run_aiperf.py @@ -29,8 +29,8 @@ import yaml from typer.testing import CliRunner -from nemoguardrails.benchmark.aiperf.aiperf_models import AIPerfConfig, BaseConfig -from nemoguardrails.benchmark.aiperf.run_aiperf import AIPerfRunner, AIPerfSummary +from aiperf.aiperf_models import AIPerfConfig, BaseConfig +from aiperf.run_aiperf import AIPerfRunner, AIPerfSummary @pytest.fixture @@ -435,7 +435,8 @@ def test_build_command_with_missing_api_key_env_var( output_dir = tmp_path / "output" with pytest.raises( - RuntimeError, match="Environment variable 'MISSING_API_KEY' is not set. Please set it: export MISSING_API_KEY='your-api-key'" + RuntimeError, + match="Environment variable 'MISSING_API_KEY' is not set. 
Please set it: export MISSING_API_KEY='your-api-key'", ): runner._build_command(None, output_dir) @@ -488,9 +489,7 @@ def test_build_command_ui_type_debug(self, create_config_file, tmp_path): output_dir = tmp_path / "output" # Patch log.level to be DEBUG - with patch( - "nemoguardrails.benchmark.aiperf.run_aiperf.log.level", logging.DEBUG - ): + with patch("aiperf.run_aiperf.log.level", logging.DEBUG): cmd = runner._build_command(None, output_dir) assert "--ui-type" in cmd @@ -505,9 +504,7 @@ def test_build_command_ui_type_non_debug(self, create_config_file, tmp_path): output_dir = tmp_path / "output" # Patch log.level to be INFO - with patch( - "nemoguardrails.benchmark.aiperf.run_aiperf.log.level", logging.INFO - ): + with patch("aiperf.run_aiperf.log.level", logging.INFO): cmd = runner._build_command(None, output_dir) assert "--ui-type" in cmd @@ -1141,12 +1138,10 @@ def test_cli_run_command_basic(self, create_config_file): config_file = create_config_file() runner = CliRunner() - from nemoguardrails.benchmark.aiperf.run_aiperf import app + from aiperf.run_aiperf import app # Mock the runner and service check - with patch( - "nemoguardrails.benchmark.aiperf.run_aiperf.AIPerfRunner" - ) as mock_runner_class: + with patch("aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: mock_runner = Mock() mock_runner.run.return_value = 0 mock_runner_class.return_value = mock_runner @@ -1161,12 +1156,10 @@ def test_cli_run_command_with_verbose(self, create_config_file): config_file = create_config_file() runner = CliRunner() - from nemoguardrails.benchmark.aiperf.run_aiperf import app + from aiperf.run_aiperf import app # Mock the runner and service check - with patch( - "nemoguardrails.benchmark.aiperf.run_aiperf.AIPerfRunner" - ) as mock_runner_class: + with patch("aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: mock_runner = Mock() mock_runner.run.return_value = 0 mock_runner_class.return_value = mock_runner @@ -1183,12 +1176,10 @@ def test_cli_run_command_with_dry_run(self, create_config_file): config_file = create_config_file() runner = CliRunner() - from nemoguardrails.benchmark.aiperf.run_aiperf import app + from aiperf.run_aiperf import app # Mock the runner and service check - with patch( - "nemoguardrails.benchmark.aiperf.run_aiperf.AIPerfRunner" - ) as mock_runner_class: + with patch("aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: mock_runner = Mock() mock_runner.run.return_value = 0 mock_runner_class.return_value = mock_runner @@ -1205,12 +1196,10 @@ def test_cli_run_command_with_failure(self, create_config_file): config_file = create_config_file() runner = CliRunner() - from nemoguardrails.benchmark.aiperf.run_aiperf import app + from aiperf.run_aiperf import app # Mock the runner to return failure - with patch( - "nemoguardrails.benchmark.aiperf.run_aiperf.AIPerfRunner" - ) as mock_runner_class: + with patch("aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: mock_runner = Mock() mock_runner.run.return_value = 1 # Failure mock_runner_class.return_value = mock_runner From 64ee0b576a61fb7661085c2237872ca3093b10dd Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 1 Dec 2025 10:08:30 -0600 Subject: [PATCH 26/30] Rename configs directory --- aiperf/{aiperf_configs => configs}/single_concurrency.yaml | 0 aiperf/{aiperf_configs => configs}/sweep_concurrency.yaml | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename aiperf/{aiperf_configs => configs}/single_concurrency.yaml (100%) rename aiperf/{aiperf_configs => 
configs}/sweep_concurrency.yaml (100%) diff --git a/aiperf/aiperf_configs/single_concurrency.yaml b/aiperf/configs/single_concurrency.yaml similarity index 100% rename from aiperf/aiperf_configs/single_concurrency.yaml rename to aiperf/configs/single_concurrency.yaml diff --git a/aiperf/aiperf_configs/sweep_concurrency.yaml b/aiperf/configs/sweep_concurrency.yaml similarity index 100% rename from aiperf/aiperf_configs/sweep_concurrency.yaml rename to aiperf/configs/sweep_concurrency.yaml From 04e3511a475948053332bb735b4774b65a9f4b05 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 1 Dec 2025 10:55:50 -0600 Subject: [PATCH 27/30] Create self-contained typer app, update README with new commands to run it --- aiperf/README.md | 65 ++++++++++++++++++++------------------------ aiperf/__main__.py | 25 +++++++++++++++++ aiperf/run_aiperf.py | 6 ++++ 3 files changed, 61 insertions(+), 35 deletions(-) create mode 100644 aiperf/__main__.py diff --git a/aiperf/README.md b/aiperf/README.md index 437bd361a..84c58193a 100644 --- a/aiperf/README.md +++ b/aiperf/README.md @@ -21,41 +21,36 @@ Instead of manually running AIPerf multiple times with different parameters, you These steps have been tested with Python 3.11.11. To use the provided configurations, you need to create accounts at https://build.nvidia.com/ and [Huggingface](https://huggingface.co/). -The provided configurations use models hosted at https://build.nvidia.com/, you'll need to create a Personal API Key to access the models. -AIperf requires the [Meta Llama 3.3 70B Instruct tokenizer](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) to calculate token-counts. +* The provided configurations use models hosted at https://build.nvidia.com/, you'll need to create a Personal API Key to access the models. +* The provided AIperf configurations require the [Meta Llama 3.3 70B Instruct tokenizer](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) to calculate token-counts. -1. **Install NeMo Guardrails with developer tooling:** +1. **Create a virtual environment in which to install AIPerf** ```bash - poetry install --with dev + $ mkdir ~/env + $ python -m venv ~/env/aiperf ``` -2. **Install AIPerf and NVIDIA AI Endpoints:** +2. **Install dependencies in the virtual environment** ```bash - poetry run pip install aiperf langchain-nvidia-ai-endpoints + $ pip install aiperf huggingface_hub typer ``` -3. ** Install/upgrade Hugging Face Hub:** - - AIPerf needs a tokenizer to run and will download one from Hugging Face if available. If you have the tokenizer locally, you can point to that directory and not log into Huggingface. - - ```bash - poetry run pip install --upgrade huggingface_hub - ``` - -4. ** Login to Hugging Face:** +3. ** Login to Hugging Face:** ```bash huggingface-cli login ``` -5. ** Set NVIDIA API Key:** +4. ** Set NVIDIA API Key:** - To use models hosted on [build.nvidia.com](https://build.nvidia.com/), set your API key: + The provided configs use models hosted on [build.nvidia.com](https://build.nvidia.com/). + To access these, [create an account](https://build.nvidia.com/), and create a Personal API Key. + After creating a Personal API key, set the `NVIDIA_API_KEY` variable as below. ```bash - export NVIDIA_API_KEY="your-api-key-here" + $ export NVIDIA_API_KEY="your-api-key-here" ``` ## Running Benchmarks @@ -70,7 +65,7 @@ There are two example configs included which can be extended for your use-cases. 
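As a hedged aside on the API-key plumbing this series touches: the `api_key_env_var` setting is resolved from the environment before the `aiperf` command line is built, and the same key is reused as a Bearer token for the pre-flight service check. A minimal sketch of that resolution, with an illustrative helper name rather than the patch's literal code:

```python
import os
from typing import Optional


def resolve_api_key(env_var_name: Optional[str]) -> Optional[str]:
    """Resolve the key named by `api_key_env_var`; None means no auth is configured."""
    if env_var_name is None:
        return None  # the health check then sends headers=None
    api_key = os.environ.get(env_var_name)
    if not api_key:
        raise RuntimeError(
            f"Environment variable '{env_var_name}' is not set. "
            f"Please set it: export {env_var_name}='your-api-key'"
        )
    return api_key


# The runner appends ["--api-key", key] to the aiperf command and, for the
# pre-flight check, sends {"Authorization": f"Bearer {key}"} when a key exists.
```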
To run a benchmark, use the following command: ```bash -poetry run nemoguardrails aiperf run --config-file +$ python -m aiperf --config-file ``` ### Running a Single Benchmark @@ -78,31 +73,31 @@ poetry run nemoguardrails aiperf run --config-file To run a single benchmark with fixed parameters, use the `single_concurrency.yaml` configuration: ```bash -poetry run nemoguardrails aiperf run --config-file nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml +$ python -m aiperf --config-file aiperf/configs/single_concurrency.yaml ``` **Example output:** ```text -2025-11-14 13:58:21 INFO: Running AIPerf with configuration: nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml -2025-11-14 13:58:21 INFO: Results root directory: aiperf_results/single_concurrency/20251114_135821 -2025-11-14 13:58:21 INFO: Sweeping parameters: None -2025-11-14 13:58:21 INFO: Running AIPerf with configuration: nemoguardrails/benchmark/aiperf/aiperf_configs/single_concurrency.yaml -2025-11-14 13:58:21 INFO: Output directory: aiperf_results/single_concurrency/20251114_135821 -2025-11-14 13:58:21 INFO: Single Run -2025-11-14 13:59:58 INFO: Run completed successfully -2025-11-14 13:59:58 INFO: SUMMARY -2025-11-14 13:59:58 INFO: Total runs : 1 -2025-11-14 13:59:58 INFO: Completed : 1 -2025-11-14 13:59:58 INFO: Failed : 0 +2025-12-01 10:35:17 INFO: Running AIPerf with configuration: aiperf/configs/single_concurrency.yaml +2025-12-01 10:35:17 INFO: Results root directory: aiperf_results/single_concurrency/20251201_103517 +2025-12-01 10:35:17 INFO: Sweeping parameters: None +2025-12-01 10:35:17 INFO: Running AIPerf with configuration: aiperf/configs/single_concurrency.yaml +2025-12-01 10:35:17 INFO: Output directory: aiperf_results/single_concurrency/20251201_103517 +2025-12-01 10:35:17 INFO: Single Run +2025-12-01 10:36:54 INFO: Run completed successfully +2025-12-01 10:36:54 INFO: SUMMARY +2025-12-01 10:36:54 INFO: Total runs : 1 +2025-12-01 10:36:54 INFO: Completed : 1 +2025-12-01 10:36:54 INFO: Failed : 0 ``` ### Running a Concurrency Sweep -To run multiple benchmarks with different concurrency levels, use the `sweep_concurrency.yaml` configuration: +To run multiple benchmarks with different concurrency levels, use the `sweep_concurrency.yaml` configuration as below: ```bash -poetry run nemoguardrails aiperf run --config-file nemoguardrails/benchmark/aiperf/aiperf_configs/sweep_concurrency.yaml +$ python -m aiperf --config-file aiperf/configs/sweep_concurrency.yaml ``` **Example output:** @@ -139,7 +134,7 @@ The `--dry-run` option allows you to preview all benchmark commands without exec - Debugging configuration issues ```bash -poetry run nemoguardrails aiperf run --config-file --dry-run +$ python -m aiperf --config-file aiperf/configs/sweep_concurrency.yaml --dry-run ``` When in dry-run mode, the script will: @@ -155,7 +150,7 @@ When in dry-run mode, the script will: The `--verbose` option outputs more detailed debugging information to understand each step of the benchmarking process. ```bash -poetry run nemoguardrails aiperf run --config-file --verbose +$ python -m aiperf --config-file --verbose ``` Verbose mode provides: diff --git a/aiperf/__main__.py b/aiperf/__main__.py new file mode 100644 index 000000000..90dc5d8f9 --- /dev/null +++ b/aiperf/__main__.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Entry point for running aiperf as a module: python -m aiperf""" + +from aiperf.run_aiperf import app + +if __name__ == "__main__": + app() diff --git a/aiperf/run_aiperf.py b/aiperf/run_aiperf.py index 00ec09fbe..a9dd04c84 100755 --- a/aiperf/run_aiperf.py +++ b/aiperf/run_aiperf.py @@ -46,6 +46,8 @@ log.addHandler(console_handler) +app = typer.Typer() + @dataclass class AIPerfSummary: @@ -486,3 +488,7 @@ def run( exit_code = runner.run(dry_run=dry_run) raise typer.Exit(code=exit_code) + + +if __name__ == "__main__": + app() From f95d772202c602b0d91e4281b542f66e2a35c87c Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 1 Dec 2025 11:31:31 -0600 Subject: [PATCH 28/30] Rebase onto develop and re-run ruff formatter --- aiperf/aiperf_models.py | 28 +++------- aiperf/run_aiperf.py | 30 +++-------- nemoguardrails/cli/__init__.py | 1 - nemoguardrails/server/api.py | 4 +- tests/benchmark/test_aiperf_models.py | 16 ++---- tests/benchmark/test_run_aiperf.py | 78 +++++++-------------------- 6 files changed, 38 insertions(+), 119 deletions(-) diff --git a/aiperf/aiperf_models.py b/aiperf/aiperf_models.py index be436a491..f3c0f6843 100644 --- a/aiperf/aiperf_models.py +++ b/aiperf/aiperf_models.py @@ -33,22 +33,16 @@ class BaseConfig(BaseModel): description="Optional tokenizer Huggingface name, or local directory", ) url: str = Field(..., description="Model base URL") - endpoint: str = Field( - default="/v1/chat/completions", description="API endpoint path" - ) + endpoint: str = Field(default="/v1/chat/completions", description="API endpoint path") endpoint_type: Literal["chat", "completions"] = Field( default="chat", description="Type of endpoint (chat or completions)", ) - api_key_env_var: Optional[str] = Field( - default=None, description="API key environment variable" - ) + api_key_env_var: Optional[str] = Field(default=None, description="API key environment variable") streaming: Optional[bool] = Field(default=False, description="Streaming mode") # Load generation settings - warmup_request_count: int = Field( - description="Requests to send before beginning performance-test" - ) + warmup_request_count: int = Field(description="Requests to send before beginning performance-test") benchmark_duration: int = Field(description="Benchmark duration in seconds") concurrency: int = Field(description="Number of concurrent requests") request_rate: Optional[float] = Field( @@ -61,9 +55,7 @@ class BaseConfig(BaseModel): ) # Synthetic data generation - random_seed: Optional[int] = Field( - default=None, description="Random seed for reproducibility" - ) + random_seed: Optional[int] = Field(default=None, description="Random seed for reproducibility") prompt_input_tokens_mean: Optional[int] = Field( default=None, description="Mean number 
of input tokens", @@ -85,16 +77,12 @@ class BaseConfig(BaseModel): class AIPerfConfig(BaseModel): """Main configuration model for AIPerf benchmark runner.""" - batch_name: str = Field( - default="benchmark", description="Name for this batch of benchmarks" - ) + batch_name: str = Field(default="benchmark", description="Name for this batch of benchmarks") output_base_dir: str = Field( default="aiperf_results", description="Base directory for benchmark results", ) - base_config: BaseConfig = Field( - ..., description="Base configuration applied to all benchmark runs" - ) + base_config: BaseConfig = Field(..., description="Base configuration applied to all benchmark runs") sweeps: Optional[Dict[str, List[Union[int, str]]]] = Field( default=None, description="Parameter sweeps. Key is the parameter to change, value is a list of values to use", @@ -102,9 +90,7 @@ class AIPerfConfig(BaseModel): @field_validator("sweeps") @classmethod - def validate_sweeps( - cls, v: Optional[Dict[str, List[Any]]] - ) -> Optional[Dict[str, List[Any]]]: + def validate_sweeps(cls, v: Optional[Dict[str, List[Any]]]) -> Optional[Dict[str, List[Any]]]: """Validate that sweep values are lists of ints or strings.""" if v is None: return v diff --git a/aiperf/run_aiperf.py b/aiperf/run_aiperf.py index a9dd04c84..3470f1e3e 100755 --- a/aiperf/run_aiperf.py +++ b/aiperf/run_aiperf.py @@ -25,7 +25,7 @@ from datetime import datetime from pathlib import Path from subprocess import CompletedProcess -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union import httpx import typer @@ -37,9 +37,7 @@ log = logging.getLogger(__name__) log.setLevel(logging.INFO) -formatter = logging.Formatter( - "%(asctime)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S" -) +formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S") console_handler = logging.StreamHandler() console_handler.setLevel(logging.DEBUG) console_handler.setFormatter(formatter) @@ -143,9 +141,7 @@ def _sanitize_command_for_logging(cmd: List[str]) -> str: return " ".join(sanitized) - def _build_command( - self, sweep_params: Optional[Dict[str, Union[str, int]]], output_dir: Path - ) -> List[str]: + def _build_command(self, sweep_params: Optional[Dict[str, Union[str, int]]], output_dir: Path) -> List[str]: """Create a list of strings with the aiperf command and arguments to execute""" # Run aiperf in profile mode: `aiperf profile` @@ -239,9 +235,7 @@ def _save_run_metadata( json.dump(metadata, f, indent=2) @staticmethod - def _save_subprocess_result_json( - output_dir: Path, result: CompletedProcess - ) -> None: + def _save_subprocess_result_json(output_dir: Path, result: CompletedProcess) -> None: """Save the subprocess result to the given filename""" process_result_file = output_dir / "process_result.json" @@ -252,15 +246,11 @@ def _save_subprocess_result_json( json.dump(save_data, f, indent=2) except (IOError, OSError) as e: - log.error( - "Could not write %s to file %s: %s", save_data, process_result_file, e - ) + log.error("Could not write %s to file %s: %s", save_data, process_result_file, e) raise except TypeError as e: - log.error( - "Couldn't serialize %s to %s: %s", save_data, process_result_file, e - ) + log.error("Couldn't serialize %s to %s: %s", save_data, process_result_file, e) raise def _check_service(self, endpoint: Optional[str] = "/v1/models") -> None: @@ -357,9 +347,7 @@ def run_single_benchmark( log.info("Run completed successfully") 
self._save_subprocess_result_json(run_output_dir, result) run_completed = 1 if result.returncode == 0 else 0 - return AIPerfSummary( - total=1, completed=run_completed, failed=1 - run_completed - ) + return AIPerfSummary(total=1, completed=run_completed, failed=1 - run_completed) except subprocess.CalledProcessError as e: log.error("Run failed with exit code %s", e.returncode) @@ -379,9 +367,7 @@ def run_batch_benchmarks( # Generate all sweep combinations combinations = self._get_sweep_combinations() if not combinations: - raise RuntimeError( - f"Can't generate sweep combinations from {self.config.sweeps}" - ) + raise RuntimeError(f"Can't generate sweep combinations from {self.config.sweeps}") num_combinations = len(combinations) log.info("Running %s benchmarks", num_combinations) diff --git a/nemoguardrails/cli/__init__.py b/nemoguardrails/cli/__init__.py index c40ecf7b4..97a8faed6 100644 --- a/nemoguardrails/cli/__init__.py +++ b/nemoguardrails/cli/__init__.py @@ -25,7 +25,6 @@ from nemoguardrails import __version__ from nemoguardrails.actions_server import actions_server -from nemoguardrails.benchmark.aiperf.run_aiperf import app as aiperf_app from nemoguardrails.cli.chat import run_chat from nemoguardrails.cli.migration import migrate from nemoguardrails.cli.providers import _list_providers, select_provider_with_type diff --git a/nemoguardrails/server/api.py b/nemoguardrails/server/api.py index 864f156df..658cffd01 100644 --- a/nemoguardrails/server/api.py +++ b/nemoguardrails/server/api.py @@ -475,9 +475,7 @@ async def chat_completion(body: RequestBody, request: Request): except Exception as ex: log.exception(ex) - return ResponseBody( - messages=[{"role": "assistant", "content": "Internal server error."}] - ) + return ResponseBody(messages=[{"role": "assistant", "content": "Internal server error."}]) # By default, there are no challenges diff --git a/tests/benchmark/test_aiperf_models.py b/tests/benchmark/test_aiperf_models.py index 991304f62..b95e5095a 100644 --- a/tests/benchmark/test_aiperf_models.py +++ b/tests/benchmark/test_aiperf_models.py @@ -236,11 +236,7 @@ def test_aiperf_config_sweep_invalid_value_type_dict(self, valid_base_config): error_msg = str(exc_info.value) # Pydantic catches this during type validation assert "sweeps.concurrency" in error_msg - assert ( - "must be int or str" in error_msg - or "int_type" in error_msg - or "string_type" in error_msg - ) + assert "must be int or str" in error_msg or "int_type" in error_msg or "string_type" in error_msg def test_aiperf_config_sweep_invalid_value_type_list(self, valid_base_config): """Test that list values in sweeps raise validation error.""" @@ -254,11 +250,7 @@ def test_aiperf_config_sweep_invalid_value_type_list(self, valid_base_config): error_msg = str(exc_info.value) # Pydantic catches this during type validation assert "sweeps.concurrency" in error_msg - assert ( - "must be int or str" in error_msg - or "int_type" in error_msg - or "string_type" in error_msg - ) + assert "must be int or str" in error_msg or "int_type" in error_msg or "string_type" in error_msg def test_aiperf_config_sweep_empty_list(self, valid_base_config): """Test that empty sweep list raises validation error.""" @@ -302,9 +294,7 @@ def test_aiperf_config_multiple_invalid_sweep_keys(self, valid_base_config): def test_aiperf_config_get_output_base_path(self, valid_base_config): """Test get_output_base_path method.""" - config = AIPerfConfig( - output_base_dir="custom_results", base_config=valid_base_config - ) + config = 
AIPerfConfig(output_base_dir="custom_results", base_config=valid_base_config) path = config.get_output_base_path() assert isinstance(path, Path) assert str(path) == "custom_results" diff --git a/tests/benchmark/test_run_aiperf.py b/tests/benchmark/test_run_aiperf.py index 9c1045143..997d57d63 100644 --- a/tests/benchmark/test_run_aiperf.py +++ b/tests/benchmark/test_run_aiperf.py @@ -257,10 +257,7 @@ def test_sanitize_command_with_api_key(self): result = AIPerfRunner._sanitize_command_for_logging(cmd) # "secret-key-123" has 14 chars, so 8 asterisks + last 6 chars "ey-123" - assert result == ( - "aiperf profile --model test-model --api-key ********ey-123 " - "--url http://localhost:8000" - ) + assert result == ("aiperf profile --model test-model --api-key ********ey-123 --url http://localhost:8000") def test_sanitize_command_without_api_key(self): """Test sanitizing command without API key.""" @@ -311,9 +308,7 @@ def test_sanitize_command_multiple_api_keys(self): "second-key", ] result = AIPerfRunner._sanitize_command_for_logging(cmd) - assert result == ( - "aiperf profile --api-key ***st-key --model test-model --api-key ****nd-key" - ) + assert result == ("aiperf profile --api-key ***st-key --model test-model --api-key ****nd-key") def test_sanitize_command_preserves_other_values(self): """Test that other command values are preserved exactly.""" @@ -331,10 +326,7 @@ def test_sanitize_command_preserves_other_values(self): result = AIPerfRunner._sanitize_command_for_logging(cmd) # "my-secret-key" has 13 chars, so 7 asterisks + "et-key" (last 6 chars) - assert result == ( - "aiperf profile --api-key *******et-key --concurrency 10 " - "--benchmark-duration 60 --streaming" - ) + assert result == ("aiperf profile --api-key *******et-key --concurrency 10 --benchmark-duration 60 --streaming") def test_sanitize_command_short_api_key(self): """Test sanitizing command with API key shorter than or equal to 6 chars.""" @@ -404,13 +396,9 @@ def test_build_command_with_sweep_params(self, create_config_file, tmp_path): duration_idx = cmd.index("--benchmark-duration") assert cmd[duration_idx + 1] == "30" - def test_build_command_with_api_key_env_var( - self, create_config_file, tmp_path, monkeypatch - ): + def test_build_command_with_api_key_env_var(self, create_config_file, tmp_path, monkeypatch): """Test building command with API key from environment variable.""" - config_file = create_config_file( - extra_base_config={"api_key_env_var": "TEST_API_KEY"} - ) + config_file = create_config_file(extra_base_config={"api_key_env_var": "TEST_API_KEY"}) # Set the environment variable monkeypatch.setenv("TEST_API_KEY", "secret-key-123") @@ -423,13 +411,9 @@ def test_build_command_with_api_key_env_var( api_key_idx = cmd.index("--api-key") assert cmd[api_key_idx + 1] == "secret-key-123" - def test_build_command_with_missing_api_key_env_var( - self, create_config_file, tmp_path - ): + def test_build_command_with_missing_api_key_env_var(self, create_config_file, tmp_path): """Test building command when API key environment variable is not set.""" - config_file = create_config_file( - extra_base_config={"api_key_env_var": "MISSING_API_KEY"} - ) + config_file = create_config_file(extra_base_config={"api_key_env_var": "MISSING_API_KEY"}) runner = AIPerfRunner(config_file) output_dir = tmp_path / "output" @@ -511,9 +495,7 @@ def test_build_command_ui_type_non_debug(self, create_config_file, tmp_path): ui_type_idx = cmd.index("--ui-type") assert cmd[ui_type_idx + 1] == "none" - def 
test_build_command_with_list_in_sweep_params( - self, create_config_file, tmp_path - ): + def test_build_command_with_list_in_sweep_params(self, create_config_file, tmp_path): """Test building command when sweep params contain list values.""" config_file = create_config_file() @@ -790,9 +772,7 @@ def test_check_service_no_api_key_env_var(self, create_config_file): def test_check_service_api_key_env_var_not_set(self, create_config_file): """Test checking service when api_key_env_var is configured but env var doesn't exist.""" - config_file = create_config_file( - extra_base_config={"api_key_env_var": "NONEXISTENT_API_KEY"} - ) + config_file = create_config_file(extra_base_config={"api_key_env_var": "NONEXISTENT_API_KEY"}) runner = AIPerfRunner(config_file) @@ -811,9 +791,7 @@ def test_check_service_api_key_env_var_not_set(self, create_config_file): def test_check_service_api_key_env_var_set(self, create_config_file, monkeypatch): """Test checking service when api_key_env_var is configured and env var exists.""" - config_file = create_config_file( - extra_base_config={"api_key_env_var": "TEST_API_KEY"} - ) + config_file = create_config_file(extra_base_config={"api_key_env_var": "TEST_API_KEY"}) # Set the environment variable monkeypatch.setenv("TEST_API_KEY", "test-secret-key-123") @@ -832,9 +810,7 @@ def test_check_service_api_key_env_var_set(self, create_config_file, monkeypatch mock_get.assert_called_once() call_args = mock_get.call_args assert call_args[1]["headers"] is not None - assert ( - call_args[1]["headers"]["Authorization"] == "Bearer test-secret-key-123" - ) + assert call_args[1]["headers"]["Authorization"] == "Bearer test-secret-key-123" class TestGetBatchDir: @@ -906,18 +882,14 @@ def test_run_single_benchmark_failure(self, create_config_file, tmp_path): run_directory = tmp_path / "runs" # Mock subprocess.run to raise CalledProcessError - with patch( - "subprocess.run", side_effect=subprocess.CalledProcessError(1, "aiperf") - ): + with patch("subprocess.run", side_effect=subprocess.CalledProcessError(1, "aiperf")): summary = runner.run_single_benchmark(run_directory, dry_run=False) assert summary.total == 1 assert summary.completed == 0 assert summary.failed == 1 - def test_run_single_benchmark_keyboard_interrupt( - self, create_config_file, tmp_path - ): + def test_run_single_benchmark_keyboard_interrupt(self, create_config_file, tmp_path): """Test that KeyboardInterrupt is re-raised.""" config_file = create_config_file() @@ -1019,9 +991,7 @@ def test_run_batch_benchmarks_no_combinations(self, create_config_file, tmp_path with pytest.raises(RuntimeError, match="Can't generate sweep combinations"): runner.run_batch_benchmarks(run_directory, dry_run=False) - def test_run_batch_benchmarks_keyboard_interrupt( - self, create_config_file, tmp_path - ): + def test_run_batch_benchmarks_keyboard_interrupt(self, create_config_file, tmp_path): """Test that KeyboardInterrupt is re-raised in batch benchmarks.""" config_file = create_config_file(sweeps={"concurrency": [1, 2]}) @@ -1033,9 +1003,7 @@ def test_run_batch_benchmarks_keyboard_interrupt( with pytest.raises(KeyboardInterrupt): runner.run_batch_benchmarks(run_directory, dry_run=False) - def test_run_batch_benchmarks_non_zero_returncode( - self, create_config_file, tmp_path - ): + def test_run_batch_benchmarks_non_zero_returncode(self, create_config_file, tmp_path): """Test running batch benchmarks when subprocess returns non-zero but doesn't raise.""" config_file = create_config_file(sweeps={"concurrency": [1, 2]}) @@ -1111,9 
+1079,7 @@ def test_run_with_failures(self, create_config_file): # Mock _check_service and subprocess.run to fail with patch.object(runner, "_check_service"): - with patch( - "subprocess.run", side_effect=subprocess.CalledProcessError(1, "aiperf") - ): + with patch("subprocess.run", side_effect=subprocess.CalledProcessError(1, "aiperf")): exit_code = runner.run(dry_run=False) assert exit_code == 1 @@ -1123,9 +1089,7 @@ def test_run_service_check_failure(self, create_config_file): runner = AIPerfRunner(config_file) # Mock _check_service to raise error - with patch.object( - runner, "_check_service", side_effect=RuntimeError("Service unavailable") - ): + with patch.object(runner, "_check_service", side_effect=RuntimeError("Service unavailable")): with pytest.raises(RuntimeError, match="Service unavailable"): runner.run(dry_run=False) @@ -1164,9 +1128,7 @@ def test_cli_run_command_with_verbose(self, create_config_file): mock_runner.run.return_value = 0 mock_runner_class.return_value = mock_runner - result = runner.invoke( - app, ["--config-file", str(config_file), "--verbose"] - ) + result = runner.invoke(app, ["--config-file", str(config_file), "--verbose"]) assert result.exit_code == 0 mock_runner.run.assert_called_once_with(dry_run=False) @@ -1184,9 +1146,7 @@ def test_cli_run_command_with_dry_run(self, create_config_file): mock_runner.run.return_value = 0 mock_runner_class.return_value = mock_runner - result = runner.invoke( - app, ["--config-file", str(config_file), "--dry-run"] - ) + result = runner.invoke(app, ["--config-file", str(config_file), "--dry-run"]) assert result.exit_code == 0 mock_runner.run.assert_called_once_with(dry_run=True) From 9a89c0a4b213c6e3db976e6581a075a06ee24437 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 1 Dec 2025 11:38:13 -0600 Subject: [PATCH 29/30] Move aiperf under benchmark dir --- {aiperf => benchmark/aiperf}/README.md | 0 {aiperf => benchmark/aiperf}/__init__.py | 0 {aiperf => benchmark/aiperf}/__main__.py | 0 {aiperf => benchmark/aiperf}/aiperf_models.py | 0 {aiperf => benchmark/aiperf}/configs/single_concurrency.yaml | 0 {aiperf => benchmark/aiperf}/configs/sweep_concurrency.yaml | 0 {aiperf => benchmark/aiperf}/run_aiperf.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename {aiperf => benchmark/aiperf}/README.md (100%) rename {aiperf => benchmark/aiperf}/__init__.py (100%) rename {aiperf => benchmark/aiperf}/__main__.py (100%) rename {aiperf => benchmark/aiperf}/aiperf_models.py (100%) rename {aiperf => benchmark/aiperf}/configs/single_concurrency.yaml (100%) rename {aiperf => benchmark/aiperf}/configs/sweep_concurrency.yaml (100%) rename {aiperf => benchmark/aiperf}/run_aiperf.py (100%) diff --git a/aiperf/README.md b/benchmark/aiperf/README.md similarity index 100% rename from aiperf/README.md rename to benchmark/aiperf/README.md diff --git a/aiperf/__init__.py b/benchmark/aiperf/__init__.py similarity index 100% rename from aiperf/__init__.py rename to benchmark/aiperf/__init__.py diff --git a/aiperf/__main__.py b/benchmark/aiperf/__main__.py similarity index 100% rename from aiperf/__main__.py rename to benchmark/aiperf/__main__.py diff --git a/aiperf/aiperf_models.py b/benchmark/aiperf/aiperf_models.py similarity index 100% rename from aiperf/aiperf_models.py rename to benchmark/aiperf/aiperf_models.py diff --git a/aiperf/configs/single_concurrency.yaml b/benchmark/aiperf/configs/single_concurrency.yaml similarity index 100% rename from aiperf/configs/single_concurrency.yaml 
rename to benchmark/aiperf/configs/single_concurrency.yaml diff --git a/aiperf/configs/sweep_concurrency.yaml b/benchmark/aiperf/configs/sweep_concurrency.yaml similarity index 100% rename from aiperf/configs/sweep_concurrency.yaml rename to benchmark/aiperf/configs/sweep_concurrency.yaml diff --git a/aiperf/run_aiperf.py b/benchmark/aiperf/run_aiperf.py similarity index 100% rename from aiperf/run_aiperf.py rename to benchmark/aiperf/run_aiperf.py From 989984158f94dd76fde9df3577149ecb1a2e688e Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 1 Dec 2025 11:53:14 -0600 Subject: [PATCH 30/30] Move aiperf under benchmark dir --- benchmark/aiperf/README.md | 10 +++++----- benchmark/aiperf/__main__.py | 2 +- benchmark/aiperf/run_aiperf.py | 2 +- tests/benchmark/test_aiperf_models.py | 2 +- tests/benchmark/test_run_aiperf.py | 24 ++++++++++++------------ 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/benchmark/aiperf/README.md b/benchmark/aiperf/README.md index 84c58193a..9a3608927 100644 --- a/benchmark/aiperf/README.md +++ b/benchmark/aiperf/README.md @@ -65,7 +65,7 @@ There are two example configs included which can be extended for your use-cases. To run a benchmark, use the following command: ```bash -$ python -m aiperf --config-file +$ python -m benchmark.aiperf --config-file ``` ### Running a Single Benchmark @@ -73,7 +73,7 @@ $ python -m aiperf --config-file To run a single benchmark with fixed parameters, use the `single_concurrency.yaml` configuration: ```bash -$ python -m aiperf --config-file aiperf/configs/single_concurrency.yaml +$ python -m benchmark.aiperf --config-file aiperf/configs/single_concurrency.yaml ``` **Example output:** @@ -97,7 +97,7 @@ $ python -m aiperf --config-file aiperf/configs/single_concurrency.yaml To run multiple benchmarks with different concurrency levels, use the `sweep_concurrency.yaml` configuration as below: ```bash -$ python -m aiperf --config-file aiperf/configs/sweep_concurrency.yaml +$ python -m benchmark.aiperf --config-file aiperf/configs/sweep_concurrency.yaml ``` **Example output:** @@ -134,7 +134,7 @@ The `--dry-run` option allows you to preview all benchmark commands without exec - Debugging configuration issues ```bash -$ python -m aiperf --config-file aiperf/configs/sweep_concurrency.yaml --dry-run +$ python -m benchmark.aiperf --config-file aiperf/configs/sweep_concurrency.yaml --dry-run ``` When in dry-run mode, the script will: @@ -150,7 +150,7 @@ When in dry-run mode, the script will: The `--verbose` option outputs more detailed debugging information to understand each step of the benchmarking process. 
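A hedged sketch of the dry-run behaviour described above (the exact implementation is not shown in this excerpt, so the function and names below are illustrative): the runner logs the would-be command instead of spawning the subprocess.

```python
import logging
import subprocess
from typing import List

log = logging.getLogger(__name__)


def run_or_preview(cmd: List[str], dry_run: bool = False) -> int:
    """Execute the aiperf command, or only log it when dry_run is requested."""
    if dry_run:
        log.info("DRY RUN, would execute: %s", " ".join(cmd))
        return 0
    # check=True raises CalledProcessError on failure, which the runner
    # records as a failed run in its summary.
    result = subprocess.run(cmd, check=True)
    return result.returncode
```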
```bash -$ python -m aiperf --config-file --verbose +$ python -m benchmark.aiperf --config-file --verbose ``` Verbose mode provides: diff --git a/benchmark/aiperf/__main__.py b/benchmark/aiperf/__main__.py index 90dc5d8f9..8bb221f4a 100644 --- a/benchmark/aiperf/__main__.py +++ b/benchmark/aiperf/__main__.py @@ -19,7 +19,7 @@ """Entry point for running aiperf as a module: python -m aiperf""" -from aiperf.run_aiperf import app +from benchmark.aiperf.run_aiperf import app if __name__ == "__main__": app() diff --git a/benchmark/aiperf/run_aiperf.py b/benchmark/aiperf/run_aiperf.py index 3470f1e3e..614e2aadf 100755 --- a/benchmark/aiperf/run_aiperf.py +++ b/benchmark/aiperf/run_aiperf.py @@ -32,7 +32,7 @@ import yaml from pydantic import ValidationError -from aiperf.aiperf_models import AIPerfConfig +from benchmark.aiperf.aiperf_models import AIPerfConfig log = logging.getLogger(__name__) log.setLevel(logging.INFO) diff --git a/tests/benchmark/test_aiperf_models.py b/tests/benchmark/test_aiperf_models.py index b95e5095a..70973e6b6 100644 --- a/tests/benchmark/test_aiperf_models.py +++ b/tests/benchmark/test_aiperf_models.py @@ -22,7 +22,7 @@ import pytest from pydantic import ValidationError -from aiperf.aiperf_models import AIPerfConfig, BaseConfig +from benchmark.aiperf.aiperf_models import AIPerfConfig, BaseConfig class TestBaseConfig: diff --git a/tests/benchmark/test_run_aiperf.py b/tests/benchmark/test_run_aiperf.py index 997d57d63..3ee4f1f5b 100644 --- a/tests/benchmark/test_run_aiperf.py +++ b/tests/benchmark/test_run_aiperf.py @@ -29,8 +29,8 @@ import yaml from typer.testing import CliRunner -from aiperf.aiperf_models import AIPerfConfig, BaseConfig -from aiperf.run_aiperf import AIPerfRunner, AIPerfSummary +from benchmark.aiperf.aiperf_models import AIPerfConfig, BaseConfig +from benchmark.aiperf.run_aiperf import AIPerfRunner, AIPerfSummary @pytest.fixture @@ -473,7 +473,7 @@ def test_build_command_ui_type_debug(self, create_config_file, tmp_path): output_dir = tmp_path / "output" # Patch log.level to be DEBUG - with patch("aiperf.run_aiperf.log.level", logging.DEBUG): + with patch("benchmark.aiperf.run_aiperf.log.level", logging.DEBUG): cmd = runner._build_command(None, output_dir) assert "--ui-type" in cmd @@ -488,7 +488,7 @@ def test_build_command_ui_type_non_debug(self, create_config_file, tmp_path): output_dir = tmp_path / "output" # Patch log.level to be INFO - with patch("aiperf.run_aiperf.log.level", logging.INFO): + with patch("benchmark.aiperf.run_aiperf.log.level", logging.INFO): cmd = runner._build_command(None, output_dir) assert "--ui-type" in cmd @@ -1102,10 +1102,10 @@ def test_cli_run_command_basic(self, create_config_file): config_file = create_config_file() runner = CliRunner() - from aiperf.run_aiperf import app + from benchmark.aiperf.run_aiperf import app # Mock the runner and service check - with patch("aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: + with patch("benchmark.aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: mock_runner = Mock() mock_runner.run.return_value = 0 mock_runner_class.return_value = mock_runner @@ -1120,10 +1120,10 @@ def test_cli_run_command_with_verbose(self, create_config_file): config_file = create_config_file() runner = CliRunner() - from aiperf.run_aiperf import app + from benchmark.aiperf.run_aiperf import app # Mock the runner and service check - with patch("aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: + with patch("benchmark.aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: mock_runner = Mock() 
mock_runner.run.return_value = 0 mock_runner_class.return_value = mock_runner @@ -1138,10 +1138,10 @@ def test_cli_run_command_with_dry_run(self, create_config_file): config_file = create_config_file() runner = CliRunner() - from aiperf.run_aiperf import app + from benchmark.aiperf.run_aiperf import app # Mock the runner and service check - with patch("aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: + with patch("benchmark.aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: mock_runner = Mock() mock_runner.run.return_value = 0 mock_runner_class.return_value = mock_runner @@ -1156,10 +1156,10 @@ def test_cli_run_command_with_failure(self, create_config_file): config_file = create_config_file() runner = CliRunner() - from aiperf.run_aiperf import app + from benchmark.aiperf.run_aiperf import app # Mock the runner to return failure - with patch("aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: + with patch("benchmark.aiperf.run_aiperf.AIPerfRunner") as mock_runner_class: mock_runner = Mock() mock_runner.run.return_value = 1 # Failure mock_runner_class.return_value = mock_runner
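Taken together, the CLI tests above pin down a small Typer surface: one command that accepts `--config-file`, `--verbose`, and `--dry-run`, builds an `AIPerfRunner`, and exits with the runner's return code. A self-contained sketch under those assumptions follows; the option help text and the exact `--verbose` handling are guesses, flagged in comments.

```python
import logging

import typer

from benchmark.aiperf.run_aiperf import AIPerfRunner

app = typer.Typer()


@app.command()
def run(
    config_file: str = typer.Option(..., "--config-file", help="Path to the YAML benchmark config"),
    verbose: bool = typer.Option(False, "--verbose", help="Enable debug logging"),
    dry_run: bool = typer.Option(False, "--dry-run", help="Preview commands without executing"),
) -> None:
    """Run the benchmarks described by the configuration file (illustrative sketch)."""
    if verbose:
        # Assumption: --verbose lowers the module logger to DEBUG; the patch's
        # exact handling is not visible in this excerpt.
        logging.getLogger("benchmark.aiperf.run_aiperf").setLevel(logging.DEBUG)

    runner = AIPerfRunner(config_file)
    exit_code = runner.run(dry_run=dry_run)
    raise typer.Exit(code=exit_code)
```

With a single registered command, Typer runs it by default, so `python -m benchmark.aiperf --config-file <config.yaml>` matches the README commands updated in this patch.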