From d24dacf68e9607e81c258a21872ab9d1539c8743 Mon Sep 17 00:00:00 2001
From: Elias Bermudez
Date: Fri, 31 Oct 2025 16:14:01 -0700
Subject: [PATCH 1/2] Initial automation for cli options

---
 .pre-commit-config.yaml          |   6 +
 Makefile                         |  12 +-
 docs/cli_options.md              | 563 +++++++++++++++++++++----------
 tools/generate_cli_options_md.py | 197 +++++++++++
 4 files changed, 607 insertions(+), 171 deletions(-)
 create mode 100755 tools/generate_cli_options_md.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ccff7e889..ad5960766 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -42,6 +42,12 @@ repos:
         types: [python]
         pass_filenames: false
         args: []
+      - id: update-cli-docs
+        name: update-cli-docs
+        entry: python tools/generate_cli_options_md.py
+        language: python
+        pass_filenames: false
+        files: ^(src/aiperf/cli.*\.py|src/aiperf/common/config/.*\.py|docs/cli_options\.md)$
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.11.8
     hooks:

diff --git a/Makefile b/Makefile
index 38e7d6d19..841e194c4 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,8 @@
 test-verbose init-files setup-venv setup-mkinit install-mock-server \
 integration-tests integration-tests-ci integration-tests-verbose \
 test-integration test-integration-ci test-integration-verbose \
- test-stress stress-tests internal-help help
+ test-stress stress-tests internal-help help \
+ update-cli-docs check-cli-docs
 
 # Include user-defined environment variables
@@ -95,6 +96,15 @@ internal-help:
 init-files: #? run mkinit to generate the __init__.py files.
 	$(activate_venv) && tools/generate_init_files.sh
 
+update-cli-docs: #? regenerate docs/cli_options.md from aiperf profile --help.
+	@printf "$(bold)$(green)Regenerating CLI options documentation...$(reset)\n"
+	$(activate_venv) && python tools/generate_cli_options_md.py
+	@printf "$(bold)$(green)Done! docs/cli_options.md has been updated.$(reset)\n"
+
+check-cli-docs: #? check if docs/cli_options.md is in sync with aiperf profile --help.
+	@printf "$(bold)$(blue)Checking if CLI options documentation is up to date...$(reset)\n"
+	$(activate_venv) && python tools/generate_cli_options_md.py --check
+
 ruff lint: #? run the ruff linters
 	$(activate_venv) && ruff check . $(args)

diff --git a/docs/cli_options.md b/docs/cli_options.md
index 238e0e091..7705ec904 100644
--- a/docs/cli_options.md
+++ b/docs/cli_options.md
@@ -1,5 +1,5 @@
@@ -7,173 +7,396 @@ SPDX-License-Identifier: Apache-2.0
 Use these options to profile with AIPerf.
 
 ```
-╭─ Endpoint ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ * MODEL-NAMES --model-names --model -m                Model name(s) to be benchmarked. Can be a comma-separated list or a single model name. [required] │
-│   MODEL-SELECTION-STRATEGY --model-selection-strategy When multiple models are specified, this is how a specific model should be assigned to a prompt. round_robin: nth prompt │
-│                                                       in the list gets assigned to n-mod len(models). random: assignment is uniformly random [choices: round-robin, random] │
-│                                                       [default: round-robin] │
-│   CUSTOM-ENDPOINT --custom-endpoint --endpoint        Set a custom endpoint that differs from the OpenAI defaults. │
-│   ENDPOINT-TYPE --endpoint-type                       The endpoint type to send requests to on the server. [choices: chat, completions, embeddings, rankings] │
-│                                                       [default: chat] │
-│   STREAMING --streaming                               An option to enable the use of the streaming API.
[default: False] │ -│ URL --url -u URL of the endpoint to target for benchmarking. [default: localhost:8000] │ -│ REQUEST-TIMEOUT-SECONDS --request-timeout-seconds The timeout in floating-point seconds for each request to the endpoint. [default: 600.0] │ -│ API-KEY --api-key The API key to use for the endpoint. If provided, it will be sent with every request as a header: Authorization: Bearer │ -│ . │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Input ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ EXTRA-INPUTS --extra-inputs Provide additional inputs to include with every request. Inputs should be in an 'input_name:value' │ -│ format. Alternatively, a string representing a json formatted dict can be provided. [default: []] │ -│ HEADER --header -H Adds a custom header to the requests. Headers must be specified as 'Header:Value' pairs. Alternatively, │ -│ a string representing a json formatted dict can be provided. [default: []] │ -│ INPUT-FILE --input-file The file or directory path that contains the dataset to use for profiling. This parameter is used in │ -│ conjunction with the custom_dataset_type parameter to support different types of user provided │ -│ datasets. │ -│ FIXED-SCHEDULE --fixed-schedule Specifies to run a fixed schedule of requests. This is normally inferred from the --input-file │ -│ parameter, but can be set manually here. [default: False] │ -│ FIXED-SCHEDULE-AUTO-OFFSET --fixed-schedule-auto-offset Specifies to automatically offset the timestamps in the fixed schedule, such that the first timestamp │ -│ is considered 0, and the rest are shifted accordingly. If disabled, the timestamps will be assumed to │ -│ be relative to 0. [default: False] │ -│ FIXED-SCHEDULE-START-OFFSET --fixed-schedule-start-offset Specifies the offset in milliseconds to start the fixed schedule at. By default, the schedule starts at │ -│ 0, but this option can be used to start at a reference point further in the schedule. This option │ -│ cannot be used in conjunction with the --fixed-schedule-auto-offset. The schedule will include any │ -│ requests at the start offset. │ -│ FIXED-SCHEDULE-END-OFFSET --fixed-schedule-end-offset Specifies the offset in milliseconds to end the fixed schedule at. By default, the schedule ends at the │ -│ last timestamp in the trace dataset, but this option can be used to only run a subset of the trace. The │ -│ schedule will include any requests at the end offset. │ -│ PUBLIC-DATASET --public-dataset The public dataset to use for the requests. [choices: sharegpt] │ -│ CUSTOM-DATASET-TYPE --custom-dataset-type The type of custom dataset to use. This parameter is used in conjunction with the --input-file │ -│ parameter. [choices: single_turn, multi_turn, random_pool, mooncake_trace] │ -│ DATASET-SAMPLING-STRATEGY --dataset-sampling-strategy The strategy to use for sampling the dataset. sequential: Iterate through the dataset sequentially, │ -│ then wrap around to the beginning. random: Randomly select a conversation from the dataset. Will │ -│ randomly sample with replacement. shuffle: Shuffle the dataset and iterate through it. Will randomly │ -│ sample without replacement. Once the end of the dataset is reached, shuffle the dataset again and start │ -│ over. 
[choices: sequential, random, shuffle] │ -│ RANDOM-SEED --random-seed The seed used to generate random values. Set to some value to make the synthetic data generation │ -│ deterministic. It will use system default if not provided. │ -│ GOODPUT --goodput Specify service level objectives (SLOs) for goodput as space-separated 'KEY:VALUE' pairs, where KEY is │ -│ a metric tag and VALUE is a number in the metric’s display unit (falls back to its base unit if no │ -│ display unit is defined). Examples: 'request_latency:250' (ms), 'inter_token_latency:10' (ms), │ -│ output_token_throughput_per_user:600 (tokens/s). Only metrics applicable to the current endpoint/config │ -│ are considered. For more context on the definition of goodput, refer to DistServe paper: │ -│ https://arxiv.org/pdf/2401.09670 and the blog: https://hao-ai-lab.github.io/blogs/distserve │ -╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Output ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ OUTPUT-ARTIFACT-DIR --output-artifact-dir --artifact-dir The directory to store all the (output) artifacts generated by AIPerf. [default: artifacts] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Tokenizer ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ TOKENIZER --tokenizer The Hugging Face tokenizer to use to interpret token metrics from prompts and responses. The value can be the name of a │ -│ tokenizer or the filepath of the tokenizer. The default value is the model name. │ -│ TOKENIZER-REVISION --tokenizer-revision The specific model version to use. It can be a branch name, tag name, or commit ID. [default: main] │ -│ TOKENIZER-TRUST-REMOTE-CODE --tokenizer-trust-remote-code Allows custom tokenizer to be downloaded and executed. This carries security risks and should only be used for │ -│ repositories you trust. This is only necessary for custom tokenizers stored in Hugging Face Hub. [default: False] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Load Generator ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ BENCHMARK-DURATION --benchmark-duration The duration in seconds for benchmarking. │ -│ BENCHMARK-GRACE-PERIOD --benchmark-grace-period The grace period in seconds to wait for responses after benchmark duration ends. Only applies when │ -│ --benchmark-duration is set. Responses received within this period are included in metrics. [default: 30.0] │ -│ CONCURRENCY --concurrency The concurrency value to benchmark. │ -│ REQUEST-RATE --request-rate Sets the request rate for the load generated by AIPerf. Unit: requests/second │ -│ REQUEST-RATE-MODE --request-rate-mode Sets the request rate mode for the load generated by AIPerf. Valid values: constant, poisson. constant: Generate │ -│ requests at a fixed rate. 
poisson: Generate requests using a poisson distribution. [default: poisson] │ -│ REQUEST-COUNT --request-count --num-requests The number of requests to use for measurement. [default: 10] │ -│ WARMUP-REQUEST-COUNT --warmup-request-count --num-warmup-requests The number of warmup requests to send before benchmarking. [default: 0] │ -│ REQUEST-CANCELLATION-RATE --request-cancellation-rate The percentage of requests to cancel. [default: 0.0] │ -│ REQUEST-CANCELLATION-DELAY --request-cancellation-delay The delay in seconds before cancelling requests. This is used when --request-cancellation-rate is greater than 0. │ -│ [default: 0.0] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Conversation Input ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ CONVERSATION-NUM --conversation-num --num-conversations --num-sessions The total number of unique conversations to generate. Each conversation represents a single request session between client and server. Supported on synthetic mode and │ -│ the custom random_pool dataset. The number of conversations will be used to determine the number of entries in both the custom random_pool and synthetic datasets and │ -│ will be reused until benchmarking is complete. │ -│ NUM-DATASET-ENTRIES --num-dataset-entries --num-prompts The total number of unique dataset entries to generate for the dataset. Each entry represents a single turn used in a request. [default: 100] │ -│ CONVERSATION-TURN-MEAN --conversation-turn-mean --session-turns-mean The mean number of turns within a conversation. [default: 1] │ -│ CONVERSATION-TURN-STDDEV --conversation-turn-stddev --session-turns-stddev The standard deviation of the number of turns within a conversation. [default: 0] │ -│ CONVERSATION-TURN-DELAY-MEAN --conversation-turn-delay-mean --session-turn-delay-mean The mean delay between turns within a conversation in milliseconds. [default: 0.0] │ -│ CONVERSATION-TURN-DELAY-STDDEV --conversation-turn-delay-stddev --session-turn-delay-stddev The standard deviation of the delay between turns within a conversation in milliseconds. [default: 0.0] │ -│ CONVERSATION-TURN-DELAY-RATIO --conversation-turn-delay-ratio --session-delay-ratio A ratio to scale multi-turn delays. [default: 1.0] │ -╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Input Sequence Length (ISL) ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-INPUT-TOKENS-MEAN --prompt-input-tokens-mean The mean of number of tokens in the generated prompts when using synthetic data. [default: 550] │ -│ --synthetic-input-tokens-mean --isl │ -│ PROMPT-INPUT-TOKENS-STDDEV --prompt-input-tokens-stddev The standard deviation of number of tokens in the generated prompts when using synthetic data. 
[default: 0.0] │ -│ --synthetic-input-tokens-stddev --isl-stddev │ -│ PROMPT-INPUT-TOKENS-BLOCK-SIZE --prompt-input-tokens-block-size The block size of the prompt. [default: 512] │ -│ --synthetic-input-tokens-block-size --isl-block-size │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Output Sequence Length (OSL) ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-OUTPUT-TOKENS-MEAN --prompt-output-tokens-mean The mean number of tokens in each output. │ -│ --output-tokens-mean --osl │ -│ PROMPT-OUTPUT-TOKENS-STDDEV --prompt-output-tokens-stddev The standard deviation of the number of tokens in each output. [default: 0] │ -│ --output-tokens-stddev --osl-stddev │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Prompt ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-BATCH-SIZE --prompt-batch-size --batch-size-text -b The batch size of text requests AIPerf should send. This is currently supported with the embeddings and rankings │ -│ --batch-size endpoint types [default: 1] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Prefix Prompt ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ PROMPT-PREFIX-POOL-SIZE --prompt-prefix-pool-size The total size of the prefix prompt pool to select prefixes from. If this value is not zero, these are prompts that are prepended │ -│ --prefix-prompt-pool-size --num-prefix-prompts to input prompts. This is useful for benchmarking models that use a K-V cache. [default: 0] │ -│ PROMPT-PREFIX-LENGTH --prompt-prefix-length The number of tokens in each prefix prompt. This is only used if "num" is greater than zero. Note that due to the prefix and user │ -│ --prefix-prompt-length prompts being concatenated, the number of tokens in the final prompt may be off by one. [default: 0] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Audio Input ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ AUDIO-BATCH-SIZE --audio-batch-size --batch-size-audio The batch size of audio requests AIPerf should send. This is currently supported with the OpenAI chat endpoint type [default: │ -│ 1] │ -│ AUDIO-LENGTH-MEAN --audio-length-mean The mean length of the audio in seconds. [default: 0.0] │ -│ AUDIO-LENGTH-STDDEV --audio-length-stddev The standard deviation of the length of the audio in seconds. [default: 0.0] │ -│ AUDIO-FORMAT --audio-format The format of the audio files (wav or mp3). 
[choices: wav, mp3] [default: wav] │ -│ AUDIO-DEPTHS --audio-depths A list of audio bit depths to randomly select from in bits. [default: [16]] │ -│ AUDIO-SAMPLE-RATES --audio-sample-rates A list of audio sample rates to randomly select from in kHz. Common sample rates are 16, 44.1, 48, 96, etc. [default: [16.0]] │ -│ AUDIO-NUM-CHANNELS --audio-num-channels The number of audio channels to use for the audio data generation. [default: 1] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Image Input ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ IMAGE-WIDTH-MEAN --image-width-mean The mean width of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-WIDTH-STDDEV --image-width-stddev The standard deviation of width of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-HEIGHT-MEAN --image-height-mean The mean height of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-HEIGHT-STDDEV --image-height-stddev The standard deviation of height of images when generating synthetic image data. [default: 0.0] │ -│ IMAGE-BATCH-SIZE --image-batch-size --batch-size-image The image batch size of the requests AIPerf should send. [default: 1] │ -│ IMAGE-FORMAT --image-format The compression format of the images. [choices: png, jpeg, random] [default: png] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Service ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ LOG-LEVEL --log-level Logging level [choices: trace, debug, info, notice, warning, success, error, critical] [default: info] │ -│ VERBOSE --verbose -v Equivalent to --log-level DEBUG. Enables more verbose logging output, but lacks some raw message logging. │ -│ [default: False] │ -│ EXTRA-VERBOSE --extra-verbose -vv Equivalent to --log-level TRACE. Enables the most verbose logging output possible. [default: False] │ -│ RECORD-PROCESSOR-SERVICE-COUNT --record-processor-service-count Number of services to spawn for processing records. The higher the request rate, the more services should be │ -│ --record-processors spawned in order to keep up with the incoming records. If not specified, the number of services will be │ -│ automatically determined based on the worker count. │ -│ UI-TYPE --ui-type --ui Type of UI to use [choices: dashboard, simple, none] [default: dashboard] │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Workers ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ WORKERS-MAX --workers-max --max-workers Maximum number of workers to create. If not specified, the number of workers will be determined by the formula │ -│ min(concurrency, (num CPUs * 0.75) - 1), with a default max cap of 32. 
Any value provided will still be capped by the │ -│ concurrency value (if specified), but not by the max cap. │ -╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Endpoint ───────────────────────────────────────────────────────────────────╮ +│ * MODEL-NAMES --model-names -m Model name(s) to be benchmarked. Can be │ +│ --model a comma-separated list or a single │ +│ model name. [required] │ +│ MODEL-SELECTION-STRATEGY When multiple models are specified, │ +│ --model-selection-strategy this is how a specific model should be │ +│ assigned to a prompt. round_robin: nth │ +│ prompt in the list gets assigned to │ +│ n-mod len(models). random: assignment │ +│ is uniformly random [choices: │ +│ round-robin, random] [default: │ +│ round-robin] │ +│ CUSTOM-ENDPOINT Set a custom endpoint that differs from │ +│ --custom-endpoint the OpenAI defaults. │ +│ --endpoint │ +│ ENDPOINT-TYPE The endpoint type to send requests to │ +│ --endpoint-type on the server. [choices: chat, │ +│ completions, cohere-rankings, │ +│ embeddings, hf-tei-rankings, │ +│ huggingface-generate, nim-rankings, │ +│ solido-rag, template] [default: chat] │ +│ STREAMING --streaming An option to enable the use of the │ +│ streaming API. [default: False] │ +│ URL --url -u URL of the endpoint to target for │ +│ benchmarking. [default: localhost:8000] │ +│ REQUEST-TIMEOUT-SECONDS The timeout in floating-point seconds │ +│ --request-timeout-seconds for each request to the endpoint. │ +│ [default: 600.0] │ +│ API-KEY --api-key The API key to use for the endpoint. If │ +│ provided, it will be sent with every │ +│ request as a header: Authorization: │ +│ Bearer . │ +│ TRANSPORT --transport The transport to use for the endpoint. │ +│ --transport-type If not provided, it will be │ +│ auto-detected from the URL.This can │ +│ also be used to force an alternative │ +│ transport or implementation. [choices: │ +│ http] │ +╰──────────────────────────────────────────────────────────────────────────────╯ ``` +``` +╭─ Input ──────────────────────────────────────────────────────────────────────╮ +│ EXTRA-INPUTS --extra-inputs Provide additional inputs to include with │ +│ every request. Inputs should be in an │ +│ 'input_name:value' format. Alternatively, │ +│ a string representing a json formatted │ +│ dict can be provided. [default: []] │ +│ HEADER --header -H Adds a custom header to the requests. │ +│ Headers must be specified as │ +│ 'Header:Value' pairs. Alternatively, a │ +│ string representing a json formatted dict │ +│ can be provided. [default: []] │ +│ INPUT-FILE --input-file The file or directory path that contains │ +│ the dataset to use for profiling. This │ +│ parameter is used in conjunction with the │ +│ custom_dataset_type parameter to support │ +│ different types of user provided datasets. │ +│ FIXED-SCHEDULE Specifies to run a fixed schedule of │ +│ --fixed-schedule requests. This is normally inferred from │ +│ the --input-file parameter, but can be set │ +│ manually here. [default: False] │ +│ FIXED-SCHEDULE-AUTO-OFFSET Specifies to automatically offset the │ +│ --fixed-schedule-auto-offs timestamps in the fixed schedule, such │ +│ et that the first timestamp is considered 0, │ +│ and the rest are shifted accordingly. If │ +│ disabled, the timestamps will be assumed │ +│ to be relative to 0. 
[default: False] │ +│ FIXED-SCHEDULE-START-OFFSET Specifies the offset in milliseconds to │ +│ --fixed-schedule-start-off start the fixed schedule at. By default, │ +│ set the schedule starts at 0, but this option │ +│ can be used to start at a reference point │ +│ further in the schedule. This option │ +│ cannot be used in conjunction with the │ +│ --fixed-schedule-auto-offset. The schedule │ +│ will include any requests at the start │ +│ offset. │ +│ FIXED-SCHEDULE-END-OFFSET Specifies the offset in milliseconds to │ +│ --fixed-schedule-end-offse end the fixed schedule at. By default, the │ +│ t schedule ends at the last timestamp in the │ +│ trace dataset, but this option can be used │ +│ to only run a subset of the trace. The │ +│ schedule will include any requests at the │ +│ end offset. │ +│ PUBLIC-DATASET The public dataset to use for the │ +│ --public-dataset requests. [choices: sharegpt] │ +│ CUSTOM-DATASET-TYPE The type of custom dataset to use. This │ +│ --custom-dataset-type parameter is used in conjunction with the │ +│ --input-file parameter. [choices: │ +│ single_turn, multi_turn, random_pool, │ +│ mooncake_trace] │ +│ DATASET-SAMPLING-STRATEGY The strategy to use for sampling the │ +│ --dataset-sampling-strateg dataset. sequential: Iterate through the │ +│ y dataset sequentially, then wrap around to │ +│ the beginning. random: Randomly select a │ +│ conversation from the dataset. Will │ +│ randomly sample with replacement. shuffle: │ +│ Shuffle the dataset and iterate through │ +│ it. Will randomly sample without │ +│ replacement. Once the end of the dataset │ +│ is reached, shuffle the dataset again and │ +│ start over. [choices: sequential, random, │ +│ shuffle] │ +│ RANDOM-SEED --random-seed The seed used to generate random values. │ +│ Set to some value to make the synthetic │ +│ data generation deterministic. It will use │ +│ system default if not provided. │ +│ GOODPUT --goodput Specify service level objectives (SLOs) │ +│ for goodput as space-separated 'KEY:VALUE' │ +│ pairs, where KEY is a metric tag and VALUE │ +│ is a number in the metric’s display unit │ +│ (falls back to its base unit if no display │ +│ unit is defined). Examples: │ +│ 'request_latency:250' (ms), │ +│ 'inter_token_latency:10' (ms), │ +│ output_token_throughput_per_user:600 │ +│ (tokens/s). Only metrics applicable to the │ +│ current endpoint/config are considered. │ +│ For more context on the definition of │ +│ goodput, refer to DistServe paper: │ +│ https://arxiv.org/pdf/2401.09670 and the │ +│ blog: │ +│ https://hao-ai-lab.github.io/blogs/distser │ +│ ve │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Output ─────────────────────────────────────────────────────────────────────╮ +│ OUTPUT-ARTIFACT-DIR The directory to store all the (output) │ +│ --output-artifact-dir artifacts generated by AIPerf. [default: │ +│ --artifact-dir artifacts] │ +│ PROFILE-EXPORT-PREFIX The prefix for the profile export file names. │ +│ --profile-export-prefix Will be suffixed with .csv, .json, .jsonl, and │ +│ --profile-export-file _raw.jsonl.If not provided, the default profile │ +│ export file names will be used: │ +│ profile_export_aiperf.csv, │ +│ profile_export_aiperf.json, │ +│ profile_export.jsonl, and │ +│ profile_export_raw.jsonl. │ +│ EXPORT-LEVEL --export-level The level of profile export files to create. 
│ +│ --profile-export-level [choices: summary, records, raw] [default: │ +│ records] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Tokenizer ──────────────────────────────────────────────────────────────────╮ +│ TOKENIZER --tokenizer The HuggingFace tokenizer to use to interpret │ +│ token metrics from prompts and responses. The │ +│ value can be the name of a tokenizer or the │ +│ filepath of the tokenizer. The default value │ +│ is the model name. │ +│ TOKENIZER-REVISION The specific model version to use. It can be a │ +│ --tokenizer-revision branch name, tag name, or commit ID. [default: │ +│ main] │ +│ TOKENIZER-TRUST-REMOTE-CODE Allows custom tokenizer to be downloaded and │ +│ --tokenizer-trust-remote-c executed. This carries security risks and │ +│ ode should only be used for repositories you │ +│ trust. This is only necessary for custom │ +│ tokenizers stored in HuggingFace Hub. │ +│ [default: False] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Load Generator ─────────────────────────────────────────────────────────────╮ +│ BENCHMARK-DURATION The duration in seconds for benchmarking. │ +│ --benchmark-duration │ +│ BENCHMARK-GRACE-PERIOD The grace period in seconds to wait for │ +│ --benchmark-grace-period responses after benchmark duration ends. Only │ +│ applies when --benchmark-duration is set. │ +│ Responses received within this period are │ +│ included in metrics. [default: 30.0] │ +│ CONCURRENCY --concurrency The concurrency value to benchmark. │ +│ REQUEST-RATE --request-rate Sets the request rate for the load generated │ +│ by AIPerf. Unit: requests/second │ +│ REQUEST-RATE-MODE Sets the request rate mode for the load │ +│ --request-rate-mode generated by AIPerf. Valid values: constant, │ +│ poisson. constant: Generate requests at a │ +│ fixed rate. poisson: Generate requests using a │ +│ poisson distribution. [default: poisson] │ +│ REQUEST-COUNT The number of requests to use for measurement. │ +│ --request-count [default: 10] │ +│ --num-requests │ +│ WARMUP-REQUEST-COUNT The number of warmup requests to send before │ +│ --warmup-request-count benchmarking. [default: 0] │ +│ --num-warmup-requests │ +│ REQUEST-CANCELLATION-RATE The percentage of requests to cancel. │ +│ --request-cancellation-rat [default: 0.0] │ +│ e │ +│ REQUEST-CANCELLATION-DELAY The delay in seconds before cancelling │ +│ --request-cancellation-del requests. This is used when │ +│ ay --request-cancellation-rate is greater than 0. │ +│ [default: 0.0] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Conversation Input ─────────────────────────────────────────────────────────╮ +│ CONVERSATION-NUM The total number of unique conversations to │ +│ --conversation-num generate. Each conversation represents a │ +│ --num-conversations single request session between client and │ +│ --num-sessions server. Supported on synthetic mode and the │ +│ custom random_pool dataset. The number of │ +│ conversations will be used to determine the │ +│ number of entries in both the custom │ +│ random_pool and synthetic datasets and will be │ +│ reused until benchmarking is complete. │ +│ NUM-DATASET-ENTRIES The total number of unique dataset entries to │ +│ --num-dataset-entries generate for the dataset. Each entry │ +│ --num-prompts represents a single turn used in a request. 
│ +│ [default: 100] │ +│ CONVERSATION-TURN-MEAN The mean number of turns within a │ +│ --conversation-turn-mean conversation. [default: 1] │ +│ --session-turns-mean │ +│ CONVERSATION-TURN-STDDEV The standard deviation of the number of turns │ +│ --conversation-turn-stddev within a conversation. [default: 0] │ +│ --session-turns-stddev │ +│ CONVERSATION-TURN-DELAY-MEAN The mean delay between turns within a │ +│ --conversation-turn-delay- conversation in milliseconds. [default: 0.0] │ +│ mean │ +│ --session-turn-delay-mean │ +│ CONVERSATION-TURN-DELAY-STDD The standard deviation of the delay between │ +│ EV --conversation-turn-del turns within a conversation in milliseconds. │ +│ ay-stddev --session-turn-d [default: 0.0] │ +│ elay-stddev │ +│ CONVERSATION-TURN-DELAY-RATI A ratio to scale multi-turn delays. [default: │ +│ O --conversation-turn-dela 1.0] │ +│ y-ratio │ +│ --session-delay-ratio │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Input Sequence Length (ISL) ────────────────────────────────────────────────╮ +│ PROMPT-INPUT-TOKENS-MEAN The mean of number of tokens in the generated │ +│ --prompt-input-tokens-mean prompts when using synthetic data. [default: │ +│ --synthetic-input-tokens-m 550] │ +│ ean --isl │ +│ PROMPT-INPUT-TOKENS-STDDEV The standard deviation of number of tokens in │ +│ --prompt-input-tokens-stdd the generated prompts when using synthetic │ +│ ev --synthetic-input-token data. [default: 0.0] │ +│ s-stddev --isl-stddev │ +│ PROMPT-INPUT-TOKENS-BLOCK-SI The block size of the prompt. [default: 512] │ +│ ZE --prompt-input-tokens-b │ +│ lock-size --synthetic-inpu │ +│ t-tokens-block-size │ +│ --isl-block-size │ +│ SEQ-DIST --seq-dist Sequence length distribution specification for │ +│ --sequence-distribution varying ISL/OSL pairs │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Output Sequence Length (OSL) ───────────────────────────────────────────────╮ +│ PROMPT-OUTPUT-TOKENS-MEAN The mean number of tokens in each output. │ +│ --prompt-output-tokens-mea │ +│ n --output-tokens-mean │ +│ --osl │ +│ PROMPT-OUTPUT-TOKENS-STDDEV The standard deviation of the number of tokens │ +│ --prompt-output-tokens-std in each output. [default: 0] │ +│ dev --output-tokens-stddev │ +│ --osl-stddev │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Prompt ─────────────────────────────────────────────────────────────────────╮ +│ PROMPT-BATCH-SIZE -b The batch size of text requests AIPerf should │ +│ --prompt-batch-size send. This is currently supported with the │ +│ --batch-size-text embeddings and rankings endpoint types [default: │ +│ --batch-size 1] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Prefix Prompt ──────────────────────────────────────────────────────────────╮ +│ PROMPT-PREFIX-POOL-SIZE The total size of the prefix prompt pool to │ +│ --prompt-prefix-pool-size select prefixes from. If this value is not │ +│ --prefix-prompt-pool-size zero, these are prompts that are prepended to │ +│ --num-prefix-prompts input prompts. This is useful for benchmarking │ +│ models that use a K-V cache. [default: 0] │ +│ PROMPT-PREFIX-LENGTH The number of tokens in each prefix prompt. │ +│ --prompt-prefix-length This is only used if "num" is greater than │ +│ --prefix-prompt-length zero. 
Note that due to the prefix and user │ +│ prompts being concatenated, the number of │ +│ tokens in the final prompt may be off by one. │ +│ [default: 0] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Audio Input ────────────────────────────────────────────────────────────────╮ +│ AUDIO-BATCH-SIZE The batch size of audio requests AIPerf should │ +│ --audio-batch-size send. This is currently supported with the │ +│ --batch-size-audio OpenAI chat endpoint type [default: 1] │ +│ AUDIO-LENGTH-MEAN The mean length of the audio in seconds. │ +│ --audio-length-mean [default: 0.0] │ +│ AUDIO-LENGTH-STDDEV The standard deviation of the length of the │ +│ --audio-length-stddev audio in seconds. [default: 0.0] │ +│ AUDIO-FORMAT --audio-format The format of the audio files (wav or mp3). │ +│ [choices: wav, mp3] [default: wav] │ +│ AUDIO-DEPTHS --audio-depths A list of audio bit depths to randomly select │ +│ from in bits. [default: [16]] │ +│ AUDIO-SAMPLE-RATES A list of audio sample rates to randomly select │ +│ --audio-sample-rates from in kHz. Common sample rates are 16, 44.1, │ +│ 48, 96, etc. [default: [16.0]] │ +│ AUDIO-NUM-CHANNELS The number of audio channels to use for the │ +│ --audio-num-channels audio data generation. [default: 1] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Image Input ────────────────────────────────────────────────────────────────╮ +│ IMAGE-WIDTH-MEAN The mean width of images when generating │ +│ --image-width-mean synthetic image data. [default: 0.0] │ +│ IMAGE-WIDTH-STDDEV The standard deviation of width of images when │ +│ --image-width-stddev generating synthetic image data. [default: 0.0] │ +│ IMAGE-HEIGHT-MEAN The mean height of images when generating │ +│ --image-height-mean synthetic image data. [default: 0.0] │ +│ IMAGE-HEIGHT-STDDEV The standard deviation of height of images when │ +│ --image-height-stddev generating synthetic image data. [default: 0.0] │ +│ IMAGE-BATCH-SIZE The image batch size of the requests AIPerf │ +│ --image-batch-size should send. [default: 1] │ +│ --batch-size-image │ +│ IMAGE-FORMAT --image-format The compression format of the images. [choices: │ +│ png, jpeg, random] [default: png] │ +╰──────────────────────────────────────────────────────────────────────────────╯ +``` +``` +╭─ Video Input ────────────────────────────────────────────────────────────────╮ +│ VIDEO-BATCH-SIZE The video batch size of the requests AIPerf │ +│ --video-batch-size should send. [default: 1] │ +│ --batch-size-video │ +│ VIDEO-DURATION Seconds per clip (default: 5.0). [default: 5.0] │ +│ --video-duration │ +│ VIDEO-FPS --video-fps Frames per second (default/recommended for │ +│ Cosmos: 4). [default: 4] │ +│ VIDEO-WIDTH --video-width Video width in pixels. │ +│ VIDEO-HEIGHT --video-height Video height in pixels. │ +│ VIDEO-SYNTH-TYPE Synthetic generator type. [choices: │ +│ --video-synth-type moving-shapes, grid-clock] [default: │ +│ moving-shapes] │ +│ VIDEO-FORMAT --video-format The video format of the generated files. │ +│ [choices: mp4] [default: mp4] │ +│ VIDEO-CODEC --video-codec The video codec to use for encoding. Common │ +│ options: libx264 (CPU, widely compatible), │ +│ libx265 (CPU, smaller files), h264_nvenc │ +│ (NVIDIA GPU), hevc_nvenc (NVIDIA GPU, smaller │ +│ files). Any FFmpeg-supported codec can be used. 
│
│                                 [default: libx264]                          │
╰──────────────────────────────────────────────────────────────────────────────╯
```
```
╭─ Service ────────────────────────────────────────────────────────────────────╮
│ LOG-LEVEL --log-level           Logging level [choices: trace, debug,        │
│                                 info, notice, warning, success, error,       │
│                                 critical] [default: info]                    │
│ VERBOSE --verbose -v            Equivalent to --log-level DEBUG. Enables     │
│                                 more verbose logging output, but lacks       │
│                                 some raw message logging. [default:          │
│                                 False]                                       │
│ EXTRA-VERBOSE -vv               Equivalent to --log-level TRACE. Enables     │
│    --extra-verbose              the most verbose logging output possible.    │
│                                 [default: False]                             │
│ RECORD-PROCESSOR-SERVICE-COU    Number of services to spawn for              │
│ NT --record-processor-serv      processing records. The higher the           │
│ ice-count                       request rate, the more services should be    │
│    --record-processors          spawned in order to keep up with the         │
│                                 incoming records. If not specified, the      │
│                                 number of services will be automatically     │
│                                 determined based on the worker count.        │
│ UI-TYPE --ui-type --ui          Type of UI to use [choices: none, simple,    │
│                                 dashboard] [default: dashboard]              │
╰──────────────────────────────────────────────────────────────────────────────╯
```
```
╭─ Telemetry ──────────────────────────────────────────────────────────────────╮
│ GPU-TELEMETRY        Enable GPU telemetry console display and optionally     │
│    --gpu-telemetry   specify custom DCGM exporter URLs (e.g.,                │
│                      http://node1:9401/metrics http://node2:9401/metrics).   │
│                      Default localhost:9400 and localhost:9401 are always    │
│                      attempted                                               │
╰──────────────────────────────────────────────────────────────────────────────╯
```
```
╭─ Workers ────────────────────────────────────────────────────────────────────╮
│ WORKERS-MAX --workers-max    Maximum number of workers to create. If not     │
│    --max-workers             specified, the number of workers will be        │
│                              determined by the formula min(concurrency, (num │
│                              CPUs * 0.75) - 1), with a default max cap of 32.│
│                              Any value provided will still be capped by the  │
│                              concurrency value (if specified), but not by the│
│                              max cap.                                        │
╰──────────────────────────────────────────────────────────────────────────────╯
```
```
╭─ ZMQ Communication ──────────────────────────────────────────────────────────╮
│ ZMQ-HOST --zmq-host            Host address for TCP connections [default:    │
│                                127.0.0.1]                                    │
│ ZMQ-IPC-PATH --zmq-ipc-path    Path for IPC sockets                          │
╰──────────────────────────────────────────────────────────────────────────────╯
```
\ No newline at end of file
diff --git a/tools/generate_cli_options_md.py b/tools/generate_cli_options_md.py
new file mode 100755
index 000000000..b4b8b74ac
--- /dev/null
+++ b/tools/generate_cli_options_md.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Generate the CLI options documentation from the aiperf profile --help output.
+
+This script runs `aiperf profile --help` and formats the output into a markdown file.
+It should be run from the repository root.
+
+Usage:
+    python tools/generate_cli_options_md.py [--check]
+
+Options:
+    --check    Check if the current cli_options.md matches the generated output.
+               Returns exit code 1 if they differ, 0 if they match.
+"""
+
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+
+def get_help_output() -> str:
+    """Run aiperf profile --help and return the output."""
+    try:
+        result = subprocess.run(
+            ["aiperf", "profile", "--help"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        return result.stdout
+    except subprocess.CalledProcessError as e:
+        print(f"Error running aiperf profile --help: {e}", file=sys.stderr)
+        print(f"stderr: {e.stderr}", file=sys.stderr)
+        sys.exit(1)
+    except FileNotFoundError:
+        print(
+            "Error: aiperf command not found. Make sure it's installed and in your PATH.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+
+def format_help_as_markdown(help_output: str) -> str:
+    """Format the help output as markdown with proper header and code blocks."""
+    # Split the output into sections based on the ╭─ ... ─╮ headers
+    lines = help_output.split("\n")
+    markdown_lines = [
+        "",
+        "",
+        "# CLI Options",
+        "Use these options to profile with AIPerf.",
+        "",
+    ]
+
+    current_section = []
+    in_section = False
+
+    for line in lines:
+        # Check if this is a section header line (starts with ╭─)
+        if line.strip().startswith("╭─"):
+            # If we were in a previous section, add it
+            if current_section:
+                markdown_lines.append("```")
+                markdown_lines.extend(current_section)
+                markdown_lines.append("```")
+                current_section = []
+            # Start new section
+            in_section = True
+            current_section.append(line)
+        elif line.strip().startswith("╰─"):
+            # End of section
+            current_section.append(line)
+            markdown_lines.append("```")
+            markdown_lines.extend(current_section)
+            markdown_lines.append("```")
+            current_section = []
+            in_section = False
+        elif in_section:
+            current_section.append(line)
+
+    # Add any remaining section
+    if current_section:
+        markdown_lines.append("```")
+        markdown_lines.extend(current_section)
+        markdown_lines.append("```")
+
+    return "\n".join(markdown_lines)
+
+
+def stage_file(file_path: Path) -> None:
+    """Stage a file using git add."""
+    try:
+        subprocess.run(
+            ["git", "add", str(file_path)],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+        print(f"✓ Staged {file_path} for commit", file=sys.stderr)
+    except subprocess.CalledProcessError as e:
+        print(
+            f"Warning: Could not stage {file_path}: {e.stderr}",
+            file=sys.stderr,
+        )
+    except FileNotFoundError:
+        print(
+            "Warning: git command not found, file not staged",
+            file=sys.stderr,
+        )
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate CLI options documentation from aiperf profile --help"
+    )
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="Check if the current cli_options.md matches the generated output",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=Path("docs/cli_options.md"),
+        help="Output file path (default: docs/cli_options.md)",
+    )
+    parser.add_argument(
+        "--no-stage",
+        action="store_true",
+        help="Don't automatically stage the file with git add (default: auto-stage)",
+    )
+    args = parser.parse_args()
+
+    # Get the help output
+    print("Running aiperf profile --help...", file=sys.stderr)
+    help_output = get_help_output()
+
+    # Format as markdown
+    print("Formatting output as markdown...", file=sys.stderr)
+    markdown = format_help_as_markdown(help_output)
+
+    if args.check:
+        # Check mode: compare with existing file
+        if not args.output.exists():
+            print(
+                f"Error: {args.output} does not exist. Run without --check to generate it.",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+
+        current_content = args.output.read_text()
+        if current_content.strip() == markdown.strip():
+            print(f"✓ {args.output} is up to date!", file=sys.stderr)
+            sys.exit(0)
+        else:
+            print(
+                f"✗ {args.output} is out of sync with aiperf profile --help output!",
+                file=sys.stderr,
+            )
+            print(
+                "  Run 'make update-cli-docs' or 'python tools/generate_cli_options_md.py' to update it.",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+    else:
+        # Write mode: write to file and optionally stage it
+        file_existed = args.output.exists()
+        if file_existed:
+            old_content = args.output.read_text()
+
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        args.output.write_text(markdown)
+
+        # Check if content actually changed
+        content_changed = not file_existed or old_content != markdown
+
+        if content_changed:
+            print(f"✓ Generated {args.output}", file=sys.stderr)
+
+            # Auto-stage the file unless --no-stage is specified
+            if not args.no_stage:
+                stage_file(args.output)
+        else:
+            print(f"✓ {args.output} already up to date", file=sys.stderr)
+
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()

From 7fb48b9e544e5858b6e69e048bc92e774b02f8b7 Mon Sep 17 00:00:00 2001
From: Elias Bermudez
Date: Fri, 31 Oct 2025 16:57:44 -0700
Subject: [PATCH 2/2] Iterate on the cli options table

---
 docs/cli_options.md              | 560 +++++++++----------
 tools/generate_cli_options_md.py | 239 +++++++++++--
 2 files changed, 378 insertions(+), 421 deletions(-)

diff --git a/docs/cli_options.md b/docs/cli_options.md
index 7705ec904..f69b4a9c9 100644
--- a/docs/cli_options.md
+++ b/docs/cli_options.md
@@ -6,397 +6,169 @@ SPDX-License-Identifier: Apache-2.0
 # CLI Options
 Use these options to profile with AIPerf.
 
-```
-╭─ Endpoint ───────────────────────────────────────────────────────────────────╮
-│ * MODEL-NAMES --model-names -m  Model name(s) to be benchmarked. Can be      │
-│      --model                    a comma-separated list or a single           │
-│                                 model name. [required]                      │
-│   MODEL-SELECTION-STRATEGY      When multiple models are specified,          │
-│      --model-selection-strategy this is how a specific model should be      │
-│                                 assigned to a prompt. round_robin: nth      │
-│                                 prompt in the list gets assigned to         │
-│                                 n-mod len(models). random: assignment       │
-│                                 is uniformly random [choices:               │
-│                                 round-robin, random] [default:              │
-│                                 round-robin]                                │
-│   CUSTOM-ENDPOINT               Set a custom endpoint that differs from     │
-│      --custom-endpoint          the OpenAI defaults.                        │
-│      --endpoint                                                             │
-│   ENDPOINT-TYPE                 The endpoint type to send requests to       │
-│      --endpoint-type            on the server. [choices: chat,              │
-│                                 completions, cohere-rankings,               │
-│                                 embeddings, hf-tei-rankings,                │
-│                                 huggingface-generate, nim-rankings,         │
-│                                 solido-rag, template] [default: chat]       │
-│   STREAMING --streaming         An option to enable the use of the          │
-│                                 streaming API. [default: False]             │
-│   URL --url -u                  URL of the endpoint to target for           │
-│                                 benchmarking. [default: localhost:8000]     │
-│   REQUEST-TIMEOUT-SECONDS       The timeout in floating-point seconds       │
-│      --request-timeout-seconds  for each request to the endpoint.           │
-│                                 [default: 600.0]                            │
-│   API-KEY --api-key             The API key to use for the endpoint. If     │
-│                                 provided, it will be sent with every        │
-│                                 request as a header: Authorization:         │
-│                                 Bearer .                                    │
-│   TRANSPORT --transport         The transport to use for the endpoint.      │
-│      --transport-type           If not provided, it will be                 │
-│                                 auto-detected from the URL.This can         │
-│                                 also be used to force an alternative        │
-│                                 transport or implementation.
[choices: │ -│ http] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Input ──────────────────────────────────────────────────────────────────────╮ -│ EXTRA-INPUTS --extra-inputs Provide additional inputs to include with │ -│ every request. Inputs should be in an │ -│ 'input_name:value' format. Alternatively, │ -│ a string representing a json formatted │ -│ dict can be provided. [default: []] │ -│ HEADER --header -H Adds a custom header to the requests. │ -│ Headers must be specified as │ -│ 'Header:Value' pairs. Alternatively, a │ -│ string representing a json formatted dict │ -│ can be provided. [default: []] │ -│ INPUT-FILE --input-file The file or directory path that contains │ -│ the dataset to use for profiling. This │ -│ parameter is used in conjunction with the │ -│ custom_dataset_type parameter to support │ -│ different types of user provided datasets. │ -│ FIXED-SCHEDULE Specifies to run a fixed schedule of │ -│ --fixed-schedule requests. This is normally inferred from │ -│ the --input-file parameter, but can be set │ -│ manually here. [default: False] │ -│ FIXED-SCHEDULE-AUTO-OFFSET Specifies to automatically offset the │ -│ --fixed-schedule-auto-offs timestamps in the fixed schedule, such │ -│ et that the first timestamp is considered 0, │ -│ and the rest are shifted accordingly. If │ -│ disabled, the timestamps will be assumed │ -│ to be relative to 0. [default: False] │ -│ FIXED-SCHEDULE-START-OFFSET Specifies the offset in milliseconds to │ -│ --fixed-schedule-start-off start the fixed schedule at. By default, │ -│ set the schedule starts at 0, but this option │ -│ can be used to start at a reference point │ -│ further in the schedule. This option │ -│ cannot be used in conjunction with the │ -│ --fixed-schedule-auto-offset. The schedule │ -│ will include any requests at the start │ -│ offset. │ -│ FIXED-SCHEDULE-END-OFFSET Specifies the offset in milliseconds to │ -│ --fixed-schedule-end-offse end the fixed schedule at. By default, the │ -│ t schedule ends at the last timestamp in the │ -│ trace dataset, but this option can be used │ -│ to only run a subset of the trace. The │ -│ schedule will include any requests at the │ -│ end offset. │ -│ PUBLIC-DATASET The public dataset to use for the │ -│ --public-dataset requests. [choices: sharegpt] │ -│ CUSTOM-DATASET-TYPE The type of custom dataset to use. This │ -│ --custom-dataset-type parameter is used in conjunction with the │ -│ --input-file parameter. [choices: │ -│ single_turn, multi_turn, random_pool, │ -│ mooncake_trace] │ -│ DATASET-SAMPLING-STRATEGY The strategy to use for sampling the │ -│ --dataset-sampling-strateg dataset. sequential: Iterate through the │ -│ y dataset sequentially, then wrap around to │ -│ the beginning. random: Randomly select a │ -│ conversation from the dataset. Will │ -│ randomly sample with replacement. shuffle: │ -│ Shuffle the dataset and iterate through │ -│ it. Will randomly sample without │ -│ replacement. Once the end of the dataset │ -│ is reached, shuffle the dataset again and │ -│ start over. [choices: sequential, random, │ -│ shuffle] │ -│ RANDOM-SEED --random-seed The seed used to generate random values. │ -│ Set to some value to make the synthetic │ -│ data generation deterministic. It will use │ -│ system default if not provided. 
│ -│ GOODPUT --goodput Specify service level objectives (SLOs) │ -│ for goodput as space-separated 'KEY:VALUE' │ -│ pairs, where KEY is a metric tag and VALUE │ -│ is a number in the metric’s display unit │ -│ (falls back to its base unit if no display │ -│ unit is defined). Examples: │ -│ 'request_latency:250' (ms), │ -│ 'inter_token_latency:10' (ms), │ -│ output_token_throughput_per_user:600 │ -│ (tokens/s). Only metrics applicable to the │ -│ current endpoint/config are considered. │ -│ For more context on the definition of │ -│ goodput, refer to DistServe paper: │ -│ https://arxiv.org/pdf/2401.09670 and the │ -│ blog: │ -│ https://hao-ai-lab.github.io/blogs/distser │ -│ ve │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Output ─────────────────────────────────────────────────────────────────────╮ -│ OUTPUT-ARTIFACT-DIR The directory to store all the (output) │ -│ --output-artifact-dir artifacts generated by AIPerf. [default: │ -│ --artifact-dir artifacts] │ -│ PROFILE-EXPORT-PREFIX The prefix for the profile export file names. │ -│ --profile-export-prefix Will be suffixed with .csv, .json, .jsonl, and │ -│ --profile-export-file _raw.jsonl.If not provided, the default profile │ -│ export file names will be used: │ -│ profile_export_aiperf.csv, │ -│ profile_export_aiperf.json, │ -│ profile_export.jsonl, and │ -│ profile_export_raw.jsonl. │ -│ EXPORT-LEVEL --export-level The level of profile export files to create. │ -│ --profile-export-level [choices: summary, records, raw] [default: │ -│ records] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Tokenizer ──────────────────────────────────────────────────────────────────╮ -│ TOKENIZER --tokenizer The HuggingFace tokenizer to use to interpret │ -│ token metrics from prompts and responses. The │ -│ value can be the name of a tokenizer or the │ -│ filepath of the tokenizer. The default value │ -│ is the model name. │ -│ TOKENIZER-REVISION The specific model version to use. It can be a │ -│ --tokenizer-revision branch name, tag name, or commit ID. [default: │ -│ main] │ -│ TOKENIZER-TRUST-REMOTE-CODE Allows custom tokenizer to be downloaded and │ -│ --tokenizer-trust-remote-c executed. This carries security risks and │ -│ ode should only be used for repositories you │ -│ trust. This is only necessary for custom │ -│ tokenizers stored in HuggingFace Hub. │ -│ [default: False] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Load Generator ─────────────────────────────────────────────────────────────╮ -│ BENCHMARK-DURATION The duration in seconds for benchmarking. │ -│ --benchmark-duration │ -│ BENCHMARK-GRACE-PERIOD The grace period in seconds to wait for │ -│ --benchmark-grace-period responses after benchmark duration ends. Only │ -│ applies when --benchmark-duration is set. │ -│ Responses received within this period are │ -│ included in metrics. [default: 30.0] │ -│ CONCURRENCY --concurrency The concurrency value to benchmark. │ -│ REQUEST-RATE --request-rate Sets the request rate for the load generated │ -│ by AIPerf. Unit: requests/second │ -│ REQUEST-RATE-MODE Sets the request rate mode for the load │ -│ --request-rate-mode generated by AIPerf. Valid values: constant, │ -│ poisson. constant: Generate requests at a │ -│ fixed rate. poisson: Generate requests using a │ -│ poisson distribution. [default: poisson] │ -│ REQUEST-COUNT The number of requests to use for measurement. 
│ -│ --request-count [default: 10] │ -│ --num-requests │ -│ WARMUP-REQUEST-COUNT The number of warmup requests to send before │ -│ --warmup-request-count benchmarking. [default: 0] │ -│ --num-warmup-requests │ -│ REQUEST-CANCELLATION-RATE The percentage of requests to cancel. │ -│ --request-cancellation-rat [default: 0.0] │ -│ e │ -│ REQUEST-CANCELLATION-DELAY The delay in seconds before cancelling │ -│ --request-cancellation-del requests. This is used when │ -│ ay --request-cancellation-rate is greater than 0. │ -│ [default: 0.0] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Conversation Input ─────────────────────────────────────────────────────────╮ -│ CONVERSATION-NUM The total number of unique conversations to │ -│ --conversation-num generate. Each conversation represents a │ -│ --num-conversations single request session between client and │ -│ --num-sessions server. Supported on synthetic mode and the │ -│ custom random_pool dataset. The number of │ -│ conversations will be used to determine the │ -│ number of entries in both the custom │ -│ random_pool and synthetic datasets and will be │ -│ reused until benchmarking is complete. │ -│ NUM-DATASET-ENTRIES The total number of unique dataset entries to │ -│ --num-dataset-entries generate for the dataset. Each entry │ -│ --num-prompts represents a single turn used in a request. │ -│ [default: 100] │ -│ CONVERSATION-TURN-MEAN The mean number of turns within a │ -│ --conversation-turn-mean conversation. [default: 1] │ -│ --session-turns-mean │ -│ CONVERSATION-TURN-STDDEV The standard deviation of the number of turns │ -│ --conversation-turn-stddev within a conversation. [default: 0] │ -│ --session-turns-stddev │ -│ CONVERSATION-TURN-DELAY-MEAN The mean delay between turns within a │ -│ --conversation-turn-delay- conversation in milliseconds. [default: 0.0] │ -│ mean │ -│ --session-turn-delay-mean │ -│ CONVERSATION-TURN-DELAY-STDD The standard deviation of the delay between │ -│ EV --conversation-turn-del turns within a conversation in milliseconds. │ -│ ay-stddev --session-turn-d [default: 0.0] │ -│ elay-stddev │ -│ CONVERSATION-TURN-DELAY-RATI A ratio to scale multi-turn delays. [default: │ -│ O --conversation-turn-dela 1.0] │ -│ y-ratio │ -│ --session-delay-ratio │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Input Sequence Length (ISL) ────────────────────────────────────────────────╮ -│ PROMPT-INPUT-TOKENS-MEAN The mean of number of tokens in the generated │ -│ --prompt-input-tokens-mean prompts when using synthetic data. [default: │ -│ --synthetic-input-tokens-m 550] │ -│ ean --isl │ -│ PROMPT-INPUT-TOKENS-STDDEV The standard deviation of number of tokens in │ -│ --prompt-input-tokens-stdd the generated prompts when using synthetic │ -│ ev --synthetic-input-token data. [default: 0.0] │ -│ s-stddev --isl-stddev │ -│ PROMPT-INPUT-TOKENS-BLOCK-SI The block size of the prompt. [default: 512] │ -│ ZE --prompt-input-tokens-b │ -│ lock-size --synthetic-inpu │ -│ t-tokens-block-size │ -│ --isl-block-size │ -│ SEQ-DIST --seq-dist Sequence length distribution specification for │ -│ --sequence-distribution varying ISL/OSL pairs │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Output Sequence Length (OSL) ───────────────────────────────────────────────╮ -│ PROMPT-OUTPUT-TOKENS-MEAN The mean number of tokens in each output. 
│ -│ --prompt-output-tokens-mea │ -│ n --output-tokens-mean │ -│ --osl │ -│ PROMPT-OUTPUT-TOKENS-STDDEV The standard deviation of the number of tokens │ -│ --prompt-output-tokens-std in each output. [default: 0] │ -│ dev --output-tokens-stddev │ -│ --osl-stddev │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Prompt ─────────────────────────────────────────────────────────────────────╮ -│ PROMPT-BATCH-SIZE -b The batch size of text requests AIPerf should │ -│ --prompt-batch-size send. This is currently supported with the │ -│ --batch-size-text embeddings and rankings endpoint types [default: │ -│ --batch-size 1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Prefix Prompt ──────────────────────────────────────────────────────────────╮ -│ PROMPT-PREFIX-POOL-SIZE The total size of the prefix prompt pool to │ -│ --prompt-prefix-pool-size select prefixes from. If this value is not │ -│ --prefix-prompt-pool-size zero, these are prompts that are prepended to │ -│ --num-prefix-prompts input prompts. This is useful for benchmarking │ -│ models that use a K-V cache. [default: 0] │ -│ PROMPT-PREFIX-LENGTH The number of tokens in each prefix prompt. │ -│ --prompt-prefix-length This is only used if "num" is greater than │ -│ --prefix-prompt-length zero. Note that due to the prefix and user │ -│ prompts being concatenated, the number of │ -│ tokens in the final prompt may be off by one. │ -│ [default: 0] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Audio Input ────────────────────────────────────────────────────────────────╮ -│ AUDIO-BATCH-SIZE The batch size of audio requests AIPerf should │ -│ --audio-batch-size send. This is currently supported with the │ -│ --batch-size-audio OpenAI chat endpoint type [default: 1] │ -│ AUDIO-LENGTH-MEAN The mean length of the audio in seconds. │ -│ --audio-length-mean [default: 0.0] │ -│ AUDIO-LENGTH-STDDEV The standard deviation of the length of the │ -│ --audio-length-stddev audio in seconds. [default: 0.0] │ -│ AUDIO-FORMAT --audio-format The format of the audio files (wav or mp3). │ -│ [choices: wav, mp3] [default: wav] │ -│ AUDIO-DEPTHS --audio-depths A list of audio bit depths to randomly select │ -│ from in bits. [default: [16]] │ -│ AUDIO-SAMPLE-RATES A list of audio sample rates to randomly select │ -│ --audio-sample-rates from in kHz. Common sample rates are 16, 44.1, │ -│ 48, 96, etc. [default: [16.0]] │ -│ AUDIO-NUM-CHANNELS The number of audio channels to use for the │ -│ --audio-num-channels audio data generation. [default: 1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Image Input ────────────────────────────────────────────────────────────────╮ -│ IMAGE-WIDTH-MEAN The mean width of images when generating │ -│ --image-width-mean synthetic image data. [default: 0.0] │ -│ IMAGE-WIDTH-STDDEV The standard deviation of width of images when │ -│ --image-width-stddev generating synthetic image data. [default: 0.0] │ -│ IMAGE-HEIGHT-MEAN The mean height of images when generating │ -│ --image-height-mean synthetic image data. [default: 0.0] │ -│ IMAGE-HEIGHT-STDDEV The standard deviation of height of images when │ -│ --image-height-stddev generating synthetic image data. [default: 0.0] │ -│ IMAGE-BATCH-SIZE The image batch size of the requests AIPerf │ -│ --image-batch-size should send. 
[default: 1] │ -│ --batch-size-image │ -│ IMAGE-FORMAT --image-format The compression format of the images. [choices: │ -│ png, jpeg, random] [default: png] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Video Input ────────────────────────────────────────────────────────────────╮ -│ VIDEO-BATCH-SIZE The video batch size of the requests AIPerf │ -│ --video-batch-size should send. [default: 1] │ -│ --batch-size-video │ -│ VIDEO-DURATION Seconds per clip (default: 5.0). [default: 5.0] │ -│ --video-duration │ -│ VIDEO-FPS --video-fps Frames per second (default/recommended for │ -│ Cosmos: 4). [default: 4] │ -│ VIDEO-WIDTH --video-width Video width in pixels. │ -│ VIDEO-HEIGHT --video-height Video height in pixels. │ -│ VIDEO-SYNTH-TYPE Synthetic generator type. [choices: │ -│ --video-synth-type moving-shapes, grid-clock] [default: │ -│ moving-shapes] │ -│ VIDEO-FORMAT --video-format The video format of the generated files. │ -│ [choices: mp4] [default: mp4] │ -│ VIDEO-CODEC --video-codec The video codec to use for encoding. Common │ -│ options: libx264 (CPU, widely compatible), │ -│ libx265 (CPU, smaller files), h264_nvenc │ -│ (NVIDIA GPU), hevc_nvenc (NVIDIA GPU, smaller │ -│ files). Any FFmpeg-supported codec can be used. │ -│ [default: libx264] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Service ────────────────────────────────────────────────────────────────────╮ -│ LOG-LEVEL --log-level Logging level [choices: trace, debug, │ -│ info, notice, warning, success, error, │ -│ critical] [default: info] │ -│ VERBOSE --verbose -v Equivalent to --log-level DEBUG. Enables │ -│ more verbose logging output, but lacks │ -│ some raw message logging. [default: │ -│ False] │ -│ EXTRA-VERBOSE -vv Equivalent to --log-level TRACE. Enables │ -│ --extra-verbose the most verbose logging output possible. │ -│ [default: False] │ -│ RECORD-PROCESSOR-SERVICE-COU Number of services to spawn for │ -│ NT --record-processor-serv processing records. The higher the │ -│ ice-count request rate, the more services should be │ -│ --record-processors spawned in order to keep up with the │ -│ incoming records. If not specified, the │ -│ number of services will be automatically │ -│ determined based on the worker count. │ -│ UI-TYPE --ui-type --ui Type of UI to use [choices: none, simple, │ -│ dashboard] [default: dashboard] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Telemetry ──────────────────────────────────────────────────────────────────╮ -│ GPU-TELEMETRY Enable GPU telemetry console display and optionally │ -│ --gpu-telemetry specify custom DCGM exporter URLs (e.g., │ -│ http://node1:9401/metrics http://node2:9401/metrics). │ -│ Default localhost:9400 and localhost:9401 are always │ -│ attempted │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ Workers ────────────────────────────────────────────────────────────────────╮ -│ WORKERS-MAX --workers-max Maximum number of workers to create. If not │ -│ --max-workers specified, the number of workers will be │ -│ determined by the formula min(concurrency, (num │ -│ CPUs * 0.75) - 1), with a default max cap of 32. │ -│ Any value provided will still be capped by the │ -│ concurrency value (if specified), but not by the │ -│ max cap. 
│ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` -``` -╭─ ZMQ Communication ──────────────────────────────────────────────────────────╮ -│ ZMQ-HOST --zmq-host Host address for TCP connections [default: │ -│ 127.0.0.1] │ -│ ZMQ-IPC-PATH --zmq-ipc-path Path for IPC sockets │ -╰──────────────────────────────────────────────────────────────────────────────╯ -``` \ No newline at end of file +## Endpoint + +| Option | Description | +|:-------|:-----------:| +| **`*`**
**MODEL-NAMES**
`--model-names`<br>`--model`<br>`-m` | Model name(s) to be benchmarked. Can be a comma-separated list or a single model name. [required] |
+| **MODEL-SELECTION-STRATEGY**<br>`--model-selection-strategy` | When multiple models are specified, this is how a specific model should be assigned to a prompt. round_robin: nth prompt in the list gets assigned to n-mod len(models). random: assignment is uniformly random [choices: round-robin, random] [default: round-robin] |
+| **CUSTOM-ENDPOINT**<br>`--custom-endpoint`<br>`--endpoint` | Set a custom endpoint that differs from the OpenAI defaults. |
+| **ENDPOINT-TYPE**<br>`--endpoint-type` | The endpoint type to send requests to on the server. [choices: chat, completions, cohere-rankings, embeddings, hf-tei-rankings, huggingface-generate, nim-rankings, solido-rag, template] [default: chat] |
+| **STREAMING**
`--streaming` | An option to enable the use of the streaming API. [default: False] | +| **URL**
`--url`<br>`-u` | URL of the endpoint to target for benchmarking. [default: localhost:8000] |
+| **REQUEST-TIMEOUT-SECONDS**<br>`--request-timeout-seconds` | The timeout in floating-point seconds for each request to the endpoint. [default: 600.0] |
+| **API-KEY**<br>`--api-key` | The API key to use for the endpoint. If provided, it will be sent with every request as a header: `Authorization: Bearer <api_key>`. |
+| **TRANSPORT**<br>`--transport`<br>`--transport-type` | The transport to use for the endpoint. If not provided, it will be auto-detected from the URL. This can also be used to force an alternative transport or implementation. [choices: http] |
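+
+For example, a minimal profiling run against a local OpenAI-compatible chat endpoint might look like this (the model name and URL are illustrative placeholders):
+
+```bash
+# Hypothetical invocation; adjust the model and URL for your deployment.
+aiperf profile \
+  --model-names my-model \
+  --url localhost:8000 \
+  --endpoint-type chat \
+  --streaming
+```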
+
+## Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **EXTRA-INPUTS**<br>`--extra-inputs` | Provide additional inputs to include with every request. Inputs should be in an 'input_name:value' format. Alternatively, a string representing a json formatted dict can be provided. [default: []] |
+| **HEADER**<br>`--header`<br>`-H` | Adds a custom header to the requests. Headers must be specified as 'Header:Value' pairs. Alternatively, a string representing a json formatted dict can be provided. [default: []] |
+| **INPUT-FILE**<br>`--input-file` | The file or directory path that contains the dataset to use for profiling. This parameter is used in conjunction with the custom_dataset_type parameter to support different types of user provided datasets. |
+| **FIXED-SCHEDULE**<br>`--fixed-schedule` | Specifies to run a fixed schedule of requests. This is normally inferred from the --input-file parameter, but can be set manually here. [default: False] |
+| **FIXED-SCHEDULE-AUTO-OFFSET**<br>`--fixed-schedule-auto-offset` | Specifies to automatically offset the timestamps in the fixed schedule, such that the first timestamp is considered 0, and the rest are shifted accordingly. If disabled, the timestamps will be assumed to be relative to 0. [default: False] |
+| **FIXED-SCHEDULE-START-OFFSET**<br>`--fixed-schedule-start-offset` | Specifies the offset in milliseconds to start the fixed schedule at. By default, the schedule starts at 0, but this option can be used to start at a reference point further in the schedule. This option cannot be used in conjunction with the --fixed-schedule-auto-offset. The schedule will include any requests at the start offset. |
+| **FIXED-SCHEDULE-END-OFFSET**<br>`--fixed-schedule-end-offset` | Specifies the offset in milliseconds to end the fixed schedule at. By default, the schedule ends at the last timestamp in the trace dataset, but this option can be used to only run a subset of the trace. The schedule will include any requests at the end offset. |
+| **PUBLIC-DATASET**<br>`--public-dataset` | The public dataset to use for the requests. [choices: sharegpt] |
+| **CUSTOM-DATASET-TYPE**<br>`--custom-dataset-type` | The type of custom dataset to use. This parameter is used in conjunction with the --input-file parameter. [choices: single_turn, multi_turn, random_pool, mooncake_trace] |
+| **DATASET-SAMPLING-STRATEGY**<br>`--dataset-sampling-strategy` | The strategy to use for sampling the dataset. sequential: Iterate through the dataset sequentially, then wrap around to the beginning. random: Randomly select a conversation from the dataset. Will randomly sample with replacement. shuffle: Shuffle the dataset and iterate through it. Will randomly sample without replacement. Once the end of the dataset is reached, shuffle the dataset again and start over. [choices: sequential, random, shuffle] |
+| **RANDOM-SEED**<br>`--random-seed` | The seed used to generate random values. Set to some value to make the synthetic data generation deterministic. It will use system default if not provided. |
+| **GOODPUT**<br>`--goodput` | Specify service level objectives (SLOs) for goodput as space-separated 'KEY:VALUE' pairs, where KEY is a metric tag and VALUE is a number in the metric’s display unit (falls back to its base unit if no display unit is defined). Examples: 'request_latency:250' (ms), 'inter_token_latency:10' (ms), output_token_throughput_per_user:600 (tokens/s). Only metrics applicable to the current endpoint/config are considered. For more context on the definition of goodput, refer to the DistServe paper: https://arxiv.org/pdf/2401.09670 and the blog: https://hao-ai-lab.github.io/blogs/distserve |
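+
+For example, the following sketch (values are illustrative; exact quoting may vary with your shell and the CLI parser) counts a request toward goodput only if its request latency stays within 250 ms and its inter-token latency within 10 ms:
+
+```bash
+# Hypothetical SLO definition using metric tags from the table above.
+aiperf profile \
+  --model-names my-model \
+  --goodput "request_latency:250 inter_token_latency:10"
+```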
+
+## Output
+
+| Option | Description |
+|:-------|:-----------:|
+| **OUTPUT-ARTIFACT-DIR**<br>`--output-artifact-dir`<br>`--artifact-dir` | The directory to store all the (output) artifacts generated by AIPerf. [default: artifacts] |
+| **PROFILE-EXPORT-PREFIX**<br>`--profile-export-prefix`<br>`--profile-export-file` | The prefix for the profile export file names. Will be suffixed with .csv, .json, .jsonl, and _raw.jsonl. If not provided, the default profile export file names will be used: profile_export_aiperf.csv, profile_export_aiperf.json, profile_export.jsonl, and profile_export_raw.jsonl. |
+| **EXPORT-LEVEL**<br>`--export-level`<br>`--profile-export-level` | The level of profile export files to create. [choices: summary, records, raw] [default: records] |
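+
+For example, a run configured as below (the directory and prefix are illustrative) would write my_run.csv, my_run.json, my_run.jsonl, and my_run_raw.jsonl under ./results:
+
+```bash
+# Hypothetical export configuration using the options above.
+aiperf profile \
+  --model-names my-model \
+  --output-artifact-dir results \
+  --profile-export-prefix my_run \
+  --export-level records
+```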
+
+## Tokenizer
+
+| Option | Description |
+|:-------|:-----------:|
+| **TOKENIZER**<br>`--tokenizer` | The HuggingFace tokenizer to use to interpret token metrics from prompts and responses. The value can be the name of a tokenizer or the filepath of the tokenizer. The default value is the model name. |
+| **TOKENIZER-REVISION**<br>`--tokenizer-revision` | The specific model version to use. It can be a branch name, tag name, or commit ID. [default: main] |
+| **TOKENIZER-TRUST-REMOTE-CODE**<br>`--tokenizer-trust-remote-code` | Allows custom tokenizer to be downloaded and executed. This carries security risks and should only be used for repositories you trust. This is only necessary for custom tokenizers stored in HuggingFace Hub. [default: False] |
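+
+For example, to interpret token metrics with a locally stored tokenizer instead of resolving the model name on the HuggingFace Hub (the path below is an illustrative placeholder):
+
+```bash
+# Hypothetical: use a local tokenizer directory pinned to a specific revision.
+aiperf profile \
+  --model-names my-model \
+  --tokenizer /path/to/tokenizer \
+  --tokenizer-revision main
+```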
+
+## Load Generator
+
+| Option | Description |
+|:-------|:-----------:|
+| **BENCHMARK-DURATION**<br>`--benchmark-duration` | The duration in seconds for benchmarking. |
+| **BENCHMARK-GRACE-PERIOD**<br>`--benchmark-grace-period` | The grace period in seconds to wait for responses after benchmark duration ends. Only applies when --benchmark-duration is set. Responses received within this period are included in metrics. [default: 30.0] |
+| **CONCURRENCY**<br>`--concurrency` | The concurrency value to benchmark. |
+| **REQUEST-RATE**<br>`--request-rate` | Sets the request rate for the load generated by AIPerf. Unit: requests/second |
+| **REQUEST-RATE-MODE**<br>`--request-rate-mode` | Sets the request rate mode for the load generated by AIPerf. Valid values: constant, poisson. constant: Generate requests at a fixed rate. poisson: Generate requests using a poisson distribution. [default: poisson] |
+| **REQUEST-COUNT**<br>`--request-count`<br>`--num-requests` | The number of requests to use for measurement. [default: 10] |
+| **WARMUP-REQUEST-COUNT**<br>`--warmup-request-count`<br>`--num-warmup-requests` | The number of warmup requests to send before benchmarking. [default: 0] |
+| **REQUEST-CANCELLATION-RATE**<br>`--request-cancellation-rate` | The percentage of requests to cancel. [default: 0.0] |
+| **REQUEST-CANCELLATION-DELAY**<br>`--request-cancellation-delay` | The delay in seconds before cancelling requests. This is used when --request-cancellation-rate is greater than 0. [default: 0.0] |
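+
+For example, an open-loop run with a warmup phase might be sketched as follows (all values are illustrative):
+
+```bash
+# Hypothetical load profile: 500 measured requests at ~20 req/s (poisson mode by default).
+aiperf profile \
+  --model-names my-model \
+  --request-rate 20 \
+  --request-count 500 \
+  --warmup-request-count 50
+```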
+
+## Conversation Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **CONVERSATION-NUM**<br>`--conversation-num`<br>`--num-conversations`<br>`--num-sessions` | The total number of unique conversations to generate. Each conversation represents a single request session between client and server. Supported on synthetic mode and the custom random_pool dataset. The number of conversations will be used to determine the number of entries in both the custom random_pool and synthetic datasets and will be reused until benchmarking is complete. |
+| **NUM-DATASET-ENTRIES**<br>`--num-dataset-entries`<br>`--num-prompts` | The total number of unique dataset entries to generate for the dataset. Each entry represents a single turn used in a request. [default: 100] |
+| **CONVERSATION-TURN-MEAN**<br>`--conversation-turn-mean`<br>`--session-turns-mean` | The mean number of turns within a conversation. [default: 1] |
+| **CONVERSATION-TURN-STDDEV**<br>`--conversation-turn-stddev`<br>`--session-turns-stddev` | The standard deviation of the number of turns within a conversation. [default: 0] |
+| **CONVERSATION-TURN-DELAY-MEAN**<br>`--conversation-turn-delay-mean`<br>`--session-turn-delay-mean` | The mean delay between turns within a conversation in milliseconds. [default: 0.0] |
+| **CONVERSATION-TURN-DELAY-STDDEV**<br>`--conversation-turn-delay-stddev`<br>`--session-turn-delay-stddev` | The standard deviation of the delay between turns within a conversation in milliseconds. [default: 0.0] |
+| **CONVERSATION-TURN-DELAY-RATIO**<br>`--conversation-turn-delay-ratio`<br>`--session-delay-ratio` | A ratio to scale multi-turn delays. [default: 1.0] |
+
+## Input Sequence Length (ISL)
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-INPUT-TOKENS-MEAN**<br>`--prompt-input-tokens-mean`<br>`--synthetic-input-tokens-mean`<br>`--isl` | The mean number of tokens in the generated prompts when using synthetic data. [default: 550] |
+| **PROMPT-INPUT-TOKENS-STDDEV**<br>`--prompt-input-tokens-stddev`<br>`--synthetic-input-tokens-stddev`<br>`--isl-stddev` | The standard deviation of the number of tokens in the generated prompts when using synthetic data. [default: 0.0] |
+| **PROMPT-INPUT-TOKENS-BLOCK-SIZE**<br>`--prompt-input-tokens-block-size`<br>`--synthetic-input-tokens-block-size`<br>`--isl-block-size` | The block size of the prompt. [default: 512] |
+| **SEQ-DIST**<br>`--seq-dist`<br>`--sequence-distribution` | Sequence length distribution specification for varying ISL/OSL pairs. |
+
+## Output Sequence Length (OSL)
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-OUTPUT-TOKENS-MEAN**<br>`--prompt-output-tokens-mean`<br>`--output-tokens-mean`<br>`--osl` | The mean number of tokens in each output. |
+| **PROMPT-OUTPUT-TOKENS-STDDEV**<br>`--prompt-output-tokens-stddev`<br>`--output-tokens-stddev`<br>`--osl-stddev` | The standard deviation of the number of tokens in each output. [default: 0] |
+
+## Prompt
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-BATCH-SIZE**<br>`--prompt-batch-size`<br>`--batch-size-text`<br>`--batch-size`<br>`-b` | The batch size of text requests AIPerf should send. This is currently supported with the embeddings and rankings endpoint types. [default: 1] |
+
+## Prefix Prompt
+
+| Option | Description |
+|:-------|:-----------:|
+| **PROMPT-PREFIX-POOL-SIZE**<br>`--prompt-prefix-pool-size`<br>`--prefix-prompt-pool-size`<br>`--num-prefix-prompts` | The total size of the prefix prompt pool to select prefixes from. If this value is not zero, these are prompts that are prepended to input prompts. This is useful for benchmarking models that use a K-V cache. [default: 0] |
+| **PROMPT-PREFIX-LENGTH**<br>`--prompt-prefix-length`<br>`--prefix-prompt-length` | The number of tokens in each prefix prompt. This is only used if the prefix prompt pool size is greater than zero. Note that due to the prefix and user prompts being concatenated, the number of tokens in the final prompt may be off by one. [default: 0] |
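+
+For example, to shape a synthetic workload of roughly 1024-token prompts and 256-token outputs drawn from a small prefix pool (all values are illustrative):
+
+```bash
+# Hypothetical workload shape; prefix options exercise K-V cache reuse as described above.
+aiperf profile \
+  --model-names my-model \
+  --isl 1024 --isl-stddev 64 \
+  --osl 256 \
+  --num-prefix-prompts 8 --prefix-prompt-length 128
+```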
+
+## Audio Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **AUDIO-BATCH-SIZE**<br>`--audio-batch-size`<br>`--batch-size-audio` | The batch size of audio requests AIPerf should send. This is currently supported with the OpenAI chat endpoint type. [default: 1] |
+| **AUDIO-LENGTH-MEAN**<br>`--audio-length-mean` | The mean length of the audio in seconds. [default: 0.0] |
+| **AUDIO-LENGTH-STDDEV**<br>`--audio-length-stddev` | The standard deviation of the length of the audio in seconds. [default: 0.0] |
+| **AUDIO-FORMAT**<br>`--audio-format` | The format of the audio files (wav or mp3). [choices: wav, mp3] [default: wav] |
+| **AUDIO-DEPTHS**<br>`--audio-depths` | A list of audio bit depths to randomly select from in bits. [default: [16]] |
+| **AUDIO-SAMPLE-RATES**<br>`--audio-sample-rates` | A list of audio sample rates to randomly select from in kHz. Common sample rates are 16, 44.1, 48, 96, etc. [default: [16.0]] |
+| **AUDIO-NUM-CHANNELS**<br>`--audio-num-channels` | The number of audio channels to use for the audio data generation. [default: 1] |
+
+## Image Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **IMAGE-WIDTH-MEAN**<br>`--image-width-mean` | The mean width of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-WIDTH-STDDEV**<br>`--image-width-stddev` | The standard deviation of width of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-HEIGHT-MEAN**<br>`--image-height-mean` | The mean height of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-HEIGHT-STDDEV**<br>`--image-height-stddev` | The standard deviation of height of images when generating synthetic image data. [default: 0.0] |
+| **IMAGE-BATCH-SIZE**<br>`--image-batch-size`<br>`--batch-size-image` | The image batch size of the requests AIPerf should send. [default: 1] |
+| **IMAGE-FORMAT**<br>`--image-format` | The compression format of the images. [choices: png, jpeg, random] [default: png] |
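+
+For example, to attach synthetic 512x512 PNG images to each request (dimensions are illustrative):
+
+```bash
+# Hypothetical image input configuration using the options above.
+aiperf profile \
+  --model-names my-model \
+  --image-width-mean 512 \
+  --image-height-mean 512 \
+  --image-format png
+```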
+
+## Video Input
+
+| Option | Description |
+|:-------|:-----------:|
+| **VIDEO-BATCH-SIZE**<br>`--video-batch-size`<br>`--batch-size-video` | The video batch size of the requests AIPerf should send. [default: 1] |
+| **VIDEO-DURATION**<br>`--video-duration` | Seconds per clip. [default: 5.0] |
+| **VIDEO-FPS**<br>`--video-fps` | Frames per second (recommended for Cosmos: 4). [default: 4] |
+| **VIDEO-WIDTH**<br>`--video-width` | Video width in pixels. |
+| **VIDEO-HEIGHT**<br>`--video-height` | Video height in pixels. |
+| **VIDEO-SYNTH-TYPE**<br>`--video-synth-type` | Synthetic generator type. [choices: moving-shapes, grid-clock] [default: moving-shapes] |
+| **VIDEO-FORMAT**<br>`--video-format` | The video format of the generated files. [choices: mp4] [default: mp4] |
+| **VIDEO-CODEC**<br>`--video-codec` | The video codec to use for encoding. Common options: libx264 (CPU, widely compatible), libx265 (CPU, smaller files), h264_nvenc (NVIDIA GPU), hevc_nvenc (NVIDIA GPU, smaller files). Any FFmpeg-supported codec can be used. [default: libx264] |
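+
+For example, to generate short synthetic clips encoded with NVENC (this assumes an NVIDIA GPU; the values are illustrative):
+
+```bash
+# Hypothetical video input configuration using the options above.
+aiperf profile \
+  --model-names my-model \
+  --video-duration 5.0 \
+  --video-fps 4 \
+  --video-codec h264_nvenc
+```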
+
+## Service
+
+| Option | Description |
+|:-------|:-----------:|
+| **LOG-LEVEL**<br>`--log-level` | Logging level [choices: trace, debug, info, notice, warning, success, error, critical] [default: info] |
+| **VERBOSE**<br>`--verbose`<br>`-v` | Equivalent to --log-level DEBUG. Enables more verbose logging output, but lacks some raw message logging. [default: False] |
+| **EXTRA-VERBOSE**<br>`--extra-verbose`<br>`-vv` | Equivalent to --log-level TRACE. Enables the most verbose logging output possible. [default: False] |
+| **RECORD-PROCESSOR-SERVICE-COUNT**<br>`--record-processor-service-count`<br>`--record-processors` | Number of services to spawn for processing records. The higher the request rate, the more services should be spawned in order to keep up with the incoming records. If not specified, the number of services will be automatically determined based on the worker count. |
+| **UI-TYPE**
`--ui-type`
`--ui` | Type of UI to use [choices: none, simple, dashboard] [default: dashboard] |
+
+## Telemetry
+
+| Option | Description |
+|:-------|:-----------:|
+| **GPU-TELEMETRY**<br>`--gpu-telemetry` | Enable GPU telemetry console display and optionally specify custom DCGM exporter URLs (e.g., http://node1:9401/metrics http://node2:9401/metrics). Default localhost:9400 and localhost:9401 are always attempted. |
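+
+For example, to display telemetry from two remote DCGM exporters in addition to the default local endpoints (hostnames are illustrative, following the e.g. in the table above):
+
+```bash
+# Hypothetical multi-node GPU telemetry setup.
+aiperf profile \
+  --model-names my-model \
+  --gpu-telemetry http://node1:9401/metrics http://node2:9401/metrics
+```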
+
+## Workers
+
+| Option | Description |
+|:-------|:-----------:|
+| **WORKERS-MAX**<br>`--workers-max`<br>`--max-workers` | Maximum number of workers to create. If not specified, the number of workers will be determined by the formula min(concurrency, (num CPUs * 0.75) - 1), with a default max cap of 32. Any value provided will still be capped by the concurrency value (if specified), but not by the max cap. |
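+
+As a quick sanity check, the default worker-count formula above can be sketched in shell (integer arithmetic approximates the 0.75 factor; the concurrency value is illustrative):
+
+```bash
+# Hypothetical sketch of the documented default: min(concurrency, num CPUs * 0.75 - 1), capped at 32.
+concurrency=64
+num_cpus=$(nproc)
+limit=$(( num_cpus * 3 / 4 - 1 ))
+workers=$(( concurrency < limit ? concurrency : limit ))
+workers=$(( workers < 32 ? workers : 32 ))
+echo "default workers: $workers"
+```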
+
+## ZMQ Communication
+
+| Option | Description |
+|:-------|:-----------:|
+| **ZMQ-HOST**<br>`--zmq-host` | Host address for TCP connections [default: 127.0.0.1] |
+| **ZMQ-IPC-PATH**<br>`--zmq-ipc-path` | Path for IPC sockets |
diff --git a/tools/generate_cli_options_md.py b/tools/generate_cli_options_md.py
index b4b8b74ac..1be5b4e1d 100755
--- a/tools/generate_cli_options_md.py
+++ b/tools/generate_cli_options_md.py
@@ -45,9 +45,10 @@ def get_help_output() -> str:
 
 
 def format_help_as_markdown(help_output: str) -> str:
-    """Format the help output as markdown with proper header and code blocks."""
-    # Split the output into sections based on the ╭─ ... ─╮ headers
-    lines = help_output.split("\n")
+    """Format the help output as markdown tables."""
+    # Parse the help output into sections
+    sections = parse_help_sections(help_output)
+
     markdown_lines = [
         "